Convert core (except toke.c) to use isFOO_utf8_safe()

author Karl Williamson <khw@cpan.org>
Wed, 30 Nov 2016 16:53:17 +0000 (09:53 -0700)

committer Karl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
diff --git a/gv.c b/gv.c

index 775951b..2570cf0 100644 (file)
--- a/gv.c
+++ b/gv.c
@@ -1591,7 +1591,10 @@ S_parse_gv_stash_name(pTHX_ HV **stash, GV **gv, const char **name,
  
      PERL_ARGS_ASSERT_PARSE_GV_STASH_NAME;
      
-    if (full_len > 2 && **name == '*' && isIDFIRST_lazy_if(*name + 1, is_utf8)) {
+    if (   full_len > 2
+        && **name == '*'
+        && isIDFIRST_lazy_if_safe(*name + 1, name_end, is_utf8))
+    {
          /* accidental stringify on a GV? */
          (*name)++;
      }
@@ -1676,7 +1679,7 @@ S_gv_is_in_main(pTHX_ const char *name, STRLEN len, const U32 is_utf8)
      PERL_ARGS_ASSERT_GV_IS_IN_MAIN;
      
      /* If it's an alphanumeric variable */
-    if ( len && isIDFIRST_lazy_if(name, is_utf8) ) {
+    if ( len && isIDFIRST_lazy_if_safe(name, name + len, is_utf8) ) {
          /* Some "normal" variables are always in main::,
           * like INC or STDOUT.
           */
@@ -2400,8 +2403,12 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags,
                  UTF8fARG(is_utf8, name_end-nambeg, nambeg));
      gv_init_pvn(gv, stash, name, len, (add & GV_ADDMULTI)|is_utf8);
  
-    if ( isIDFIRST_lazy_if(name, is_utf8) && !ckWARN(WARN_ONCE) )
+    if (   full_len != 0
+        && isIDFIRST_lazy_if_safe(name, name + full_len, is_utf8)
+        && !ckWARN(WARN_ONCE) )
+    {
          GvMULTI_on(gv) ;
+    }
  
      /* set up magic where warranted */
      if ( gv_magicalize(gv, stash, name, len, sv_type) ) {
@@ -2492,8 +2499,12 @@ Perl_gv_check(pTHX_ HV *stash)
                  )
                      gv_check(hv);              /* nested package */
             }
-            else if ( *HeKEY(entry) != '_'
-                        && isIDFIRST_lazy_if(HeKEY(entry), HeUTF8(entry)) ) {
+            else if (   HeKLEN(entry) != 0
+                     && *HeKEY(entry) != '_'
+                     && isIDFIRST_lazy_if_safe(HeKEY(entry),
+                                               HeKEY(entry) + HeKLEN(entry),
+                                               HeUTF8(entry)) )
+            {
                  const char *file;
                 gv = MUTABLE_GV(HeVAL(entry));
                 if (SvTYPE(gv) != SVt_PVGV || GvMULTI(gv))
diff --git a/op.c b/op.c

index 722ee35..394efef 100644 (file)
--- a/op.c
+++ b/op.c
@@ -652,11 +652,12 @@ Perl_allocmy(pTHX_ const char *const name, const STRLEN len, const U32 flags)
                    (UV)flags);
  
      /* complain about "my $<special_var>" etc etc */
-    if (len &&
-       !(is_our ||
-         isALPHA(name[1]) ||
-         ((flags & SVf_UTF8) && isIDFIRST_utf8((U8 *)name+1)) ||
-         (name[1] == '_' && len > 2)))
+    if (   len
+        && !(  is_our
+            || isALPHA(name[1])
+            || (   (flags & SVf_UTF8)
+                && isIDFIRST_utf8_safe((U8 *)name+1, name + len))
+            || (name[1] == '_' && len > 2)))
      {
         if (!(flags & SVf_UTF8 && UTF8_IS_START(name[1]))
          && isASCII(name[1])
diff --git a/pp.c b/pp.c

index b198b47..6fb20f6 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -5794,7 +5794,7 @@ PP(pp_split)
      orig = s;
      if (RX_EXTFLAGS(rx) & RXf_SKIPWHITE) {
         if (do_utf8) {
-           while (isSPACE_utf8(s))
+           while (isSPACE_utf8_safe(s, strend))
                 s += UTF8SKIP(s);
         }
         else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
@@ -5819,9 +5819,9 @@ PP(pp_split)
             m = s;
             /* this one uses 'm' and is a negative test */
             if (do_utf8) {
-               while (m < strend && ! isSPACE_utf8(m) ) {
+               while (m < strend && ! isSPACE_utf8_safe(m, strend) ) {
                     const int t = UTF8SKIP(m);
-                   /* isSPACE_utf8 returns FALSE for malform utf8 */
+                   /* isSPACE_utf8_safe returns FALSE for malform utf8 */
                     if (strend - m < t)
                         m = strend;
                     else
@@ -5859,7 +5859,7 @@ PP(pp_split)
  
             /* this one uses 's' and is a positive test */
             if (do_utf8) {
-               while (s < strend && isSPACE_utf8(s) )
+               while (s < strend && isSPACE_utf8_safe(s, strend) )
                     s +=  UTF8SKIP(s);
             }
             else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET)
diff --git a/pp_pack.c b/pp_pack.c

index a75229a..ee4c69e 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1073,9 +1073,14 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                 /* 'A' strips both nulls and spaces */
                 const char *ptr;
                 if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) {
-                   for (ptr = s+len-1; ptr >= s; ptr--)
-                       if (*ptr != 0 && !UTF8_IS_CONTINUATION(*ptr) &&
-                           !isSPACE_utf8(ptr)) break;
+                    for (ptr = s+len-1; ptr >= s; ptr--) {
+                        if (   *ptr != 0
+                            && !UTF8_IS_CONTINUATION(*ptr)
+                            && !isSPACE_utf8_safe(ptr, strend))
+                        {
+                            break;
+                        }
+                    }
                     if (ptr >= s) ptr += UTF8SKIP(ptr);
                     else ptr++;
                     if (ptr > s+len)
diff --git a/regcomp.c b/regcomp.c

index 095b13f..7578a25 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8271,17 +8271,18 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
  
      assert (RExC_parse <= RExC_end);
      if (RExC_parse == RExC_end) NOOP;
-    else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
+    else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
           /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
            * using do...while */
         if (UTF)
             do {
                 RExC_parse += UTF8SKIP(RExC_parse);
-           } while (isWORDCHAR_utf8((U8*)RExC_parse));
+           } while (   RExC_parse < RExC_end
+                     && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
         else
             do {
                 RExC_parse++;
-           } while (isWORDCHAR(*RExC_parse));
+           } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
      } else {
          RExC_parse++; /* so the <- from the vFAIL is after the offending
                           character */
diff --git a/regexec.c b/regexec.c

index e9c74e6..f720e7d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1678,7 +1678,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
      tmp = TEST_UV(tmp);                                                        \
      LOAD_UTF8_CHARCLASS_ALNUM();                                               \
      REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
-        if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
+        if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
              tmp = !tmp;                                                        \
              IF_SUCCESS;                                                        \
          }                                                                      \
@@ -2050,7 +2050,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
              goto do_boundu;
          }
  
-        FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
+        FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
          break;
  
      case NBOUNDL:
@@ -2063,14 +2063,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
              goto do_nboundu;
          }
  
-        FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
+        FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
          break;
  
      case BOUND: /* regcomp.c makes sure that this only has the traditional \b
                     meaning */
          assert(FLAGS(c) == TRADITIONAL_BOUND);
  
-        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
+        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
          break;
  
      case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
@@ -2084,7 +2084,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     meaning */
          assert(FLAGS(c) == TRADITIONAL_BOUND);
  
-        FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
+        FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
          break;
  
      case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
@@ -2096,7 +2096,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
  
      case NBOUNDU:
          if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
-            FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+            FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
              break;
          }
  
@@ -2109,7 +2109,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
        do_boundu:
          switch((bound_type) FLAGS(c)) {
              case TRADITIONAL_BOUND:
-                FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+                FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
                  break;
              case GCB_BOUND:
                  if (s == reginfo->strbeg) {
@@ -2387,7 +2387,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          if (utf8_target) {
              /* The complement of something that matches only ASCII matches all
               * non-ASCII, plus everything in ASCII that isn't in the class. */
-            REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
+            REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
                                        || ! _generic_isCC_A(*s, FLAGS(c)));
              break;
          }
@@ -2451,27 +2451,27 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                             macros */
                  case _CC_ENUM_SPACE:
                      REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isSPACE_utf8(s)));
+                        to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
                      break;
  
                  case _CC_ENUM_BLANK:
                      REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isBLANK_utf8(s)));
+                        to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
                      break;
  
                  case _CC_ENUM_XDIGIT:
                      REXEC_FBC_UTF8_CLASS_SCAN(
-                                       to_complement ^ cBOOL(isXDIGIT_utf8(s)));
+                       to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
                      break;
  
                  case _CC_ENUM_VERTSPACE:
                      REXEC_FBC_UTF8_CLASS_SCAN(
-                                       to_complement ^ cBOOL(isVERTWS_utf8(s)));
+                       to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
                      break;
  
                  case _CC_ENUM_CNTRL:
                      REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isCNTRL_utf8(s)));
+                        to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
                      break;
  
                  default:
@@ -2496,9 +2496,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
           * FBC macro instead of being expanded out.  Since we've loaded the
           * swash, we don't have to check for that each time through the loop */
          REXEC_FBC_UTF8_CLASS_SCAN(
-                to_complement ^ cBOOL(_generic_utf8(
+                to_complement ^ cBOOL(_generic_utf8_safe(
                                        classnum,
                                        s,
+                                      strend,
                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
                                                    (U8 *) s, TRUE))));
          break;
@@ -6067,12 +6068,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                 if (locinput == reginfo->strbeg)
                     b1 = isWORDCHAR_LC('\n');
                 else {
-                    b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
-                                                        (U8*)(reginfo->strbeg)));
+                    b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
+                                                        (U8*)(reginfo->strbeg)),
+                                                 (U8*)(reginfo->strend));
                 }
                  b2 = (NEXTCHR_IS_EOS)
                      ? isWORDCHAR_LC('\n')
-                    : isWORDCHAR_LC_utf8((U8*)locinput);
+                    : isWORDCHAR_LC_utf8_safe((U8*) locinput,
+                                              (U8*) reginfo->strend);
             }
             else { /* Here the string isn't utf8 */
                 b1 = (locinput == reginfo->strbeg)
@@ -6146,11 +6149,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                          bool b1, b2;
                          b1 = (locinput == reginfo->strbeg)
                               ? 0 /* isWORDCHAR_L1('\n') */
-                             : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
-                                                       (U8*)(reginfo->strbeg)));
+                             : isWORDCHAR_utf8_safe(
+                                               reghop3((U8*)locinput,
+                                                       -1,
+                                                       (U8*)(reginfo->strbeg)),
+                                                    (U8*) reginfo->strend);
                          b2 = (NEXTCHR_IS_EOS)
                              ? 0 /* isWORDCHAR_L1('\n') */
-                            : isWORDCHAR_utf8((U8*)locinput);
+                            : isWORDCHAR_utf8_safe((U8*)locinput,
+                                                   (U8*) reginfo->strend);
                          match = cBOOL(b1 != b2);
                          break;
                      }
@@ -8969,7 +8976,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
              /* The complement of something that matches only ASCII matches all
               * non-ASCII, plus everything in ASCII that isn't in the class. */
             while (hardcount < max && scan < loceol
-                   && (! isASCII_utf8(scan)
+                   && (   ! isASCII_utf8_safe(scan, reginfo->strend)
                         || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
              {
                  scan += UTF8SKIP(scan);
@@ -9037,7 +9044,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      case _CC_ENUM_SPACE:
                          while (hardcount < max
                                 && scan < loceol
-                               && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
                          {
                              scan += UTF8SKIP(scan);
                              hardcount++;
@@ -9046,7 +9054,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      case _CC_ENUM_BLANK:
                          while (hardcount < max
                                 && scan < loceol
-                               && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
+                               && (to_complement
+                                    ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
                          {
                              scan += UTF8SKIP(scan);
                              hardcount++;
@@ -9055,7 +9064,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      case _CC_ENUM_XDIGIT:
                          while (hardcount < max
                                 && scan < loceol
-                               && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
                          {
                              scan += UTF8SKIP(scan);
                              hardcount++;
@@ -9064,7 +9074,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      case _CC_ENUM_VERTSPACE:
                          while (hardcount < max
                                 && scan < loceol
-                               && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
                          {
                              scan += UTF8SKIP(scan);
                              hardcount++;
@@ -9073,7 +9084,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                      case _CC_ENUM_CNTRL:
                          while (hardcount < max
                                 && scan < loceol
-                               && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
                          {
                              scan += UTF8SKIP(scan);
                              hardcount++;
@@ -9099,9 +9111,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
          }
  
          while (hardcount < max && scan < loceol
-               && to_complement ^ cBOOL(_generic_utf8(
+               && to_complement ^ cBOOL(_generic_utf8_safe(
                                         classnum,
                                         scan,
+                                       loceol,
                                         swash_fetch(PL_utf8_swash_ptrs[classnum],
                                                     (U8 *) scan,
                                                     TRUE))))
author	Karl Williamson <khw@cpan.org>
	Wed, 30 Nov 2016 16:53:17 +0000 (09:53 -0700)
committer	Karl Williamson <khw@cpan.org>
	Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
gv.c		patch | blob | blame | history
op.c		patch | blob | blame | history
pp.c		patch | blob | blame | history
pp_pack.c		patch | blob | blame | history
regcomp.c		patch | blob | blame | history
regexec.c		patch | blob | blame | history