This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Convert core (except toke.c) to use isFOO_utf8_safe()
authorKarl Williamson <khw@cpan.org>
Wed, 30 Nov 2016 16:53:17 +0000 (09:53 -0700)
committerKarl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
The previous commit added this feature; now this commit uses it in core.
toke.c is deferred to the next commit to aid in possible future
bisecting, because some of the changes there seem somewhat more likely
to expose bugs.

gv.c
op.c
pp.c
pp_pack.c
regcomp.c
regexec.c

diff --git a/gv.c b/gv.c
index 775951b..2570cf0 100644 (file)
--- a/gv.c
+++ b/gv.c
@@ -1591,7 +1591,10 @@ S_parse_gv_stash_name(pTHX_ HV **stash, GV **gv, const char **name,
 
     PERL_ARGS_ASSERT_PARSE_GV_STASH_NAME;
     
-    if (full_len > 2 && **name == '*' && isIDFIRST_lazy_if(*name + 1, is_utf8)) {
+    if (   full_len > 2
+        && **name == '*'
+        && isIDFIRST_lazy_if_safe(*name + 1, name_end, is_utf8))
+    {
         /* accidental stringify on a GV? */
         (*name)++;
     }
@@ -1676,7 +1679,7 @@ S_gv_is_in_main(pTHX_ const char *name, STRLEN len, const U32 is_utf8)
     PERL_ARGS_ASSERT_GV_IS_IN_MAIN;
     
     /* If it's an alphanumeric variable */
-    if ( len && isIDFIRST_lazy_if(name, is_utf8) ) {
+    if ( len && isIDFIRST_lazy_if_safe(name, name + len, is_utf8) ) {
         /* Some "normal" variables are always in main::,
          * like INC or STDOUT.
          */
@@ -2400,8 +2403,12 @@ Perl_gv_fetchpvn_flags(pTHX_ const char *nambeg, STRLEN full_len, I32 flags,
                 UTF8fARG(is_utf8, name_end-nambeg, nambeg));
     gv_init_pvn(gv, stash, name, len, (add & GV_ADDMULTI)|is_utf8);
 
-    if ( isIDFIRST_lazy_if(name, is_utf8) && !ckWARN(WARN_ONCE) )
+    if (   full_len != 0
+        && isIDFIRST_lazy_if_safe(name, name + full_len, is_utf8)
+        && !ckWARN(WARN_ONCE) )
+    {
         GvMULTI_on(gv) ;
+    }
 
     /* set up magic where warranted */
     if ( gv_magicalize(gv, stash, name, len, sv_type) ) {
@@ -2492,8 +2499,12 @@ Perl_gv_check(pTHX_ HV *stash)
                 )
                     gv_check(hv);              /* nested package */
            }
-            else if ( *HeKEY(entry) != '_'
-                        && isIDFIRST_lazy_if(HeKEY(entry), HeUTF8(entry)) ) {
+            else if (   HeKLEN(entry) != 0
+                     && *HeKEY(entry) != '_'
+                     && isIDFIRST_lazy_if_safe(HeKEY(entry),
+                                               HeKEY(entry) + HeKLEN(entry),
+                                               HeUTF8(entry)) )
+            {
                 const char *file;
                gv = MUTABLE_GV(HeVAL(entry));
                if (SvTYPE(gv) != SVt_PVGV || GvMULTI(gv))
diff --git a/op.c b/op.c
index 722ee35..394efef 100644 (file)
--- a/op.c
+++ b/op.c
@@ -652,11 +652,12 @@ Perl_allocmy(pTHX_ const char *const name, const STRLEN len, const U32 flags)
                   (UV)flags);
 
     /* complain about "my $<special_var>" etc etc */
-    if (len &&
-       !(is_our ||
-         isALPHA(name[1]) ||
-         ((flags & SVf_UTF8) && isIDFIRST_utf8((U8 *)name+1)) ||
-         (name[1] == '_' && len > 2)))
+    if (   len
+        && !(  is_our
+            || isALPHA(name[1])
+            || (   (flags & SVf_UTF8)
+                && isIDFIRST_utf8_safe((U8 *)name+1, name + len))
+            || (name[1] == '_' && len > 2)))
     {
        if (!(flags & SVf_UTF8 && UTF8_IS_START(name[1]))
         && isASCII(name[1])
diff --git a/pp.c b/pp.c
index b198b47..6fb20f6 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -5794,7 +5794,7 @@ PP(pp_split)
     orig = s;
     if (RX_EXTFLAGS(rx) & RXf_SKIPWHITE) {
        if (do_utf8) {
-           while (isSPACE_utf8(s))
+           while (isSPACE_utf8_safe(s, strend))
                s += UTF8SKIP(s);
        }
        else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET) {
@@ -5819,9 +5819,9 @@ PP(pp_split)
            m = s;
            /* this one uses 'm' and is a negative test */
            if (do_utf8) {
-               while (m < strend && ! isSPACE_utf8(m) ) {
+               while (m < strend && ! isSPACE_utf8_safe(m, strend) ) {
                    const int t = UTF8SKIP(m);
-                   /* isSPACE_utf8 returns FALSE for malform utf8 */
+                   /* isSPACE_utf8_safe returns FALSE for malform utf8 */
                    if (strend - m < t)
                        m = strend;
                    else
@@ -5859,7 +5859,7 @@ PP(pp_split)
 
            /* this one uses 's' and is a positive test */
            if (do_utf8) {
-               while (s < strend && isSPACE_utf8(s) )
+               while (s < strend && isSPACE_utf8_safe(s, strend) )
                    s +=  UTF8SKIP(s);
            }
            else if (get_regex_charset(RX_EXTFLAGS(rx)) == REGEX_LOCALE_CHARSET)
index a75229a..ee4c69e 100644 (file)
--- a/pp_pack.c
+++ b/pp_pack.c
@@ -1073,9 +1073,14 @@ S_unpack_rec(pTHX_ tempsym_t* symptr, const char *s, const char *strbeg, const c
                /* 'A' strips both nulls and spaces */
                const char *ptr;
                if (utf8 && (symptr->flags & FLAG_WAS_UTF8)) {
-                   for (ptr = s+len-1; ptr >= s; ptr--)
-                       if (*ptr != 0 && !UTF8_IS_CONTINUATION(*ptr) &&
-                           !isSPACE_utf8(ptr)) break;
+                    for (ptr = s+len-1; ptr >= s; ptr--) {
+                        if (   *ptr != 0
+                            && !UTF8_IS_CONTINUATION(*ptr)
+                            && !isSPACE_utf8_safe(ptr, strend))
+                        {
+                            break;
+                        }
+                    }
                    if (ptr >= s) ptr += UTF8SKIP(ptr);
                    else ptr++;
                    if (ptr > s+len)
index 095b13f..7578a25 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8271,17 +8271,18 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
 
     assert (RExC_parse <= RExC_end);
     if (RExC_parse == RExC_end) NOOP;
-    else if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
+    else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) {
          /* Note that the code here assumes well-formed UTF-8.  Skip IDFIRST by
           * using do...while */
        if (UTF)
            do {
                RExC_parse += UTF8SKIP(RExC_parse);
-           } while (isWORDCHAR_utf8((U8*)RExC_parse));
+           } while (   RExC_parse < RExC_end
+                     && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end));
        else
            do {
                RExC_parse++;
-           } while (isWORDCHAR(*RExC_parse));
+           } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse));
     } else {
         RExC_parse++; /* so the <- from the vFAIL is after the offending
                          character */
index e9c74e6..f720e7d 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1678,7 +1678,7 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */                 \
     tmp = TEST_UV(tmp);                                                        \
     LOAD_UTF8_CHARCLASS_ALNUM();                                               \
     REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */                     \
-        if (tmp == ! (TEST_UTF8((U8 *) s))) {                                  \
+        if (tmp == ! (TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) {          \
             tmp = !tmp;                                                        \
             IF_SUCCESS;                                                        \
         }                                                                      \
@@ -2050,7 +2050,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             goto do_boundu;
         }
 
-        FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
+        FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
         break;
 
     case NBOUNDL:
@@ -2063,14 +2063,14 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             goto do_nboundu;
         }
 
-        FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
+        FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe);
         break;
 
     case BOUND: /* regcomp.c makes sure that this only has the traditional \b
                    meaning */
         assert(FLAGS(c) == TRADITIONAL_BOUND);
 
-        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
+        FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
         break;
 
     case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b
@@ -2084,7 +2084,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                    meaning */
         assert(FLAGS(c) == TRADITIONAL_BOUND);
 
-        FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8);
+        FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
         break;
 
     case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b
@@ -2096,7 +2096,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
 
     case NBOUNDU:
         if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) {
-            FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+            FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
             break;
         }
 
@@ -2109,7 +2109,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
       do_boundu:
         switch((bound_type) FLAGS(c)) {
             case TRADITIONAL_BOUND:
-                FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
+                FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe);
                 break;
             case GCB_BOUND:
                 if (s == reginfo->strbeg) {
@@ -2387,7 +2387,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
         if (utf8_target) {
             /* The complement of something that matches only ASCII matches all
              * non-ASCII, plus everything in ASCII that isn't in the class. */
-            REXEC_FBC_UTF8_CLASS_SCAN(! isASCII_utf8(s)
+            REXEC_FBC_UTF8_CLASS_SCAN(   ! isASCII_utf8_safe(s, strend)
                                       || ! _generic_isCC_A(*s, FLAGS(c)));
             break;
         }
@@ -2451,27 +2451,27 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                                            macros */
                 case _CC_ENUM_SPACE:
                     REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isSPACE_utf8(s)));
+                        to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_BLANK:
                     REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isBLANK_utf8(s)));
+                        to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_XDIGIT:
                     REXEC_FBC_UTF8_CLASS_SCAN(
-                                       to_complement ^ cBOOL(isXDIGIT_utf8(s)));
+                       to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_VERTSPACE:
                     REXEC_FBC_UTF8_CLASS_SCAN(
-                                       to_complement ^ cBOOL(isVERTWS_utf8(s)));
+                       to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend)));
                     break;
 
                 case _CC_ENUM_CNTRL:
                     REXEC_FBC_UTF8_CLASS_SCAN(
-                                        to_complement ^ cBOOL(isCNTRL_utf8(s)));
+                        to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend)));
                     break;
 
                 default:
@@ -2496,9 +2496,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          * FBC macro instead of being expanded out.  Since we've loaded the
          * swash, we don't have to check for that each time through the loop */
         REXEC_FBC_UTF8_CLASS_SCAN(
-                to_complement ^ cBOOL(_generic_utf8(
+                to_complement ^ cBOOL(_generic_utf8_safe(
                                       classnum,
                                       s,
+                                      strend,
                                       swash_fetch(PL_utf8_swash_ptrs[classnum],
                                                   (U8 *) s, TRUE))));
         break;
@@ -6067,12 +6068,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                if (locinput == reginfo->strbeg)
                    b1 = isWORDCHAR_LC('\n');
                else {
-                    b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
-                                                        (U8*)(reginfo->strbeg)));
+                    b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1,
+                                                        (U8*)(reginfo->strbeg)),
+                                                 (U8*)(reginfo->strend));
                }
                 b2 = (NEXTCHR_IS_EOS)
                     ? isWORDCHAR_LC('\n')
-                    : isWORDCHAR_LC_utf8((U8*)locinput);
+                    : isWORDCHAR_LC_utf8_safe((U8*) locinput,
+                                              (U8*) reginfo->strend);
            }
            else { /* Here the string isn't utf8 */
                b1 = (locinput == reginfo->strbeg)
@@ -6146,11 +6149,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
                         bool b1, b2;
                         b1 = (locinput == reginfo->strbeg)
                              ? 0 /* isWORDCHAR_L1('\n') */
-                             : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
-                                                       (U8*)(reginfo->strbeg)));
+                             : isWORDCHAR_utf8_safe(
+                                               reghop3((U8*)locinput,
+                                                       -1,
+                                                       (U8*)(reginfo->strbeg)),
+                                                    (U8*) reginfo->strend);
                         b2 = (NEXTCHR_IS_EOS)
                             ? 0 /* isWORDCHAR_L1('\n') */
-                            : isWORDCHAR_utf8((U8*)locinput);
+                            : isWORDCHAR_utf8_safe((U8*)locinput,
+                                                   (U8*) reginfo->strend);
                         match = cBOOL(b1 != b2);
                         break;
                     }
@@ -8969,7 +8976,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
             /* The complement of something that matches only ASCII matches all
              * non-ASCII, plus everything in ASCII that isn't in the class. */
            while (hardcount < max && scan < loceol
-                   && (! isASCII_utf8(scan)
+                   && (   ! isASCII_utf8_safe(scan, reginfo->strend)
                        || ! _generic_isCC_A((U8) *scan, FLAGS(p))))
             {
                 scan += UTF8SKIP(scan);
@@ -9037,7 +9044,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     case _CC_ENUM_SPACE:
                         while (hardcount < max
                                && scan < loceol
-                               && (to_complement ^ cBOOL(isSPACE_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isSPACE_utf8_safe(scan, loceol))))
                         {
                             scan += UTF8SKIP(scan);
                             hardcount++;
@@ -9046,7 +9054,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     case _CC_ENUM_BLANK:
                         while (hardcount < max
                                && scan < loceol
-                               && (to_complement ^ cBOOL(isBLANK_utf8(scan))))
+                               && (to_complement
+                                    ^ cBOOL(isBLANK_utf8_safe(scan, loceol))))
                         {
                             scan += UTF8SKIP(scan);
                             hardcount++;
@@ -9055,7 +9064,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     case _CC_ENUM_XDIGIT:
                         while (hardcount < max
                                && scan < loceol
-                               && (to_complement ^ cBOOL(isXDIGIT_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isXDIGIT_utf8_safe(scan, loceol))))
                         {
                             scan += UTF8SKIP(scan);
                             hardcount++;
@@ -9064,7 +9074,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     case _CC_ENUM_VERTSPACE:
                         while (hardcount < max
                                && scan < loceol
-                               && (to_complement ^ cBOOL(isVERTWS_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isVERTWS_utf8_safe(scan, loceol))))
                         {
                             scan += UTF8SKIP(scan);
                             hardcount++;
@@ -9073,7 +9084,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
                     case _CC_ENUM_CNTRL:
                         while (hardcount < max
                                && scan < loceol
-                               && (to_complement ^ cBOOL(isCNTRL_utf8(scan))))
+                               && (to_complement
+                                   ^ cBOOL(isCNTRL_utf8_safe(scan, loceol))))
                         {
                             scan += UTF8SKIP(scan);
                             hardcount++;
@@ -9099,9 +9111,10 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         }
 
         while (hardcount < max && scan < loceol
-               && to_complement ^ cBOOL(_generic_utf8(
+               && to_complement ^ cBOOL(_generic_utf8_safe(
                                        classnum,
                                        scan,
+                                       loceol,
                                        swash_fetch(PL_utf8_swash_ptrs[classnum],
                                                    (U8 *) scan,
                                                    TRUE))))