This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Convert to use isFOO_utf8_safe() macros
authorKarl Williamson <khw@cpan.org>
Wed, 14 Dec 2016 01:34:12 +0000 (18:34 -0700)
committerKarl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:34 +0000 (16:48 -0700)
toke.c

diff --git a/toke.c b/toke.c
index 3355db2..2996177 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -527,11 +527,17 @@ S_no_op(pTHX_ const char *const what, char *s)
        if (is_first)
            Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
                    "\t(Missing semicolon on previous line?)\n");
-       else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) {
+        else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr,
+                                                           PL_bufend,
+                                                           UTF))
+        {
            const char *t;
-           for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':');
-                                                            t += UTF ? UTF8SKIP(t) : 1)
+           for (t = PL_oldoldbufptr;
+                 (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':');
+                 t += UTF ? UTF8SKIP(t) : 1)
+            {
                NOOP;
+            }
            if (t < PL_bufptr && isSPACE(*t))
                Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
                        "\t(Do you need to predeclare %" UTF8f "?)\n",
@@ -1886,7 +1892,7 @@ S_check_uni(pTHX)
     while (isSPACE(*PL_last_uni))
        PL_last_uni++;
     s = PL_last_uni;
-    while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
+    while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-')
        s += UTF ? UTF8SKIP(s) : 1;
     if ((t = strchr(s, '(')) && t < PL_bufptr)
        return;
@@ -2056,7 +2062,7 @@ S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack)
 
     start = skipspace(start);
     s = start;
-    if (isIDFIRST_lazy_if(s,UTF)
+    if (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
         || (allow_pack && *s == ':' && s[1] == ':') )
     {
        s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len);
@@ -3241,8 +3247,12 @@ S_scan_const(pTHX_ char *start)
           (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
           */
        else if (*s == '@' && s[1]) {
-           if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1]))
+           if (UTF
+               ? isIDFIRST_utf8_safe(s+1, send)
+               : isWORDCHAR_A(s[1]))
+            {
                break;
+            }
            if (strchr(":'{$", s[1]))
                break;
            if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-'))
@@ -3981,7 +3991,7 @@ S_intuit_more(pTHX_ char *s)
            case '&':
            case '$':
                weight -= seen[un_char] * 10;
-               if (isWORDCHAR_lazy_if(s+1,UTF)) {
+               if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) {
                    int len;
                     char *tmp = PL_bufend;
                     PL_bufend = (char*)send;
@@ -4457,11 +4467,17 @@ S_check_scalar_slice(pTHX_ char *s)
 {
     s++;
     while (*s == ' ' || *s == '\t') s++;
-    if (*s == 'q' && s[1] == 'w'
-     && !isWORDCHAR_lazy_if(s+2,UTF))
+    if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2,
+                                                             PL_bufend,
+                                                             UTF))
+    {
        return;
-    while (*s && (isWORDCHAR_lazy_if(s,UTF) || strchr(" \t$#+-'\"", *s)))
-       s += UTF ? UTF8SKIP(s) : 1;
+    }
+    while (    isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)
+           || (*s && strchr(" \t$#+-'\"", *s)))
+    {
+        s += UTF ? UTF8SKIP(s) : 1;
+    }
     if (*s == '}' || *s == ']')
        pl_yylval.ival = OPpSLICEWARNING;
 }
@@ -4868,7 +4884,7 @@ Perl_yylex(pTHX)
                 break;
             }
             s = skipspace(s);
-            if (isIDFIRST_lazy_if(s, UTF)) {
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                 char *dest = PL_tokenbuf + 1;
                 /* read var name, including sigil, into PL_tokenbuf */
                 PL_tokenbuf[0] = sigil;
@@ -4912,7 +4928,7 @@ Perl_yylex(pTHX)
                                                   1 /* 1 means die */ );
                 NOT_REACHED; /* NOTREACHED */
             }
-            if (isIDFIRST_utf8((U8*)s)) {
+            if (isIDFIRST_utf8_safe(s, PL_bufend)) {
                 goto keylookup;
             }
         }
@@ -5445,7 +5461,7 @@ Perl_yylex(pTHX)
                    PL_expect = XPOSTDEREF;
                    TOKEN(ARROW);
                }
-               if (isIDFIRST_lazy_if(s,UTF)) {
+               if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                    s = force_word(s,METHOD,FALSE,TRUE);
                    TOKEN(ARROW);
                }
@@ -5630,7 +5646,7 @@ Perl_yylex(pTHX)
         grabattrs:
            s = skipspace(s);
            attrs = NULL;
-           while (isIDFIRST_lazy_if(s,UTF)) {
+            while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                I32 tmp;
                SV *sv;
                d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len);
@@ -5836,7 +5852,7 @@ Perl_yylex(pTHX)
                while (d < PL_bufend && SPACE_OR_TAB(*d))
                    d++;
            }
-           if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) {
+            if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) {
                d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1,
                              FALSE, &len);
                while (d < PL_bufend && SPACE_OR_TAB(*d))
@@ -5956,13 +5972,19 @@ Perl_yylex(pTHX)
                    }
                    else
                        /* skip plain q word */
-                       while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
+                       while (   t < PL_bufend
+                               && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                        {
                            t += UTF ? UTF8SKIP(t) : 1;
+                        }
                }
-               else if (isWORDCHAR_lazy_if(t,UTF)) {
+               else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) {
                    t += UTF ? UTF8SKIP(t) : 1;
-                   while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF))
+                   while (   t < PL_bufend
+                           && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF))
+                    {
                        t += UTF ? UTF8SKIP(t) : 1;
+                    }
                }
                while (t < PL_bufend && isSPACE(*t))
                    t++;
@@ -6057,8 +6079,9 @@ Perl_yylex(pTHX)
        }
        s--;
        if (PL_expect == XOPERATOR) {
-           if (PL_bufptr == PL_linestart && ckWARN(WARN_SEMICOLON)
-               && isIDFIRST_lazy_if(s,UTF))
+           if (   PL_bufptr == PL_linestart
+                && ckWARN(WARN_SEMICOLON)
+               && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
            {
                CopLINE_dec(PL_curcop);
                Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi);
@@ -6340,7 +6363,10 @@ Perl_yylex(pTHX)
            POSTDEREF('$');
        }
 
-       if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-@", s[2]))) {
+       if (   s[1] == '#'
+            && (   isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF)
+                || strchr("{$:+-@", s[2])))
+        {
            PL_tokenbuf[0] = '@';
            s = scan_ident(s + 1, PL_tokenbuf + 1,
                           sizeof PL_tokenbuf - 1, FALSE);
@@ -6389,8 +6415,12 @@ Perl_yylex(pTHX)
                    if (ckWARN(WARN_SYNTAX)) {
                        char *t = s+1;
 
-                       while (isSPACE(*t) || isWORDCHAR_lazy_if(t,UTF) || *t == '$')
+                        while (   isSPACE(*t)
+                               || isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)
+                               || *t == '$')
+                        {
                            t += UTF ? UTF8SKIP(t) : 1;
+                        }
                        if (*t++ == ',') {
                            PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
                            while (t < PL_bufend && *t != ']')
@@ -6411,7 +6441,7 @@ Perl_yylex(pTHX)
                            do {
                                t++;
                            } while (isSPACE(*t));
-                           if (isIDFIRST_lazy_if(t,UTF)) {
+                           if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) {
                                STRLEN len;
                                t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE,
                                              &len);
@@ -6434,9 +6464,12 @@ Perl_yylex(pTHX)
                    PL_expect = XOPERATOR;
                else if (strchr("$@\"'`q", *s))
                    PL_expect = XTERM;          /* e.g. print $fh "foo" */
-               else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF))
+               else if (   strchr("&*<%", *s)
+                         && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF))
+                {
                    PL_expect = XTERM;          /* e.g. print $fh &sub */
-               else if (isIDFIRST_lazy_if(s,UTF)) {
+                }
+               else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                    char tmpbuf[sizeof PL_tokenbuf];
                    int t2;
                    scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len);
@@ -6535,10 +6568,10 @@ Perl_yylex(pTHX)
         }
        else {
            /* Disable warning on "study /blah/" */
-           if (PL_oldoldbufptr == PL_last_uni
-            && (*PL_last_uni != 's' || s - PL_last_uni < 5
-                || memNE(PL_last_uni, "study", 5)
-                || isWORDCHAR_lazy_if(PL_last_uni+5,UTF)
+           if (    PL_oldoldbufptr == PL_last_uni
+                && (   *PL_last_uni != 's' || s - PL_last_uni < 5
+                    || memNE(PL_last_uni, "study", 5)
+                    || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF)
             ))
                check_uni();
            s = scan_pat(s,OP_MATCH);
@@ -7065,8 +7098,8 @@ Perl_yylex(pTHX)
                    s = skipspace(s);
 
                    /* Two barewords in a row may indicate method call. */
-
-                   if ((isIDFIRST_lazy_if(s,UTF) || *s == '$')
+                   if (   (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
+                            || *s == '$')
                         && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
                     {
                        goto method;
@@ -7151,9 +7184,11 @@ Perl_yylex(pTHX)
 
                /* If followed by a bareword, see if it looks like indir obj. */
 
-               if (tmp == 1 && !orig_keyword
-                       && (isIDFIRST_lazy_if(s,UTF) || *s == '$')
-                       && (tmp = intuit_method(s, lex ? NULL : sv, cv))) {
+               if (   tmp == 1
+                    && !orig_keyword
+                    && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$')
+                    && (tmp = intuit_method(s, lex ? NULL : sv, cv)))
+                {
                  method:
                    if (lex && !off) {
                        assert(cSVOPx(pl_yylval.opval)->op_sv == sv);
@@ -7647,7 +7682,9 @@ Perl_yylex(pTHX)
                return REPORT(0);
            pl_yylval.ival = CopLINE(PL_curcop);
            s = skipspace(s);
-           if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) {
+            if (   PL_expect == XSTATE
+                && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
+            {
                char *p = s;
 
                if ((PL_bufend - p) >= 3
@@ -7660,7 +7697,7 @@ Perl_yylex(pTHX)
                    p += 3;
                p = skipspace(p);
                 /* skip optional package name, as in "for my abc $x (..)" */
-               if (isIDFIRST_lazy_if(p,UTF)) {
+               if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) {
                    p = scan_word(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
                    p = skipspace(p);
                }
@@ -7903,7 +7940,7 @@ Perl_yylex(pTHX)
            }
            PL_in_my = (U16)tmp;
            s = skipspace(s);
-           if (isIDFIRST_lazy_if(s,UTF)) {
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
                s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len);
                if (len == 3 && strEQs(PL_tokenbuf, "sub"))
                    goto really_sub;
@@ -7953,10 +7990,10 @@ Perl_yylex(pTHX)
 
        case KEY_open:
            s = skipspace(s);
-           if (isIDFIRST_lazy_if(s,UTF)) {
-          const char *t;
-          d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
-              &len);
+            if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
+                const char *t;
+                d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE,
+                              &len);
                for (t=d; isSPACE(*t);)
                    t++;
                if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE)
@@ -8113,9 +8150,13 @@ Perl_yylex(pTHX)
            {
                *PL_tokenbuf = '\0';
                s = force_word(s,BAREWORD,TRUE,TRUE);
-               if (isIDFIRST_lazy_if(PL_tokenbuf,UTF))
+                if (isIDFIRST_lazy_if_safe(PL_tokenbuf,
+                                           PL_tokenbuf + sizeof(PL_tokenbuf),
+                                           UTF))
+                {
                    gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf),
                                 GV_ADD | (UTF ? SVf_UTF8 : 0));
+                }
                else if (*s == '<')
                    yyerror("<> at require-statement should be quotes");
            }
@@ -8317,7 +8358,7 @@ Perl_yylex(pTHX)
                s = skipspace(s);
                 d = SvPVX(PL_linestr)+off;
 
-               if (isIDFIRST_lazy_if(s,UTF)
+                if (   isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)
                     || *s == '\''
                     || (*s == ':' && s[1] == ':'))
                {
@@ -8763,10 +8804,10 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what)
        s++;
     while (s < PL_bufend && isSPACE(*s))
        s++;
-    if (isIDFIRST_lazy_if(s,UTF)) {
+    if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) {
        const char * const w = s;
         s += UTF ? UTF8SKIP(s) : 1;
-       while (isWORDCHAR_lazy_if(s,UTF))
+       while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
            s += UTF ? UTF8SKIP(s) : 1;
        while (s < PL_bufend && isSPACE(*s))
            s++;
@@ -8945,15 +8986,16 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
     for (;;) {
         if (*d >= e)
             Perl_croak(aTHX_ "%s", ident_too_long);
-        if (is_utf8 && isIDFIRST_utf8((U8*)*s)) {
+        if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) {
              /* The UTF-8 case must come first, otherwise things
              * like c\N{COMBINING TILDE} would start failing, as the
              * isWORDCHAR_A case below would gobble the 'c' up.
              */
 
             char *t = *s + UTF8SKIP(*s);
-            while (isIDCONT_utf8((U8*)t))
+            while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) {
                 t += UTF8SKIP(t);
+            }
             if (*d + (t - *s) > e)
                 Perl_croak(aTHX_ "%s", ident_too_long);
             Copy(*s, *d, t - *s, char);
@@ -8965,7 +9007,10 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package,
                 *(*d)++ = *(*s)++;
             } while (isWORDCHAR_A(**s) && *d < e);
         }
-        else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) {
+        else if (   allow_package
+                 && **s == '\''
+                 && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8))
+        {
             *(*d)++ = ':';
             *(*d)++ = ':';
             (*s)++;
@@ -9016,10 +9061,10 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN
  *      Because all ASCII characters have the same representation whether
  *      encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and
  *      '{' without knowing if is UTF-8 or not. */
-#define VALID_LEN_ONE_IDENT(s, is_utf8)                                       \
-    (isGRAPH_A(*(s)) || ((is_utf8)                                            \
-                         ? isIDFIRST_utf8((U8*) (s))                          \
-                         : (isGRAPH_L1(*s)                                    \
+#define VALID_LEN_ONE_IDENT(s, e, is_utf8)                                  \
+    (isGRAPH_A(*(s)) || ((is_utf8)                                          \
+                         ? isIDFIRST_utf8_safe(s, e)                        \
+                         : (isGRAPH_L1(*s)                                  \
                             && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD)))))
 
 STATIC char *
@@ -9060,7 +9105,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
     /* Here, it is not a run-of-the-mill identifier name */
 
     if (*s == '$' && s[1]
-        && (isIDFIRST_lazy_if(s+1,is_utf8)
+        && (   isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8)
             || isDIGIT_A((U8)s[1])
             || s[1] == '$'
             || s[1] == '{'
@@ -9083,7 +9128,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
     if ((s <= PL_bufend - (is_utf8)
                           ? UTF8SKIP(s)
                           : 1)
-        && VALID_LEN_ONE_IDENT(s, is_utf8))
+        && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8))
     {
         if (is_utf8) {
             const STRLEN skip = UTF8SKIP(s);
@@ -9111,7 +9156,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
         bool skip;
         char *s2;
         /* If we were processing {...} notation then...  */
-       if (isIDFIRST_lazy_if(d,is_utf8)) {
+        if (isIDFIRST_lazy_if_safe(d, e, is_utf8)) {
             /* if it starts as a valid identifier, assume that it is one.
                (the later check for } being at the expected point will trap
                cases where this doesn't pan out.)  */
@@ -9227,7 +9272,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse
     STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
 
     if ( charlen != 1 || ! strchr(valid_flags, c) ) {
-        if (isWORDCHAR_lazy_if(*s, UTF)) {
+        if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) {
             yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s),
                        UTF ? SVf_UTF8 : 0);
             (*s) += charlen;
@@ -9612,10 +9657,12 @@ S_scan_heredoc(pTHX_ char *s)
            s++, term = '\'';
        else
            term = '"';
-       if (!isWORDCHAR_lazy_if(s,UTF))
+       if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF))
            deprecate("bare << to mean <<\"\"");
        peek = s;
-       while (isWORDCHAR_lazy_if(peek,UTF)) {
+        while (
+               isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF))
+        {
            peek += UTF ? UTF8SKIP(peek) : 1;
        }
        len = (peek - s >= e - d) ? (e - d) : (peek - s);
@@ -10032,8 +10079,9 @@ S_scan_inputsymbol(pTHX_ char *start)
     if (*d == '$' && d[1]) d++;
 
     /* allow <Pkg'VALUE> or <Pkg::VALUE> */
-    while (*d && (isWORDCHAR_lazy_if(d,UTF) || *d == '\'' || *d == ':'))
+    while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') {
        d += UTF ? UTF8SKIP(d) : 1;
+    }
 
     /* If we've tried to read what we allow filehandles to look like, and
        there's still text left, then it must be a glob() and not a getline.
@@ -11905,7 +11953,7 @@ Perl_parse_label(pTHX_ U32 flags)
        STRLEN wlen, bufptr_pos;
        lex_read_space(0);
        t = s = PL_bufptr;
-        if (!isIDFIRST_lazy_if(s, UTF))
+        if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF))
            goto no_label;
        t = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen);
        if (word_takes_any_delimiter(s, wlen))