This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
dist/Safe/t/safeutf8.t: Generalize to non-ASCII platform
[perl5.git] / toke.c
diff --git a/toke.c b/toke.c
index 67b6096..0eeafd4 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -503,6 +503,9 @@ S_ao(pTHX_ int toketype)
  * It prints "Missing operator before end of line" if there's nothing
  * after the missing operator, or "... before <...>" if there is something
  * after the missing operator.
+ *
+ * PL_bufptr is expected to point to the start of the thing that was found,
+ * and s after the next token or partial token.
  */
 
 STATIC void
@@ -1249,7 +1252,7 @@ buffer has reached the end of the input text.
 */
 
 #define LEX_FAKE_EOF 0x80000000
-#define LEX_NO_TERM  0x40000000
+#define LEX_NO_TERM  0x40000000 /* here-doc */
 
 bool
 Perl_lex_next_chunk(pTHX_ U32 flags)
@@ -1263,6 +1266,8 @@ Perl_lex_next_chunk(pTHX_ U32 flags)
     bool got_some;
     if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
        Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
+    if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
+       return FALSE;
     linestr = PL_parser->linestr;
     buf = SvPVX(linestr);
     if (!(flags & LEX_KEEP_PREVIOUS) &&
@@ -1517,6 +1522,8 @@ Perl_lex_read_space(pTHX_ U32 flags)
                incline(s);
                need_incline = 0;
            }
+       } else if (!c) {
+           s++;
        } else {
            break;
        }
@@ -1793,13 +1800,13 @@ S_skipspace_flags(pTHX_ char *s, U32 flags)
 {
     PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
     if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
-       while (s < PL_bufend && SPACE_OR_TAB(*s))
+       while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
            s++;
     } else {
        STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
        PL_bufptr = s;
        lex_read_space(flags | LEX_KEEP_PREVIOUS |
-               (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
+               (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
                    LEX_NO_NEXT_CHUNK : 0));
        s = PL_bufptr;
        PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
@@ -1902,6 +1909,7 @@ S_force_next(pTHX_ I32 type)
        tokereport(type, &NEXTVAL_NEXTTOKE);
     }
 #endif
+    assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
     PL_nexttype[PL_nexttoke] = type;
     PL_nexttoke++;
     if (PL_lex_state != LEX_KNOWNEXT) {
@@ -2276,7 +2284,9 @@ S_sublex_start(pTHX)
        return THING;
     }
     if (op_type == OP_CONST) {
-       SV *sv = tokeq(PL_lex_stuff);
+       SV *sv = PL_lex_stuff;
+       PL_lex_stuff = NULL;
+       sv = tokeq(sv);
 
        if (SvTYPE(sv) == SVt_PVIV) {
            /* Overloaded constants, nothing fancy: Convert to SVt_PV: */
@@ -2287,7 +2297,6 @@ S_sublex_start(pTHX)
            sv = nsv;
        }
        pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
-       PL_lex_stuff = NULL;
        return THING;
     }
 
@@ -2367,6 +2376,13 @@ S_sublex_push(pTHX)
     PL_lex_stuff = NULL;
     PL_sublex_info.repl = NULL;
 
+    /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
+       set for an inner quote-like operator and then an error causes scope-
+       popping.  We must not have a PL_lex_stuff value left dangling, as
+       that breaks assumptions elsewhere.  See bug #123617.  */
+    SAVEGENERICSV(PL_lex_stuff);
+    SAVEGENERICSV(PL_sublex_info.repl);
+
     PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
        = SvPVX(PL_linestr);
     PL_bufend += SvCUR(PL_linestr);
@@ -2471,7 +2487,6 @@ S_sublex_done(pTHX)
        PL_bufend = SvPVX(PL_linestr);
        PL_bufend += SvCUR(PL_linestr);
        PL_expect = XOPERATOR;
-       PL_sublex_info.sub_inwhat = 0;
        return ')';
     }
 }
@@ -3288,44 +3303,43 @@ S_scan_const(pTHX_ char *start)
                /* Here it looks like a named character */
 
                if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
-                   I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
-                               | PERL_SCAN_SILENT_ILLDIGIT
-                               | PERL_SCAN_DISALLOW_PREFIX;
-                   STRLEN len;
-
                    s += 2;         /* Skip to next char after the 'U+' */
-                   len = e - s;
-                   uv = grok_hex(s, &len, &flags, NULL);
-                   if (len == 0
-                    || (  len != (STRLEN)(e - s) && s[len] != '.'
-                       && PL_lex_inpat))
-                   {
-                     bad_NU:
-                       yyerror("Invalid hexadecimal number in \\N{U+...}");
-                       s = e + 1;
-                       continue;
-                   }
-
                    if (PL_lex_inpat) {
 
                         /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
-                        const char * const orig_s = s - 5;
-                        while (*s == '.') {
-                            s++;
-                            len = e - s;
-                            uv = grok_hex(s, &len, &flags, NULL);
-                            if (!len
-                             || (len != (STRLEN)(e - s) && s[len] != '.'))
-                                goto bad_NU;
+                        /* Check the syntax.  */
+                        const char *orig_s;
+                        orig_s = s - 5;
+                        if (!isXDIGIT(*s)) {
+                          bad_NU:
+                            yyerror(
+                                "Invalid hexadecimal number in \\N{U+...}"
+                            );
+                            s = e + 1;
+                            continue;
+                        }
+                        while (++s < e) {
+                            if (isXDIGIT(*s))
+                                continue;
+                            else if ((*s == '.' || *s == '_')
+                                  && isXDIGIT(s[1]))
+                                continue;
+                            goto bad_NU;
                         }
 
-                        /* Pass everything through unchanged.  The reason we
-                         * evaluate the numbers is to make sure there wasn't a
-                         * syntax error.  +1 is for the '}' */
+                        /* Pass everything through unchanged.
+                         * +1 is for the '}' */
                         Copy(orig_s, d, e - orig_s + 1, char);
                         d += e - orig_s + 1;
                    }
                    else {  /* Not a pattern: convert the hex to string */
+                        I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+                               | PERL_SCAN_SILENT_ILLDIGIT
+                               | PERL_SCAN_DISALLOW_PREFIX;
+                        STRLEN len = e - s;
+                        uv = grok_hex(s, &len, &flags, NULL);
+                        if (len == 0 || (len != (STRLEN)(e - s)))
+                            goto bad_NU;
 
                          /* If the destination is not in utf8, unconditionally
                          * recode it to be so.  This is because \N{} implies
@@ -4300,13 +4314,8 @@ Perl_yylex(pTHX)
        SvREFCNT_dec(tmp);
     } );
 
-    switch (PL_lex_state) {
-    case LEX_NORMAL:
-    case LEX_INTERPNORMAL:
-       break;
-
     /* when we've already built the next token, just pull it out of the queue */
-    case LEX_KNOWNEXT:
+    if (PL_nexttoke) {
        PL_nexttoke--;
        pl_yylval = PL_nextval[PL_nexttoke];
        if (!PL_nexttoke) {
@@ -4331,6 +4340,12 @@ Perl_yylex(pTHX)
            }
            return REPORT(next_type == 'p' ? pending_ident() : next_type);
        }
+    }
+
+    switch (PL_lex_state) {
+    case LEX_NORMAL:
+    case LEX_INTERPNORMAL:
+       break;
 
     /* interpolated case modifiers like \L \U, including \Q and \E.
        when we get here, PL_bufptr is at the \
@@ -4479,6 +4494,14 @@ Perl_yylex(pTHX)
        /* FALLTHROUGH */
 
     case LEX_INTERPEND:
+       /* Treat state as LEX_NORMAL if we have no inner lexing scope.
+          XXX This hack can be removed if we stop setting PL_lex_state to
+          LEX_KNOWNEXT, as can the hack under LEX_INTREPCONCAT below.  */
+       if (UNLIKELY(!PL_lex_inwhat)) {
+           PL_lex_state = LEX_NORMAL;
+           break;
+       }
+
        if (PL_lex_dojoin) {
            const U8 dojoin_was = PL_lex_dojoin;
            PL_lex_dojoin = FALSE;
@@ -4530,6 +4553,14 @@ Perl_yylex(pTHX)
            Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
                       (long) PL_lex_brackets);
 #endif
+       /* Treat state as LEX_NORMAL when not in an inner lexing scope.
+          XXX This hack can be removed if we stop setting PL_lex_state to
+          LEX_KNOWNEXT.  */
+       if (UNLIKELY(!PL_lex_inwhat)) {
+           PL_lex_state = LEX_NORMAL;
+           break;
+       }
+
        if (PL_bufptr == PL_bufend)
            return REPORT(sublex_done());
 
@@ -4610,7 +4641,8 @@ Perl_yylex(pTHX)
     case 26:
        goto fake_eof;                  /* emulate EOF on ^D or ^Z */
     case 0:
-       if (!PL_rsfp && (!PL_parser->filtered || s+1 < PL_bufend)) {
+       if ((!PL_rsfp || PL_lex_inwhat)
+        && (!PL_parser->filtered || s+1 < PL_bufend)) {
            PL_last_uni = 0;
            PL_last_lop = 0;
            if (PL_lex_brackets &&
@@ -4949,7 +4981,6 @@ Perl_yylex(pTHX)
        }
        if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
            PL_lex_state = LEX_FORMLINE;
-           NEXTVAL_NEXTTOKE.ival = 0;
            force_next(FORMRBRACK);
            TOKEN(';');
        }
@@ -4992,7 +5023,6 @@ Perl_yylex(pTHX)
                 incline(s);
            if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
                PL_lex_state = LEX_FORMLINE;
-               NEXTVAL_NEXTTOKE.ival = 0;
                force_next(FORMRBRACK);
                TOKEN(';');
            }
@@ -5324,7 +5354,7 @@ Perl_yylex(pTHX)
                    sv_catsv(sv, PL_lex_stuff);
                    attrs = op_append_elem(OP_LIST, attrs,
                                        newSVOP(OP_CONST, 0, sv));
-                   SvREFCNT_dec(PL_lex_stuff);
+                   SvREFCNT_dec_NN(PL_lex_stuff);
                    PL_lex_stuff = NULL;
                }
                else {
@@ -5741,13 +5771,12 @@ Perl_yylex(pTHX)
        PL_tokenbuf[0] = '&';
        s = scan_ident(s - 1, PL_tokenbuf + 1,
                       sizeof PL_tokenbuf - 1, TRUE);
+       pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
        if (PL_tokenbuf[1]) {
-           PL_expect = XOPERATOR;
            force_ident_maybe_lex('&');
        }
        else
            PREREF('&');
-       pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
        TERM('&');
 
     case '|':
@@ -5988,8 +6017,14 @@ Perl_yylex(pTHX)
        PL_tokenbuf[0] = '$';
        s = scan_ident(s, PL_tokenbuf + 1,
                       sizeof PL_tokenbuf - 1, FALSE);
-       if (PL_expect == XOPERATOR)
-           no_op("Scalar", s);
+       if (PL_expect == XOPERATOR) {
+           d = s;
+           if (PL_bufptr > s) {
+               d = PL_bufptr-1;
+               PL_bufptr = PL_oldbufptr;
+           }
+           no_op("Scalar", d);
+       }
        if (!PL_tokenbuf[1]) {
            if (s == PL_bufend)
                yyerror("Final $ should be \\$ or $name");
@@ -7671,10 +7706,8 @@ Perl_yylex(pTHX)
            }
            if (!words)
                words = newNULLLIST();
-           if (PL_lex_stuff) {
-               SvREFCNT_dec(PL_lex_stuff);
-               PL_lex_stuff = NULL;
-           }
+           SvREFCNT_dec_NN(PL_lex_stuff);
+           PL_lex_stuff = NULL;
            PL_expect = XOPERATOR;
            pl_yylval.opval = sawparens(words);
            TOKEN(QWLIST);
@@ -8580,7 +8613,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni)
 
     PERL_ARGS_ASSERT_SCAN_IDENT;
 
-    if (isSPACE(*s))
+    if (isSPACE(*s) || !*s)
        s = skipspace(s);
     if (isDIGIT(*s)) {
        while (isDIGIT(*s)) {
@@ -8989,10 +9022,8 @@ S_scan_subst(pTHX_ char *start)
     first_line = CopLINE(PL_curcop);
     s = scan_str(s,FALSE,FALSE,FALSE,NULL);
     if (!s) {
-       if (PL_lex_stuff) {
-           SvREFCNT_dec(PL_lex_stuff);
-           PL_lex_stuff = NULL;
-       }
+       SvREFCNT_dec_NN(PL_lex_stuff);
+       PL_lex_stuff = NULL;
        Perl_croak(aTHX_ "Substitution replacement not terminated");
     }
     PL_multi_start = first_start;      /* so whole substitution is taken together */
@@ -9071,10 +9102,8 @@ S_scan_trans(pTHX_ char *start)
 
     s = scan_str(s,FALSE,FALSE,FALSE,NULL);
     if (!s) {
-       if (PL_lex_stuff) {
-           SvREFCNT_dec(PL_lex_stuff);
-           PL_lex_stuff = NULL;
-       }
+       SvREFCNT_dec_NN(PL_lex_stuff);
+       PL_lex_stuff = NULL;
        Perl_croak(aTHX_ "Transliteration replacement not terminated");
     }