X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/451f421fe4742646fa2efbed0f45a19f0713d00f..0a520fced6c7f8a21494d4e9c42cd89f3a8ff5a5:/toke.c diff --git a/toke.c b/toke.c index 1079e94..332f653 100644 --- a/toke.c +++ b/toke.c @@ -110,7 +110,7 @@ Individual members of C have their own documentation. # define PL_nextval (PL_parser->nextval) #endif -static const char ident_too_long[] = "Identifier too long"; +static const char* const ident_too_long = "Identifier too long"; #ifdef PERL_MAD # define CURMAD(slot,sv) if (PL_madskills) { curmad(slot,sv); sv = 0; } @@ -358,7 +358,7 @@ static struct debug_tokens { { GIVEN, TOKENTYPE_IVAL, "GIVEN" }, { HASHBRACK, TOKENTYPE_NONE, "HASHBRACK" }, { IF, TOKENTYPE_IVAL, "IF" }, - { LABEL, TOKENTYPE_OPVAL, "LABEL" }, + { LABEL, TOKENTYPE_PVAL, "LABEL" }, { LOCAL, TOKENTYPE_IVAL, "LOCAL" }, { LOOPEX, TOKENTYPE_OPNUM, "LOOPEX" }, { LSTOP, TOKENTYPE_OPNUM, "LSTOP" }, @@ -544,7 +544,7 @@ S_no_op(pTHX_ const char *const what, char *s) "\t(Missing semicolon on previous line?)\n"); else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) { const char *t; - for (t = PL_oldoldbufptr; (isALNUM_lazy_if(t,UTF) || *t == ':'); + for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':'); t += UTF ? UTF8SKIP(t) : 1) NOOP; if (t < PL_bufptr && isSPACE(*t)) @@ -784,6 +784,8 @@ Perl_parser_free(pTHX_ const yy_parser *parser) (parser->old_parser && parser->rsfp != parser->old_parser->rsfp))) PerlIO_close(parser->rsfp); SvREFCNT_dec(parser->rsfp_filters); + SvREFCNT_dec(parser->lex_stuff); + SvREFCNT_dec(parser->sublex_info.repl); Safefree(parser->lex_brackstack); Safefree(parser->lex_casestack); @@ -792,6 +794,37 @@ Perl_parser_free(pTHX_ const yy_parser *parser) Safefree(parser); } +void +Perl_parser_free_nexttoke_ops(pTHX_ yy_parser *parser, OPSLAB *slab) +{ +#ifdef PERL_MAD + I32 nexttoke = parser->lasttoke; +#else + I32 nexttoke = parser->nexttoke; +#endif + PERL_ARGS_ASSERT_PARSER_FREE_NEXTTOKE_OPS; + while (nexttoke--) { +#ifdef PERL_MAD + if (S_is_opval_token(parser->nexttoke[nexttoke].next_type + & 0xffff) + && parser->nexttoke[nexttoke].next_val.opval + && parser->nexttoke[nexttoke].next_val.opval->op_slabbed + && OpSLAB(parser->nexttoke[nexttoke].next_val.opval) == slab) { + op_free(parser->nexttoke[nexttoke].next_val.opval); + parser->nexttoke[nexttoke].next_val.opval = NULL; + } +#else + if (S_is_opval_token(parser->nexttype[nexttoke] & 0xffff) + && parser->nextval[nexttoke].opval + && parser->nextval[nexttoke].opval->op_slabbed + && OpSLAB(parser->nextval[nexttoke].opval) == slab) { + op_free(parser->nextval[nexttoke].opval); + parser->nextval[nexttoke].opval = NULL; + } +#endif + } +} + /* =for apidoc AmxU|SV *|PL_parser-Elinestr @@ -976,10 +1009,13 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) if (flags & LEX_STUFF_UTF8) { goto plain_copy; } else { - STRLEN highhalf = 0; + STRLEN highhalf = 0; /* Count of variants */ const char *p, *e = pv+len; - for (p = pv; p != e; p++) - highhalf += !!(((U8)*p) & 0x80); + for (p = pv; p != e; p++) { + if (! UTF8_IS_INVARIANT(*p)) { + highhalf++; + } + } if (!highhalf) goto plain_copy; lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf); @@ -990,9 +1026,9 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) PL_parser->bufend += len+highhalf; for (p = pv; p != e; p++) { U8 c = (U8)*p; - if (c & 0x80) { - *bufptr++ = (char)(0xc0 | (c >> 6)); - *bufptr++ = (char)(0x80 | (c & 0x3f)); + if (! UTF8_IS_INVARIANT(c)) { + *bufptr++ = UTF8_TWO_BYTE_HI(c); + *bufptr++ = UTF8_TWO_BYTE_LO(c); } else { *bufptr++ = (char)c; } @@ -1004,14 +1040,13 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) const char *p, *e = pv+len; for (p = pv; p != e; p++) { U8 c = (U8)*p; - if (c >= 0xc4) { + if (UTF8_IS_ABOVE_LATIN1(c)) { Perl_croak(aTHX_ "Lexing code attempted to stuff " "non-Latin-1 character into Latin-1 input"); - } else if (c >= 0xc2 && p+1 != e && - (((U8)p[1]) & 0xc0) == 0x80) { + } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) { p++; highhalf++; - } else if (c >= 0x80) { + } else if (! UTF8_IS_INVARIANT(c)) { /* malformed UTF-8 */ ENTER; SAVESPTR(PL_warnhook); @@ -1028,17 +1063,20 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len-highhalf); PL_parser->bufend += len-highhalf; - for (p = pv; p != e; p++) { - U8 c = (U8)*p; - if (c & 0x80) { - *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f)); - p++; - } else { - *bufptr++ = (char)c; + p = pv; + while (p < e) { + if (UTF8_IS_INVARIANT(*p)) { + *bufptr++ = *p; + p++; } + else { + assert(p < e -1 ); + *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)); + p += 2; + } } } else { - plain_copy: + plain_copy: lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len); bufptr = PL_parser->bufptr; Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char); @@ -1386,10 +1424,10 @@ Perl_lex_peek_unichar(pTHX_ U32 flags) bufend = PL_parser->bufend; } head = (U8)*s; - if (!(head & 0x80)) + if (UTF8_IS_INVARIANT(head)) return head; - if (head & 0x40) { - len = PL_utf8skip[head]; + if (UTF8_IS_START(head)) { + len = UTF8SKIP(&head); while ((STRLEN)(bufend-s) < len) { if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS)) break; @@ -1697,7 +1735,7 @@ S_incline(pTHX_ const char *s) /* skip space before PL_thistoken */ STATIC char * -S_skipspace0(pTHX_ register char *s) +S_skipspace0(pTHX_ char *s) { PERL_ARGS_ASSERT_SKIPSPACE0; @@ -1718,7 +1756,7 @@ S_skipspace0(pTHX_ register char *s) /* skip space after PL_thistoken */ STATIC char * -S_skipspace1(pTHX_ register char *s) +S_skipspace1(pTHX_ char *s) { const char *start = s; I32 startoff = start - SvPVX(PL_linestr); @@ -1745,7 +1783,7 @@ S_skipspace1(pTHX_ register char *s) } STATIC char * -S_skipspace2(pTHX_ register char *s, SV **svp) +S_skipspace2(pTHX_ char *s, SV **svp) { char *start; const I32 bufptroff = PL_bufptr - SvPVX(PL_linestr); @@ -1798,7 +1836,7 @@ S_update_debugger_info(pTHX_ SV *orig_sv, const char *const buf, STRLEN len) */ STATIC char * -S_skipspace(pTHX_ register char *s) +S_skipspace(pTHX_ char *s) { #ifdef PERL_MAD char *start = s; @@ -1853,7 +1891,7 @@ S_check_uni(pTHX) while (isSPACE(*PL_last_uni)) PL_last_uni++; s = PL_last_uni; - while (isALNUM_lazy_if(s,UTF) || *s == '-') + while (isWORDCHAR_lazy_if(s,UTF) || *s == '-') s++; if ((t = strchr(s, '(')) && t < PL_bufptr) return; @@ -2000,11 +2038,6 @@ S_force_next(pTHX_ I32 type) tokereport(type, &NEXTVAL_NEXTTOKE); } #endif - /* Don’t let opslab_force_free snatch it */ - if (S_is_opval_token(type & 0xffff) && NEXTVAL_NEXTTOKE.opval) { - assert(!NEXTVAL_NEXTTOKE.opval->op_savefree); - NEXTVAL_NEXTTOKE.opval->op_savefree = 1; - } #ifdef PERL_MAD if (PL_curforce < 0) start_force(PL_lasttoke); @@ -2077,7 +2110,7 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len) */ STATIC char * -S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick) +S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack, int allow_initial_tick) { dVAR; char *s; @@ -2126,14 +2159,14 @@ S_force_word(pTHX_ register char *start, int token, int check_keyword, int allow */ STATIC void -S_force_ident(pTHX_ register const char *s, int kind) +S_force_ident(pTHX_ const char *s, int kind) { dVAR; PERL_ARGS_ASSERT_FORCE_IDENT; - if (*s) { - const STRLEN len = strlen(s); + if (s[0]) { + const STRLEN len = s[1] ? strlen(s) : 1; /* s = "\"" see yylex */ OP* const o = (OP*)newSVOP(OP_CONST, 0, newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0)); start_force(PL_curforce); @@ -2614,6 +2647,201 @@ S_sublex_done(pTHX) } } +PERL_STATIC_INLINE SV* +S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) +{ + /* points to first character of interior of \N{}, to one beyond the + * interior, hence to the "}". Finds what the name resolves to, returning + * an SV* containing it; NULL if no valid one found */ + + SV* res = newSVpvn_flags(s, e - s, UTF ? SVf_UTF8 : 0); + + HV * table; + SV **cvp; + SV *cv; + SV *rv; + HV *stash; + const U8* first_bad_char_loc; + const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */ + + PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME; + + if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr, + e - backslash_ptr, + &first_bad_char_loc)) + { + /* If warnings are on, this will print a more detailed analysis of what + * is wrong than the error message below */ + utf8n_to_uvuni(first_bad_char_loc, + e - ((char *) first_bad_char_loc), + NULL, 0); + + /* We deliberately don't try to print the malformed character, which + * might not print very well; it also may be just the first of many + * malformations, so don't print what comes after it */ + yyerror(Perl_form(aTHX_ + "Malformed UTF-8 character immediately after '%.*s'", + (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr)); + return NULL; + } + + res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr, + /* include the <}> */ + e - backslash_ptr + 1); + if (! SvPOK(res)) { + SvREFCNT_dec_NN(res); + return NULL; + } + + /* See if the charnames handler is the Perl core's, and if so, we can skip + * the validation needed for a user-supplied one, as Perl's does its own + * validation. */ + table = GvHV(PL_hintgv); /* ^H */ + cvp = hv_fetchs(table, "charnames", FALSE); + if (cvp && (cv = *cvp) && SvROK(cv) && ((rv = SvRV(cv)) != NULL) + && SvTYPE(rv) == SVt_PVCV && ((stash = CvSTASH(rv)) != NULL)) + { + const char * const name = HvNAME(stash); + if strEQ(name, "_charnames") { + return res; + } + } + + /* Here, it isn't Perl's charname handler. We can't rely on a + * user-supplied handler to validate the input name. For non-ut8 input, + * look to see that the first character is legal. Then loop through the + * rest checking that each is a continuation */ + + /* This code needs to be sync'ed with a regex in _charnames.pm which does + * the same thing */ + + if (! UTF) { + if (! isALPHAU(*s)) { + goto bad_charname; + } + s++; + while (s < e) { + if (! isCHARNAME_CONT(*s)) { + goto bad_charname; + } + if (*s == ' ' && *(s-1) == ' ' && ckWARN(WARN_DEPRECATED)) { + Perl_warn(aTHX_ "A sequence of multiple spaces in a charnames alias definition is deprecated"); + } + s++; + } + if (*(s-1) == ' ' && ckWARN(WARN_DEPRECATED)) { + Perl_warn(aTHX_ "Trailing white-space in a charnames alias definition is deprecated"); + } + } + else { + /* Similarly for utf8. For invariants can check directly; for other + * Latin1, can calculate their code point and check; otherwise use a + * swash */ + if (UTF8_IS_INVARIANT(*s)) { + if (! isALPHAU(*s)) { + goto bad_charname; + } + s++; + } else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1))))) { + goto bad_charname; + } + s += 2; + } + else { + if (! PL_utf8_charname_begin) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_charname_begin = _core_swash_init("utf8", + "_Perl_Charname_Begin", + &PL_sv_undef, + 1, 0, NULL, &flags); + } + if (! swash_fetch(PL_utf8_charname_begin, (U8 *) s, TRUE)) { + goto bad_charname; + } + s += UTF8SKIP(s); + } + + while (s < e) { + if (UTF8_IS_INVARIANT(*s)) { + if (! isCHARNAME_CONT(*s)) { + goto bad_charname; + } + if (*s == ' ' && *(s-1) == ' ' && ckWARN(WARN_DEPRECATED)) { + Perl_warn(aTHX_ "A sequence of multiple spaces in a charnames alias definition is deprecated"); + } + s++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + if (! isCHARNAME_CONT(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*s, + *(s+1))))) + { + goto bad_charname; + } + s += 2; + } + else { + if (! PL_utf8_charname_continue) { + U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; + PL_utf8_charname_continue = _core_swash_init("utf8", + "_Perl_Charname_Continue", + &PL_sv_undef, + 1, 0, NULL, &flags); + } + if (! swash_fetch(PL_utf8_charname_continue, (U8 *) s, TRUE)) { + goto bad_charname; + } + s += UTF8SKIP(s); + } + } + if (*(s-1) == ' ' && ckWARN(WARN_DEPRECATED)) { + Perl_warn(aTHX_ "Trailing white-space in a charnames alias definition is deprecated"); + } + } + + if (SvUTF8(res)) { /* Don't accept malformed input */ + const U8* first_bad_char_loc; + STRLEN len; + const char* const str = SvPV_const(res, len); + if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) { + /* If warnings are on, this will print a more detailed analysis of + * what is wrong than the error message below */ + utf8n_to_uvuni(first_bad_char_loc, + (char *) first_bad_char_loc - str, + NULL, 0); + + /* We deliberately don't try to print the malformed character, + * which might not print very well; it also may be just the first + * of many malformations, so don't print what comes after it */ + yyerror_pv( + Perl_form(aTHX_ + "Malformed UTF-8 returned by %.*s immediately after '%.*s'", + (int) (e - backslash_ptr + 1), backslash_ptr, + (int) ((char *) first_bad_char_loc - str), str + ), + SVf_UTF8); + return NULL; + } + } + + return res; + + bad_charname: { + int bad_char_size = ((UTF) ? UTF8SKIP(s) : 1); + + /* The final %.*s makes sure that should the trailing NUL be missing + * that this print won't run off the end of the string */ + yyerror_pv( + Perl_form(aTHX_ + "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s", + (int)(s - backslash_ptr + bad_char_size), backslash_ptr, + (int)(e - s + bad_char_size), s + bad_char_size + ), + UTF ? SVf_UTF8 : 0); + return NULL; + } +} + /* scan_const @@ -2635,7 +2863,8 @@ S_sublex_done(pTHX) In patterns: expand: - \N{ABC} => \N{U+41.42.43} + \N{FOO} => \N{U+hex_for_character_FOO} + (if FOO expands to multiple characters, expands to \N{U+xx.XX.yy ...}) pass through: all other \-char, including \N and \N{ apart from \N{ABC} @@ -2721,6 +2950,7 @@ S_scan_const(pTHX_ char *start) isn't utf8, as for example when it is entirely composed of hex constants */ + SV *res; /* result from charnames */ /* Note on sizing: The scanned constant is placed into sv, which is * initialized by newSV() assuming one byte of output for every byte of @@ -2734,7 +2964,8 @@ S_scan_const(pTHX_ char *start) * far, plus the length the current construct will occupy, plus room for * the trailing NUL, plus one byte for every input byte still unscanned */ - UV uv; + UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses + before set */ #ifdef EBCDIC UV literal_endpoint = 0; bool native_range = TRUE; /* turned to FALSE if the first endpoint is Unicode. */ @@ -2749,6 +2980,9 @@ S_scan_const(pTHX_ char *start) this_utf8 = PL_sublex_info.sub_op->op_private & (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); } + /* Protect sv from errors and fatal warnings. */ + ENTER_with_name("scan_const"); + SAVEFREESV(sv); while (s < send || dorange) { @@ -2768,7 +3002,7 @@ S_scan_const(pTHX_ char *start) #ifdef EBCDIC && !native_range #endif - ) { + ) { char * const c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) @@ -2820,7 +3054,6 @@ S_scan_const(pTHX_ char *start) #endif if (min > max) { - SvREFCNT_dec(sv); Perl_croak(aTHX_ "Invalid range \"%c-%c\" in transliteration operator", (char)min, (char)max); @@ -2879,7 +3112,6 @@ S_scan_const(pTHX_ char *start) /* range begins (ignore - as first or last char) */ else if (*s == '-' && s+1 < send && s != start) { if (didrange) { - SvREFCNT_dec(sv); Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); } if (has_utf8 @@ -2955,7 +3187,7 @@ S_scan_const(pTHX_ char *start) (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-) */ else if (*s == '@' && s[1]) { - if (isALNUM_lazy_if(s+1,UTF)) + if (isWORDCHAR_lazy_if(s+1,UTF)) break; if (strchr(":'{$", s[1])) break; @@ -3017,7 +3249,7 @@ S_scan_const(pTHX_ char *start) else if (PL_lex_inpat && (*s != 'N' || s[1] != '{' - || regcurly(s + 1))) + || regcurly(s + 1, FALSE))) { *d++ = NATIVE_TO_NEED(has_utf8,'\\'); goto default_action; @@ -3034,7 +3266,7 @@ S_scan_const(pTHX_ char *start) /* FALL THROUGH */ default: { - if ((isALNUMC(*s))) + if ((isALPHANUMERIC(*s))) Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Unrecognized escape \\%c passed through", *s); @@ -3046,21 +3278,30 @@ S_scan_const(pTHX_ char *start) case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': { - I32 flags = 0; + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN len = 3; uv = NATIVE_TO_UNI(grok_oct(s, &len, &flags, NULL)); s += len; + if (len < 3 && s < send && isDIGIT(*s) + && ckWARN(WARN_MISC)) + { + Perl_warner(aTHX_ packWARN(WARN_MISC), + "%s", form_short_octal_warning(s, len)); + } } goto NUM_ESCAPE_INSERT; /* eg. \o{24} indicates the octal constant \024 */ case 'o': { - STRLEN len; const char* error; - bool valid = grok_bslash_o(s, &uv, &len, &error, 1); - s += len; + bool valid = grok_bslash_o(&s, &uv, &error, + TRUE, /* Output warning */ + FALSE, /* Not strict */ + TRUE, /* Output warnings for + non-portables */ + UTF); if (! valid) { yyerror(error); continue; @@ -3071,11 +3312,14 @@ S_scan_const(pTHX_ char *start) /* eg. \x24 indicates the hex constant 0x24 */ case 'x': { - STRLEN len; const char* error; - bool valid = grok_bslash_x(s, &uv, &len, &error, 1); - s += len; + bool valid = grok_bslash_x(&s, &uv, &error, + TRUE, /* Output warning */ + FALSE, /* Not strict */ + TRUE, /* Output warnings for + non-portables */ + UTF); if (! valid) { yyerror(error); continue; @@ -3184,31 +3428,6 @@ S_scan_const(pTHX_ char *start) /* Here it looks like a named character */ - if (PL_lex_inpat) { - - /* XXX This block is temporary code. \N{} implies that the - * pattern is to have Unicode semantics, and therefore - * currently has to be encoded in utf8. By putting it in - * utf8 now, we save a whole pass in the regular expression - * compiler. Once that code is changed so Unicode - * semantics doesn't necessarily have to be in utf8, this - * block should be removed. However, the code that parses - * the output of this would have to be changed to not - * necessarily expect utf8 */ - if (!has_utf8) { - SvCUR_set(sv, d - SvPVX_const(sv)); - SvPOK_on(sv); - *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - /* 5 = '\N{' + cur char + NUL */ - (STRLEN)(send - s) + 5); - d = SvPVX(sv) + SvCUR(sv); - has_utf8 = TRUE; - } - } - if (*s == 'U' && s[1] == '+') { /* \N{U+...} */ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX; @@ -3270,32 +3489,12 @@ S_scan_const(pTHX_ char *start) else d = (char*)uvuni_to_utf8((U8*)d, uv); } } - else { /* Here is \N{NAME} but not \N{U+...}. */ - - SV *res; /* result from charnames */ - const char *str; /* the string in 'res' */ - STRLEN len; /* its length */ - - /* Get the value for NAME */ - res = newSVpvn(s, e - s); - res = new_constant( NULL, 0, "charnames", - /* includes all of: \N{...} */ - res, NULL, s - 3, e - s + 4 ); - - /* Most likely res will be in utf8 already since the - * standard charnames uses pack U, but a custom translator - * can leave it otherwise, so make sure. XXX This can be - * revisited to not have charnames use utf8 for characters - * that don't need it when regexes don't have to be in utf8 - * for Unicode semantics. If doing so, remember EBCDIC */ - sv_utf8_upgrade(res); - str = SvPV_const(res, len); - - /* Don't accept malformed input */ - if (! is_utf8_string((U8 *) str, len)) { - yyerror("Malformed UTF-8 returned by \\N"); - } - else if (PL_lex_inpat) { + else /* Here is \N{NAME} but not \N{U+...}. */ + if ((res = get_and_check_backslash_N_name(s, e))) + { + STRLEN len; + const char *str = SvPV_const(res, len); + if (PL_lex_inpat) { if (! len) { /* The name resolved to an empty string */ Copy("\\N{}", d, 4, char); @@ -3309,73 +3508,88 @@ S_scan_const(pTHX_ char *start) * returned by charnames */ const char *str_end = str + len; - STRLEN char_length; /* cur char's byte length */ - STRLEN output_length; /* and the number of bytes - after this is translated - into hex digits */ const STRLEN off = d - SvPVX_const(sv); - /* 2 hex per byte; 2 chars for '\N'; 2 chars for - * max('U+', '.'); and 1 for NUL */ - char hex_string[2 * UTF8_MAXBYTES + 5]; - - /* Get the first character of the result. */ - U32 uv = utf8n_to_uvuni((U8 *) str, - len, - &char_length, - UTF8_ALLOW_ANYUV); - - /* The call to is_utf8_string() above hopefully - * guarantees that there won't be an error. But - * it's easy here to make sure. The function just - * above warns and returns 0 if invalid utf8, but - * it can also return 0 if the input is validly a - * NUL. Disambiguate */ - if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { - uv = UNICODE_REPLACEMENT; - } - - /* Convert first code point to hex, including the - * boiler plate before it. For all these, we - * convert to native format so that downstream code - * can continue to assume the input is native */ - output_length = - my_snprintf(hex_string, sizeof(hex_string), - "\\N{U+%X", - (unsigned int) UNI_TO_NATIVE(uv)); - - /* Make sure there is enough space to hold it */ - d = off + SvGROW(sv, off - + output_length - + (STRLEN)(send - e) - + 2); /* '}' + NUL */ - /* And output it */ - Copy(hex_string, d, output_length, char); - d += output_length; - - /* For each subsequent character, append dot and - * its ordinal in hex */ - while ((str += char_length) < str_end) { - const STRLEN off = d - SvPVX_const(sv); - U32 uv = utf8n_to_uvuni((U8 *) str, - str_end - str, - &char_length, - UTF8_ALLOW_ANYUV); - if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { - uv = UNICODE_REPLACEMENT; - } - - output_length = - my_snprintf(hex_string, sizeof(hex_string), - ".%X", - (unsigned int) UNI_TO_NATIVE(uv)); - - d = off + SvGROW(sv, off - + output_length - + (STRLEN)(send - e) - + 2); /* '}' + NUL */ - Copy(hex_string, d, output_length, char); - d += output_length; + if (! SvUTF8(res)) { + /* For the non-UTF-8 case, we can determine the + * exact length needed without having to parse + * through the string. Each character takes up + * 2 hex digits plus either a trailing dot or + * the "}" */ + d = off + SvGROW(sv, off + + 3 * len + + 6 /* For the "\N{U+", and + trailing NUL */ + + (STRLEN)(send - e)); + Copy("\\N{U+", d, 5, char); + d += 5; + while (str < str_end) { + char hex_string[4]; + my_snprintf(hex_string, sizeof(hex_string), + "%02X.", (U8) *str); + Copy(hex_string, d, 3, char); + d += 3; + str++; + } + d--; /* We will overwrite below the final + dot with a right brace */ + } + else { + STRLEN char_length; /* cur char's byte length */ + + /* and the number of bytes after this is + * translated into hex digits */ + STRLEN output_length; + + /* 2 hex per byte; 2 chars for '\N'; 2 chars + * for max('U+', '.'); and 1 for NUL */ + char hex_string[2 * UTF8_MAXBYTES + 5]; + + /* Get the first character of the result. */ + U32 uv = utf8n_to_uvuni((U8 *) str, + len, + &char_length, + UTF8_ALLOW_ANYUV); + /* Convert first code point to hex, including + * the boiler plate before it. For all these, + * we convert to native format so that + * downstream code can continue to assume the + * input is native */ + output_length = + my_snprintf(hex_string, sizeof(hex_string), + "\\N{U+%X", + (unsigned int) UNI_TO_NATIVE(uv)); + + /* Make sure there is enough space to hold it */ + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + /* And output it */ + Copy(hex_string, d, output_length, char); + d += output_length; + + /* For each subsequent character, append dot and + * its ordinal in hex */ + while ((str += char_length) < str_end) { + const STRLEN off = d - SvPVX_const(sv); + U32 uv = utf8n_to_uvuni((U8 *) str, + str_end - str, + &char_length, + UTF8_ALLOW_ANYUV); + output_length = + my_snprintf(hex_string, + sizeof(hex_string), + ".%X", + (unsigned int) UNI_TO_NATIVE(uv)); + + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + Copy(hex_string, d, output_length, char); + d += output_length; + } } *d++ = '}'; /* Done. Add the trailing brace */ @@ -3408,68 +3622,9 @@ S_scan_const(pTHX_ char *start) Copy(str, d, len, char); d += len; } + SvREFCNT_dec(res); - /* Deprecate non-approved name syntax */ - if (ckWARN_d(WARN_DEPRECATED)) { - bool problematic = FALSE; - char* i = s; - - /* For non-ut8 input, look to see that the first - * character is an alpha, then loop through the rest - * checking that each is a continuation */ - if (! this_utf8) { - if (! isALPHAU(*i)) problematic = TRUE; - else for (i = s + 1; i < e; i++) { - if (isCHARNAME_CONT(*i)) continue; - problematic = TRUE; - break; - } - } - else { - /* Similarly for utf8. For invariants can check - * directly. We accept anything above the latin1 - * range because it is immaterial to Perl if it is - * correct or not, and is expensive to check. But - * it is fairly easy in the latin1 range to convert - * the variants into a single character and check - * those */ - if (UTF8_IS_INVARIANT(*i)) { - if (! isALPHAU(*i)) problematic = TRUE; - } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) { - if (! isALPHAU(UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(*i, - *(i+1))))) - { - problematic = TRUE; - } - } - if (! problematic) for (i = s + UTF8SKIP(s); - i < e; - i+= UTF8SKIP(i)) - { - if (UTF8_IS_INVARIANT(*i)) { - if (isCHARNAME_CONT(*i)) continue; - } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) { - continue; - } else if (isCHARNAME_CONT( - UNI_TO_NATIVE( - TWO_BYTE_UTF8_TO_UNI(*i, *(i+1))))) - { - continue; - } - problematic = TRUE; - break; - } - } - if (problematic) { - /* The e-i passed to the final %.*s makes sure that - * should the trailing NUL be missing that this - * print won't run off the end of the string */ - Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), - "Deprecated character in \\N{...}; marked by <-- HERE in \\N{%.*s<-- HERE %.*s", - (int)(i - s + 1), s, (int)(e - i), i + 1); - } - } } /* End \N{NAME} */ #ifdef EBCDIC if (!dorange) @@ -3595,6 +3750,7 @@ S_scan_const(pTHX_ char *start) /* return the substring (via pl_yylval) only if we parsed anything */ if (s > PL_bufptr) { + SvREFCNT_inc_simple_void_NN(sv); if ( PL_hints & ( PL_lex_inpat ? HINT_NEW_RE : HINT_NEW_STRING ) ) { const char *const key = PL_lex_inpat ? "qr" : "q"; const STRLEN keylen = PL_lex_inpat ? 2 : 1; @@ -3619,8 +3775,8 @@ S_scan_const(pTHX_ char *start) type, typelen); } pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv); - } else - SvREFCNT_dec(sv); + } + LEAVE_with_name("scan_const"); return s; } @@ -3646,7 +3802,7 @@ S_scan_const(pTHX_ char *start) /* This is the one truly awful dwimmer necessary to conflate C and sed. */ STATIC int -S_intuit_more(pTHX_ register char *s) +S_intuit_more(pTHX_ char *s) { dVAR; @@ -3663,7 +3819,7 @@ S_intuit_more(pTHX_ register char *s) /* In a pattern, so maybe we have {n,m}. */ if (*s == '{') { - if (regcurly(s)) { + if (regcurly(s, FALSE)) { return FALSE; } return TRUE; @@ -3676,16 +3832,16 @@ S_intuit_more(pTHX_ register char *s) return FALSE; else { /* this is terrifying, and it works */ - int weight = 2; /* let's weigh the evidence */ + int weight; char seen[256]; - unsigned char un_char = 255, last_un_char; const char * const send = strchr(s,']'); + unsigned char un_char, last_un_char; char tmpbuf[sizeof PL_tokenbuf * 4]; if (!send) /* has to be an expression */ return TRUE; + weight = 2; /* let's weigh the evidence */ - Zero(seen,256,char); if (*s == '$') weight -= 3; else if (isDIGIT(*s)) { @@ -3696,6 +3852,8 @@ S_intuit_more(pTHX_ register char *s) else weight -= 100; } + Zero(seen,256,char); + un_char = 255; for (; s < send; s++) { last_un_char = un_char; un_char = (unsigned char)*s; @@ -3704,7 +3862,7 @@ S_intuit_more(pTHX_ register char *s) case '&': case '$': weight -= seen[un_char] * 10; - if (isALNUM_lazy_if(s+1,UTF)) { + if (isWORDCHAR_lazy_if(s+1,UTF)) { int len; scan_ident(s, send, tmpbuf, sizeof tmpbuf, FALSE); len = (int)strlen(tmpbuf); @@ -3751,7 +3909,7 @@ S_intuit_more(pTHX_ register char *s) weight -= 5; /* cope with negative subscript */ break; default: - if (!isALNUM(last_un_char) + if (!isWORDCHAR(last_un_char) && !(last_un_char == '$' || last_un_char == '@' || last_un_char == '&') && isALPHA(*s) && s[1] && isALPHA(s[1])) { @@ -4095,7 +4253,7 @@ Perl_filter_read(pTHX_ int idx, SV *buf_sv, int maxlen) } STATIC char * -S_filter_gets(pTHX_ register SV *sv, STRLEN append) +S_filter_gets(pTHX_ SV *sv, STRLEN append) { dVAR; @@ -4276,7 +4434,6 @@ Perl_madlex(pTHX) case FUNC0SUB: case UNIOPSUB: case LSTOPSUB: - case LABEL: if (pl_yylval.opval) append_madprops(PL_thismad, pl_yylval.opval, 0); PL_thismad = 0; @@ -4291,6 +4448,10 @@ Perl_madlex(pTHX) } break; + /* pval */ + case LABEL: + break; + case ']': case '}': if (PL_faketokens) @@ -4406,21 +4567,40 @@ S_word_takes_any_delimeter(char *p, STRLEN len) stitching them into a tree. Returns: - PRIVATEREF + The type of the next token Structure: - if read an identifier - if we're in a my declaration - croak if they tried to say my($foo::bar) - build the ops for a my() declaration - if it's an access to a my() variable - are we in a sort block? - croak if my($a); $a <=> $b - build ops for access to a my() variable - if in a dq string, and they've said @foo and we can't find @foo - croak - build ops for a bareword - if we already built the token before, use it. + Switch based on the current state: + - if we already built the token before, use it + - if we have a case modifier in a string, deal with that + - handle other cases of interpolation inside a string + - scan the next line if we are inside a format + In the normal state switch on the next character: + - default: + if alphabetic, go to key lookup + unrecoginized character - croak + - 0/4/26: handle end-of-line or EOF + - cases for whitespace + - \n and #: handle comments and line numbers + - various operators, brackets and sigils + - numbers + - quotes + - 'v': vstrings (or go to key lookup) + - 'x' repetition operator (or go to key lookup) + - other ASCII alphanumerics (key lookup begins here): + word before => ? + keyword plugin + scan built-in keyword (but do nothing with it yet) + check for statement label + check for lexical subs + goto just_a_word if there is one + see whether built-in keyword is overridden + switch on keyword number: + - default: just_a_word: + not a built-in keyword; handle bareword lookup + disambiguate between method and sub call + fall back to bareword + - cases for built-in keywords */ @@ -4513,8 +4693,6 @@ Perl_yylex(pTHX) PL_lex_allbrackets--; next_type &= 0xffff; } - if (S_is_opval_token(next_type) && pl_yylval.opval) - pl_yylval.opval->op_savefree = 0; /* release */ return REPORT(next_type == 'p' ? pending_ident() : next_type); } @@ -4556,9 +4734,11 @@ Perl_yylex(pTHX) #ifdef PERL_MAD while (PL_bufptr != PL_bufend && PL_bufptr[0] == '\\' && PL_bufptr[1] == 'E') { - if (!PL_thiswhite) + if (PL_madskills) { + if (!PL_thiswhite) PL_thiswhite = newSVpvs(""); - sv_catpvn(PL_thiswhite, PL_bufptr, 2); + sv_catpvn(PL_thiswhite, PL_bufptr, 2); + } PL_bufptr += 2; } #else @@ -4574,9 +4754,11 @@ Perl_yylex(pTHX) s = PL_bufptr + 1; if (s[1] == '\\' && s[2] == 'E') { #ifdef PERL_MAD - if (!PL_thiswhite) + if (PL_madskills) { + if (!PL_thiswhite) PL_thiswhite = newSVpvs(""); - sv_catpvn(PL_thiswhite, PL_bufptr, 4); + sv_catpvn(PL_thiswhite, PL_bufptr, 4); + } #endif PL_bufptr = s + 3; PL_lex_state = LEX_INTERPCONCAT; @@ -4843,7 +5025,7 @@ Perl_yylex(pTHX) #endif switch (*s) { default: - if (isIDFIRST_lazy_if(s,UTF)) + if (UTF ? isIDFIRST_utf8((U8*)s) : isALNUMC(*s)) goto keylookup; { SV *dsv = newSVpvs_flags("", SVs_TEMP); @@ -5234,9 +5416,11 @@ Perl_yylex(pTHX) case ' ': case '\t': case '\f': case 013: #ifdef PERL_MAD PL_realtokenstart = -1; - if (!PL_thiswhite) + if (PL_madskills) { + if (!PL_thiswhite) PL_thiswhite = newSVpvs(""); - sv_catpvn(PL_thiswhite, s, 1); + sv_catpvn(PL_thiswhite, s, 1); + } #endif s++; goto retry; @@ -5329,7 +5513,7 @@ Perl_yylex(pTHX) } goto retry; case '-': - if (s[1] && isALPHA(s[1]) && !isALNUM(s[2])) { + if (s[1] && isALPHA(s[1]) && !isWORDCHAR(s[2])) { I32 ftst = 0; char tmp; @@ -5589,7 +5773,7 @@ Perl_yylex(pTHX) } sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0); if (*d == '(') { - d = scan_str(d,TRUE,TRUE,FALSE); + d = scan_str(d,TRUE,TRUE,FALSE, FALSE); if (!d) { /* MUST advance bufptr here to avoid bogus "at end of line" context messages from yyerror(). @@ -5834,9 +6018,9 @@ Perl_yylex(pTHX) } else if (*s == 'q') { if (++t < PL_bufend - && (!isALNUM(*t) + && (!isWORDCHAR(*t) || ((*t == 'q' || *t == 'x') && ++t < PL_bufend - && !isALNUM(*t)))) + && !isWORDCHAR(*t)))) { /* skip q//-like construct */ const char *tmps; @@ -5875,12 +6059,12 @@ Perl_yylex(pTHX) } else /* skip plain q word */ - while (t < PL_bufend && isALNUM_lazy_if(t,UTF)) + while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF)) t += UTF8SKIP(t); } - else if (isALNUM_lazy_if(t,UTF)) { + else if (isWORDCHAR_lazy_if(t,UTF)) { t += UTF8SKIP(t); - while (t < PL_bufend && isALNUM_lazy_if(t,UTF)) + while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF)) t += UTF8SKIP(t); } while (t < PL_bufend && isSPACE(*t)) @@ -5950,7 +6134,7 @@ Perl_yylex(pTHX) force_next(formbrack ? '.' : '}'); if (formbrack) LEAVE; #ifdef PERL_MAD - if (!PL_thistoken) + if (PL_madskills && !PL_thistoken) PL_thistoken = newSVpvs(""); #endif if (formbrack == 2) { /* means . where arguments were expected */ @@ -6117,8 +6301,8 @@ Perl_yylex(pTHX) if (*t == '/' || *t == '?' || ((*t == 'm' || *t == 's' || *t == 'y') - && !isALNUM(t[1])) || - (*t == 't' && t[1] == 'r' && !isALNUM(t[2]))) + && !isWORDCHAR(t[1])) || + (*t == 't' && t[1] == 'r' && !isWORDCHAR(t[2]))) Perl_warner(aTHX_ packWARN(WARN_SYNTAX), "!=~ should be !~"); } @@ -6255,7 +6439,7 @@ Perl_yylex(pTHX) if (ckWARN(WARN_SYNTAX)) { char *t = s+1; - while (isSPACE(*t) || isALNUM_lazy_if(t,UTF) || *t == '$') + while (isSPACE(*t) || isWORDCHAR_lazy_if(t,UTF) || *t == '$') t++; if (*t++ == ',') { PL_bufptr = PEEKSPACE(PL_bufptr); /* XXX can realloc */ @@ -6366,7 +6550,7 @@ Perl_yylex(pTHX) if (*s == '[' || *s == '{') { if (ckWARN(WARN_SYNTAX)) { const char *t = s + 1; - while (*t && (isALNUM_lazy_if(t,UTF) || strchr(" \t$#+-'\"", *t))) + while (*t && (isWORDCHAR_lazy_if(t,UTF) || strchr(" \t$#+-'\"", *t))) t += UTF ? UTF8SKIP(t) : 1; if (*t == '}' || *t == ']') { t++; @@ -6434,7 +6618,7 @@ Perl_yylex(pTHX) if (PL_oldoldbufptr == PL_last_uni && (*PL_last_uni != 's' || s - PL_last_uni < 5 || memNE(PL_last_uni, "study", 5) - || isALNUM_lazy_if(PL_last_uni+5,UTF) + || isWORDCHAR_lazy_if(PL_last_uni+5,UTF) )) check_uni(); if (*s == '?') @@ -6494,7 +6678,7 @@ Perl_yylex(pTHX) TERM(THING); case '\'': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6509,7 +6693,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '"': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw string before %s\n", s); } ); if (PL_expect == XOPERATOR) { if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { @@ -6532,7 +6716,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '`': - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); DEBUG_T( { printbuf("### Saw backtick string before %s\n", s); } ); if (PL_expect == XOPERATOR) no_op("Backticks",s); @@ -6691,9 +6875,9 @@ Perl_yylex(pTHX) if (!anydelim && PL_expect == XSTATE && d < PL_bufend && *d == ':' && *(d + 1) != ':') { s = d + 1; - pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, - newSVpvn_flags(PL_tokenbuf, - len, UTF ? SVf_UTF8 : 0)); + pl_yylval.pval = savepvn(PL_tokenbuf, len+1); + pl_yylval.pval[len] = '\0'; + pl_yylval.pval[len+1] = UTF ? 1 : 0; CLINE; TOKEN(LABEL); } @@ -6716,6 +6900,11 @@ Perl_yylex(pTHX) gv = gv_fetchsv(sv, GV_NOADD_NOINIT | SvUTF8(sv), SVt_PVCV); off = 0; + if (!gv) { + sv_free(sv); + sv = NULL; + goto just_a_word; + } } else { rv2cv_op = newOP(OP_PADANY, 0); @@ -7866,6 +8055,9 @@ Perl_yylex(pTHX) "Experimental \"%s\" subs not enabled", tmp == KEY_my ? "my" : tmp == KEY_state ? "state" : "our"); + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__LEXICAL_SUBS), + "The lexical_subs feature is experimental"); goto really_sub; } PL_in_my_stash = find_in_my_stash(PL_tokenbuf, len); @@ -7913,15 +8105,9 @@ Perl_yylex(pTHX) case KEY_open: s = SKIPSPACE1(s); if (isIDFIRST_lazy_if(s,UTF)) { - const char *t; - for (d = s; isALNUM_lazy_if(d,UTF);) { - d += UTF ? UTF8SKIP(d) : 1; - if (UTF) { - while (UTF8_IS_CONTINUED(*d) && is_utf8_mark((U8*)d)) { - d += UTF ? UTF8SKIP(d) : 1; - } - } - } + const char *t; + d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, + &len); for (t=d; isSPACE(*t);) t++; if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE) @@ -7988,7 +8174,7 @@ Perl_yylex(pTHX) LOP(OP_PIPE_OP,XTERM); case KEY_q: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_CONST; @@ -7999,7 +8185,7 @@ Perl_yylex(pTHX) case KEY_qw: { OP *words = NULL; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); PL_expect = XOPERATOR; @@ -8049,7 +8235,7 @@ Perl_yylex(pTHX) } case KEY_qq: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); pl_yylval.ival = OP_STRINGIFY; @@ -8062,7 +8248,7 @@ Perl_yylex(pTHX) TERM(sublex_start()); case KEY_qx: - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) missingterm(NULL); readpipe_override(); @@ -8288,7 +8474,9 @@ Perl_yylex(pTHX) SV *tmpwhite = 0; char *tstart = SvPVX(PL_linestr) + PL_realtokenstart; - SV *subtoken = newSVpvn_flags(tstart, s - tstart, SvUTF8(PL_linestr)); + SV *subtoken = PL_madskills + ? newSVpvn_flags(tstart, s - tstart, SvUTF8(PL_linestr)) + : NULL; PL_thistoken = 0; d = s; @@ -8381,7 +8569,7 @@ Perl_yylex(pTHX) const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO); STRLEN tmplen; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Prototype not terminated"); /* strip spaces and check for bad characters */ @@ -8660,6 +8848,26 @@ Perl_yylex(pTHX) #pragma segment Main #endif +/* + S_pending_ident + + Looks up an identifier in the pad or in a package + + Returns: + PRIVATEREF if this is a lexical name. + WORD if this belongs to a package. + + Structure: + if we're in a my declaration + croak if they tried to say my($foo::bar) + build the ops for a my() declaration + if it's an access to a my() variable + build ops for access to a my() variable + if in a dq string, and they've said @foo and we can't find @foo + warn + build ops for a bareword +*/ + static int S_pending_ident(pTHX) { @@ -8813,7 +9021,7 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) if (isIDFIRST_lazy_if(s,UTF)) { const char * const w = s; s += UTF ? UTF8SKIP(s) : 1; - while (isALNUM_lazy_if(s,UTF)) + while (isWORDCHAR_lazy_if(s,UTF)) s += UTF ? UTF8SKIP(s) : 1; while (s < PL_bufend && isSPACE(*s)) s++; @@ -8830,10 +9038,11 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) } } -/* Either returns sv, or mortalizes sv and returns a new SV*. +/* Either returns sv, or mortalizes/frees sv and returns a new SV*. Best used as sv=new_constant(..., sv, ...). If s, pv are NULL, calls subroutine with one argument, - and type is used with error messages only. */ + and is used with error messages only. + is assumed to be well formed UTF-8 */ STATIC SV * S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, @@ -8842,26 +9051,34 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, dVAR; dSP; HV * table = GvHV(PL_hintgv); /* ^H */ SV *res; + SV *errsv = NULL; SV **cvp; SV *cv, *typesv; const char *why1 = "", *why2 = "", *why3 = ""; PERL_ARGS_ASSERT_NEW_CONSTANT; + /* We assume that this is true: */ + if (*key == 'c') { assert (strEQ(key, "charnames")); } + assert(type || s); /* charnames doesn't work well if there have been errors found */ - if (PL_error_count > 0 && strEQ(key,"charnames")) + if (PL_error_count > 0 && *key == 'c') + { + SvREFCNT_dec_NN(sv); return &PL_sv_undef; + } + sv_2mortal(sv); /* Parent created it permanently */ if (!table || ! (PL_hints & HINT_LOCALIZE_HH) || ! (cvp = hv_fetch(table, key, keylen, FALSE)) || ! SvOK(*cvp)) { - SV *msg; + char *msg; /* Here haven't found what we're looking for. If it is charnames, * perhaps it needs to be loaded. Try doing that before giving up */ - if (strEQ(key,"charnames")) { + if (*key == 'c') { Perl_load_module(aTHX_ 0, newSVpvs("_charnames"), @@ -8883,23 +9100,32 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, } } if (!table || !(PL_hints & HINT_LOCALIZE_HH)) { - msg = Perl_newSVpvf(aTHX_ - "Constant(%s) unknown", (type ? type: "undef")); + msg = Perl_form(aTHX_ + "Constant(%.*s) unknown", + (int)(type ? typelen : len), + (type ? type: s)); } else { - why1 = "$^H{"; - why2 = key; - why3 = "} is not defined"; - report: - msg = Perl_newSVpvf(aTHX_ "Constant(%s): %s%s%s", - (type ? type: "undef"), why1, why2, why3); - } - yyerror(SvPVX_const(msg)); - SvREFCNT_dec(msg); - return sv; + why1 = "$^H{"; + why2 = key; + why3 = "} is not defined"; + report: + if (*key == 'c') { + msg = Perl_form(aTHX_ + /* The +3 is for '\N{'; -4 for that, plus '}' */ + "Unknown charname '%.*s'", (int)typelen - 4, type + 3 + ); + } + else { + msg = Perl_form(aTHX_ "Constant(%.*s): %s%s%s", + (int)(type ? typelen : len), + (type ? type: s), why1, why2, why3); + } + } + yyerror_pv(msg, UTF ? SVf_UTF8 : 0); + return SvREFCNT_inc_simple_NN(sv); } now_ok: - sv_2mortal(sv); /* Parent created it permanently */ cv = *cvp; if (!pv && s) pv = newSVpvn_flags(s, len, SVs_TEMP); @@ -8925,15 +9151,18 @@ now_ok: SPAGAIN ; /* Check the eval first */ - if (!PL_in_eval && SvTRUE(ERRSV)) { - sv_catpvs(ERRSV, "Propagated"); - yyerror(SvPV_nolen_const(ERRSV)); /* Duplicates the message inside eval */ + if (!PL_in_eval && ((errsv = ERRSV), SvTRUE_NN(errsv))) { + STRLEN errlen; + const char * errstr; + sv_catpvs(errsv, "Propagated"); + errstr = SvPV_const(errsv, errlen); + yyerror_pvn(errstr, errlen, 0); /* Duplicates the message inside eval */ (void)POPs; - res = SvREFCNT_inc_simple(sv); + res = SvREFCNT_inc_simple_NN(sv); } else { res = POPs; - SvREFCNT_inc_simple_void(res); + SvREFCNT_inc_simple_void_NN(res); } PUTBACK ; @@ -8946,66 +9175,89 @@ now_ok: why2 = key; why3 = "}} did not return a defined value"; sv = res; + (void)sv_2mortal(sv); goto report; } return res; } +PERL_STATIC_INLINE void +S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, bool is_utf8) { + dVAR; + PERL_ARGS_ASSERT_PARSE_IDENT; + + for (;;) { + if (*d >= e) + Perl_croak(aTHX_ "%s", ident_too_long); + if (is_utf8 && isIDFIRST_utf8((U8*)*s)) { + /* The UTF-8 case must come first, otherwise things + * like c\N{COMBINING TILDE} would start failing, as the + * isWORDCHAR_A case below would gobble the 'c' up. + */ + + char *t = *s + UTF8SKIP(*s); + while (isIDCONT_utf8((U8*)t)) + t += UTF8SKIP(t); + if (*d + (t - *s) > e) + Perl_croak(aTHX_ "%s", ident_too_long); + Copy(*s, *d, t - *s, char); + *d += t - *s; + *s = t; + } + else if ( isWORDCHAR_A(**s) ) { + do { + *(*d)++ = *(*s)++; + } while isWORDCHAR_A(**s); + } + else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) { + *(*d)++ = ':'; + *(*d)++ = ':'; + (*s)++; + } + else if (allow_package && **s == ':' && (*s)[1] == ':' + /* Disallow things like Foo::$bar. For the curious, this is + * the code path that triggers the "Bad name after" warning + * when looking for barewords. + */ + && (*s)[2] != '$') { + *(*d)++ = *(*s)++; + *(*d)++ = *(*s)++; + } + else + break; + } + return; +} + /* Returns a NUL terminated string, with the length of the string written to *slp */ STATIC char * -S_scan_word(pTHX_ register char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp) +S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN *slp) { dVAR; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_WORD; - for (;;) { - if (d >= e) - Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s) || (!UTF && isALNUMC_L1(*s))) /* UTF handled below */ - *d++ = *s++; - else if (allow_package && (*s == '\'') && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (allow_package && (s[0] == ':') && (s[1] == ':') && (s[2] != '$')) { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - size_t len; - while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - len = t - s; - if (d + len > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, len, char); - d += len; - s = t; - } - else { - *d = '\0'; - *slp = d - dest; - return s; - } - } + parse_ident(&s, &d, e, allow_package, is_utf8); + *d = '\0'; + *slp = d - dest; + return s; } STATIC char * -S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRLEN destlen, I32 ck_uni) +S_scan_ident(pTHX_ char *s, const char *send, char *dest, STRLEN destlen, I32 ck_uni) { dVAR; char *bracket = NULL; char funny = *s++; char *d = dest; char * const e = d + destlen - 3; /* two-character token, ending NUL */ + bool is_utf8 = cBOOL(UTF); PERL_ARGS_ASSERT_SCAN_IDENT; @@ -9014,38 +9266,12 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL if (isDIGIT(*s)) { while (isDIGIT(*s)) { if (d >= e) - Perl_croak(aTHX_ ident_too_long); + Perl_croak(aTHX_ "%s", ident_too_long); *d++ = *s++; } } else { - for (;;) { - if (d >= e) - Perl_croak(aTHX_ ident_too_long); - if (isALNUM(*s)) /* UTF handled below */ - *d++ = *s++; - else if (*s == '\'' && isIDFIRST_lazy_if(s+1,UTF)) { - *d++ = ':'; - *d++ = ':'; - s++; - } - else if (*s == ':' && s[1] == ':') { - *d++ = *s++; - *d++ = *s++; - } - else if (UTF && UTF8_IS_START(*s) && isALNUM_utf8((U8*)s)) { - char *t = s + UTF8SKIP(s); - while (UTF8_IS_CONTINUED(*t) && is_utf8_mark((U8*)t)) - t += UTF8SKIP(t); - if (d + (t - s) > e) - Perl_croak(aTHX_ ident_too_long); - Copy(s, d, t - s, char); - d += t - s; - s = t; - } - else - break; - } + parse_ident(&s, &d, e, 1, is_utf8); } *d = '\0'; d = dest; @@ -9055,7 +9281,11 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL return s; } if (*s == '$' && s[1] && - (isALNUM_lazy_if(s+1,UTF) || s[1] == '$' || s[1] == '{' || strnEQ(s+1,"::",2)) ) + (isIDFIRST_lazy_if(s+1,is_utf8) + || isDIGIT_A((U8)s[1]) + || s[1] == '$' + || s[1] == '{' + || strnEQ(s+1,"::",2)) ) { return s; } @@ -9063,8 +9293,15 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL bracket = s; s++; } - if (s < send) { - if (UTF) { + +#define VALID_LEN_ONE_IDENT(d, u) (isPUNCT_A((U8)*(d)) \ + || isCNTRL_A((U8)*(d)) \ + || isDIGIT_A((U8)*(d)) \ + || (!(u) && !UTF8_IS_INVARIANT((U8)*(d)))) + if (s < send + && (isIDFIRST_lazy_if(s, is_utf8) || VALID_LEN_ONE_IDENT(s, is_utf8))) + { + if (is_utf8) { const STRLEN skip = UTF8SKIP(s); STRLEN i; d[skip] = '\0'; @@ -9092,25 +9329,9 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL } } } - if (isIDFIRST_lazy_if(d,UTF)) { - d += UTF8SKIP(d); - if (UTF) { - char *end = s; - while ((end < send && isALNUM_lazy_if(end,UTF)) || *end == ':') { - end += UTF8SKIP(end); - while (end < send && UTF8_IS_CONTINUED(*end) && is_utf8_mark((U8*)end)) - end += UTF8SKIP(end); - } - Copy(s, d, end - s, char); - d += end - s; - s = end; - } - else { - while ((isALNUM(*s) || *s == ':') && d < e) - *d++ = *s++; - if (d >= e) - Perl_croak(aTHX_ ident_too_long); - } + if (isIDFIRST_lazy_if(d,is_utf8)) { + d += is_utf8 ? UTF8SKIP(d) : 1; + parse_ident(&s, &d, e, 1, is_utf8); *d = '\0'; while (s < send && SPACE_OR_TAB(*s)) s++; @@ -9132,15 +9353,15 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL } /* Handle extended ${^Foo} variables * 1999-02-27 mjd-perl-patch@plover.com */ - else if (!isALNUM(*d) && !isPRINT(*d) /* isCTRL(d) */ - && isALNUM(*s)) + else if (!isWORDCHAR(*d) && !isPRINT(*d) /* isCTRL(d) */ + && isWORDCHAR(*s)) { d++; - while (isALNUM(*s) && d < e) { + while (isWORDCHAR(*s) && d < e) { *d++ = *s++; } if (d >= e) - Perl_croak(aTHX_ ident_too_long); + Perl_croak(aTHX_ "%s", ident_too_long); *d = '\0'; } if (*s == '}') { @@ -9152,10 +9373,10 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL if (PL_lex_state == LEX_NORMAL) { if (ckWARN(WARN_AMBIGUOUS) && (keyword(dest, d - dest, 0) - || get_cvn_flags(dest, d - dest, UTF ? SVf_UTF8 : 0))) + || get_cvn_flags(dest, d - dest, is_utf8 ? SVf_UTF8 : 0))) { SV *tmp = newSVpvn_flags( dest, d - dest, - SVs_TEMP | (UTF ? SVf_UTF8 : 0) ); + SVs_TEMP | (is_utf8 ? SVf_UTF8 : 0) ); if (funny == '#') funny = '@'; Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), @@ -9191,7 +9412,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse STRLEN charlen = UTF ? UTF8SKIP(*s) : 1; if ( charlen != 1 || ! strchr(valid_flags, c) ) { - if (isALNUM_lazy_if(*s, UTF)) { + if (isWORDCHAR_lazy_if(*s, UTF)) { yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s), UTF ? SVf_UTF8 : 0); (*s) += charlen; @@ -9275,7 +9496,8 @@ S_scan_pat(pTHX_ char *start, I32 type) { dVAR; PMOP *pm; - char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing); + char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing, + TRUE /* look for escaped bracketed metas */ ); const char * const valid_flags = (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS); char charset = '\0'; /* character set modifier */ @@ -9378,7 +9600,8 @@ S_scan_subst(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, + TRUE /* look for escaped bracketed metas */ ); if (!s) Perl_croak(aTHX_ "Substitution pattern not terminated"); @@ -9396,7 +9619,7 @@ S_scan_subst(pTHX_ char *start) #endif first_start = PL_multi_start; - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -9482,7 +9705,7 @@ S_scan_trans(pTHX_ char *start) pl_yylval.ival = OP_NULL; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Transliteration pattern not terminated"); @@ -9498,7 +9721,7 @@ S_scan_trans(pTHX_ char *start) } #endif - s = scan_str(s,!!PL_madskills,FALSE,FALSE); + s = scan_str(s,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) { if (PL_lex_stuff) { SvREFCNT_dec(PL_lex_stuff); @@ -9583,7 +9806,7 @@ S_scan_trans(pTHX_ char *start) */ STATIC char * -S_scan_heredoc(pTHX_ register char *s) +S_scan_heredoc(pTHX_ char *s) { dVAR; I32 op_type = OP_SCALAR; @@ -9626,9 +9849,9 @@ S_scan_heredoc(pTHX_ register char *s) s++, term = '\''; else term = '"'; - if (!isALNUM_lazy_if(s,UTF)) + if (!isWORDCHAR_lazy_if(s,UTF)) deprecate("bare << to mean <<\"\""); - for (; isALNUM_lazy_if(s,UTF); s++) { + for (; isWORDCHAR_lazy_if(s,UTF); s++) { if (d < e) *d++ = *s; } @@ -9778,8 +10001,10 @@ S_scan_heredoc(pTHX_ register char *s) bufend - shared->re_eval_start); shared->re_eval_start -= s-d; } - if (CxTYPE(cx) == CXt_EVAL && CxOLD_OP_TYPE(cx) == OP_ENTEREVAL - && cx->blk_eval.cur_text == linestr) { + if (cxstack_ix >= 0 && CxTYPE(cx) == CXt_EVAL && + CxOLD_OP_TYPE(cx) == OP_ENTEREVAL && + cx->blk_eval.cur_text == linestr) + { cx->blk_eval.cur_text = newSVsv(linestr); SvSCREAM_on(cx->blk_eval.cur_text); } @@ -9934,7 +10159,7 @@ S_scan_inputsymbol(pTHX_ char *start) if (*d == '$' && d[1]) d++; /* allow or */ - while (*d && (isALNUM_lazy_if(d,UTF) || *d == '\'' || *d == ':')) + while (*d && (isWORDCHAR_lazy_if(d,UTF) || *d == '\'' || *d == ':')) d += UTF ? UTF8SKIP(d) : 1; /* If we've tried to read what we allow filehandles to look like, and @@ -9945,7 +10170,7 @@ S_scan_inputsymbol(pTHX_ char *start) if (d - PL_tokenbuf != len) { pl_yylval.ival = OP_GLOB; - s = scan_str(start,!!PL_madskills,FALSE,FALSE); + s = scan_str(start,!!PL_madskills,FALSE,FALSE, FALSE); if (!s) Perl_croak(aTHX_ "Glob not terminated"); return s; @@ -10087,20 +10312,25 @@ intro_sym: */ STATIC char * -S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) +S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse, + bool deprecate_escaped_meta /* Should we issue a deprecation warning + for certain paired metacharacters that + appear escaped within it */ + ) { dVAR; - SV *sv; /* scalar value: string */ - const char *tmps; /* temp string, used for delimiter matching */ + SV *sv; /* scalar value: string */ + const char *tmps; /* temp string, used for delimiter matching */ char *s = start; /* current position in the buffer */ char term; /* terminating character */ char *to; /* current position in the sv's data */ - I32 brackets = 1; /* bracket nesting level */ - bool has_utf8 = FALSE; /* is there any utf8 content? */ - I32 termcode; /* terminating char. code */ - U8 termstr[UTF8_MAXBYTES]; /* terminating string */ - STRLEN termlen; /* length of terminating string */ - int last_off = 0; /* last position for nesting bracket */ + I32 brackets = 1; /* bracket nesting level */ + bool has_utf8 = FALSE; /* is there any utf8 content? */ + I32 termcode; /* terminating char. code */ + U8 termstr[UTF8_MAXBYTES]; /* terminating string */ + STRLEN termlen; /* length of terminating string */ + int last_off = 0; /* last position for nesting bracket */ + char *escaped_open = NULL; #ifdef PERL_MAD int stuffstart; char *tstart; @@ -10147,6 +10377,18 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) PL_multi_close = term; + /* A warning is raised if the input parameter requires it for escaped (by a + * backslash) paired metacharacters {} [] and () when the delimiters are + * those same characters, and the backslash is ineffective. This doesn't + * happen for <>, as they aren't metas. */ + if (deprecate_escaped_meta + && (PL_multi_open == PL_multi_close + || ! ckWARN_d(WARN_DEPRECATED) + || PL_multi_open == '<')) + { + deprecate_escaped_meta = FALSE; + } + /* create a new SV to hold the contents. 79 is the SV's initial length. What a random number. */ sv = newSV_type(SVt_PVIV); @@ -10160,7 +10402,7 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) s += termlen; #ifdef PERL_MAD tstart = SvPVX(PL_linestr) + stuffstart; - if (!PL_thisopen && !keep_delims) { + if (PL_madskills && !PL_thisopen && !keep_delims) { PL_thisopen = newSVpvn(tstart, s - tstart); stuffstart = s - SvPVX(PL_linestr); } @@ -10285,7 +10527,44 @@ S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse) if (*s == '\\' && s+1 < PL_bufend) { if (!keep_quoted && ((s[1] == PL_multi_open) || (s[1] == PL_multi_close))) + { s++; + + /* Here, 'deprecate_escaped_meta' is true iff the + * delimiters are paired metacharacters, and 's' points + * to an occurrence of one of them within the string, + * which was preceded by a backslash. If this is a + * context where the delimiter is also a metacharacter, + * the backslash is useless, and deprecated. () and [] + * are meta in any context. {} are meta only when + * appearing in a quantifier or in things like '\p{'. + * They also aren't meta unless there is a matching + * closed, escaped char later on within the string. + * If 's' points to an open, set a flag; if to a close, + * test that flag, and raise a warning if it was set */ + + if (deprecate_escaped_meta) { + if (*s == PL_multi_open) { + if (*s != '{') { + escaped_open = s; + } + else if (regcurly(s, + TRUE /* Look for a closing + '\}' */) + || (s - start > 2 /* Look for e.g. + '\x{' */ + && _generic_isCC(*(s-2), _CC_BACKSLASH_FOO_LBRACE_IS_META))) + { + escaped_open = s; + } + } + else if (escaped_open) { + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), + "Useless use of '\\'; doesn't escape metacharacter '%c'", PL_multi_open); + escaped_open = NULL; + } + } + } else *to++ = *s++; } @@ -10441,7 +10720,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) SV *sv = NULL; /* place to put the converted number */ bool floatit; /* boolean: int or float? */ const char *lastub = NULL; /* position of last underbar */ - static char const number_too_long[] = "Number too long"; + static const char* const number_too_long = "Number too long"; PERL_ARGS_ASSERT_SCAN_NUM; @@ -10651,7 +10930,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) else { /* check for end of fixed-length buffer */ if (d >= e) - Perl_croak(aTHX_ number_too_long); + Perl_croak(aTHX_ "%s", number_too_long); /* if we're ok, copy the character */ *d++ = *s++; } @@ -10681,7 +10960,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) for (; isDIGIT(*s) || *s == '_'; s++) { /* fixed length buffer check */ if (d >= e) - Perl_croak(aTHX_ number_too_long); + Perl_croak(aTHX_ "%s", number_too_long); if (*s == '_') { if (lastub && s == lastub + 1) Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), @@ -10733,7 +11012,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) while (isDIGIT(*s) || *s == '_') { if (isDIGIT(*s)) { if (d >= e) - Perl_croak(aTHX_ number_too_long); + Perl_croak(aTHX_ "%s", number_too_long); *d++ = *s++; } else { @@ -10789,7 +11068,11 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) case 'v': vstring: sv = newSV(5); /* preallocate storage space */ + ENTER_with_name("scan_vstring"); + SAVEFREESV(sv); s = scan_vstring(s, PL_bufend, sv); + SvREFCNT_inc_simple_void_NN(sv); + LEAVE_with_name("scan_vstring"); break; } @@ -10804,7 +11087,7 @@ vstring: } STATIC char * -S_scan_formline(pTHX_ register char *s) +S_scan_formline(pTHX_ char *s) { dVAR; char *eol; @@ -10946,7 +11229,8 @@ Perl_start_subparse(pTHX_ I32 is_format, U32 flags) CvOUTSIDE(PL_compcv) = MUTABLE_CV(SvREFCNT_inc_simple(outsidecv)); CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax; if (outsidecv && CvPADLIST(outsidecv)) - CvPADLIST(PL_compcv)->xpadl_outid = CvPADLIST(outsidecv)->xpadl_id; + CvPADLIST(PL_compcv)->xpadl_outid = + PadlistNAMES(CvPADLIST(outsidecv)); return oldsavestack_ix; } @@ -10990,7 +11274,6 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags) SV *msg; SV * const where_sv = newSVpvs_flags("", SVs_TEMP); int yychar = PL_parser->yychar; - U32 is_utf8 = flags & SVf_UTF8; PERL_ARGS_ASSERT_YYERROR_PVN; @@ -11051,7 +11334,7 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags) else Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255); } - msg = sv_2mortal(newSVpvn_flags(s, len, is_utf8)); + msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP); Perl_sv_catpvf(aTHX_ msg, " at %s line %"IVdf", ", OutCopFILE(PL_curcop), (IV)CopLINE(PL_curcop)); if (context) @@ -11072,9 +11355,10 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags) else qerror(msg); if (PL_error_count >= 10) { - if (PL_in_eval && SvCUR(ERRSV)) + SV * errsv; + if (PL_in_eval && ((errsv = ERRSV), SvCUR(errsv))) Perl_croak(aTHX_ "%"SVf"%s has too many errors.\n", - SVfARG(ERRSV), OutCopFILE(PL_curcop)); + SVfARG(errsv), OutCopFILE(PL_curcop)); else Perl_croak(aTHX_ "%s has too many errors.\n", OutCopFILE(PL_curcop)); @@ -11338,13 +11622,18 @@ vstring, as well as updating the passed in sv. Function must be called like - sv = newSV(5); + sv = sv_2mortal(newSV(5)); s = scan_vstring(s,e,sv); where s and e are the start and end of the string. The sv should already be large enough to store the vstring passed in, for performance reasons. +This function may croak if fatal warnings are enabled in the +calling scope, hence the sv_2mortal in the example (to prevent +a leak). Make sure to do SvREFCNT_inc afterwards if you use +sv_2mortal. + */ char * @@ -11710,11 +11999,10 @@ Perl_parse_label(pTHX_ U32 flags) if (PL_lex_state == LEX_KNOWNEXT) { PL_parser->yychar = yylex(); if (PL_parser->yychar == LABEL) { - SV *lsv; + char * const lpv = pl_yylval.pval; + STRLEN llen = strlen(lpv); PL_parser->yychar = YYEMPTY; - lsv = newSV_type(SVt_PV); - sv_copypv(lsv, cSVOPx(pl_yylval.opval)->op_sv); - return lsv; + return newSVpvn_flags(lpv, llen, lpv[llen+1] ? SVf_UTF8 : 0); } else { yyunlex(); goto no_label;