X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/16d1d8bdaf1e7c1c3ad11abc246622d1e5c3b26a..607733213512652a8c98b3055264d6baf5019eb5:/toke.c diff --git a/toke.c b/toke.c index 5a181c3..a91a4fc 100644 --- a/toke.c +++ b/toke.c @@ -413,12 +413,12 @@ S_tokereport(pTHX_ I32 rv, const YYSTYPE* lvalp) else if (!rv) sv_catpvs(report, "EOF"); else - Perl_sv_catpvf(aTHX_ report, "?? %"IVdf, (IV)rv); + Perl_sv_catpvf(aTHX_ report, "?? %" IVdf, (IV)rv); switch (type) { case TOKENTYPE_NONE: break; case TOKENTYPE_IVAL: - Perl_sv_catpvf(aTHX_ report, "(ival=%"IVdf")", (IV)lvalp->ival); + Perl_sv_catpvf(aTHX_ report, "(ival=%" IVdf ")", (IV)lvalp->ival); break; case TOKENTYPE_OPNUM: Perl_sv_catpvf(aTHX_ report, "(ival=op_%s)", @@ -464,13 +464,6 @@ S_printbuf(pTHX_ const char *const fmt, const char *const s) #endif -static int -S_deprecate_commaless_var_list(pTHX) { - PL_expect = XTERM; - deprecate("comma-less variable list"); - return REPORT(','); /* grandfather non-comma-format format */ -} - /* * S_ao * @@ -527,20 +520,26 @@ S_no_op(pTHX_ const char *const what, char *s) if (is_first) Perl_warner(aTHX_ packWARN(WARN_SYNTAX), "\t(Missing semicolon on previous line?)\n"); - else if (PL_oldoldbufptr && isIDFIRST_lazy_if(PL_oldoldbufptr,UTF)) { + else if (PL_oldoldbufptr && isIDFIRST_lazy_if_safe(PL_oldoldbufptr, + PL_bufend, + UTF)) + { const char *t; - for (t = PL_oldoldbufptr; (isWORDCHAR_lazy_if(t,UTF) || *t == ':'); - t += UTF ? UTF8SKIP(t) : 1) + for (t = PL_oldoldbufptr; + (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) || *t == ':'); + t += UTF ? UTF8SKIP(t) : 1) + { NOOP; + } if (t < PL_bufptr && isSPACE(*t)) Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "\t(Do you need to predeclare %"UTF8f"?)\n", + "\t(Do you need to predeclare %" UTF8f "?)\n", UTF8fARG(UTF, t - PL_oldoldbufptr, PL_oldoldbufptr)); } else { assert(s >= oldbp); Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "\t(Missing operator before %"UTF8f"?)\n", + "\t(Missing operator before %" UTF8f "?)\n", UTF8fARG(UTF, s - oldbp, oldbp)); } } @@ -590,7 +589,7 @@ S_missingterm(pTHX_ char *s) sv = sv_2mortal(newSVpv(s,0)); if (uni) SvUTF8_on(sv); - Perl_croak(aTHX_ "Can't find string terminator %c%"SVf + Perl_croak(aTHX_ "Can't find string terminator %c%" SVf "%c anywhere before EOF",q,SVfARG(sv),q); } @@ -663,7 +662,7 @@ S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen) Creates and initialises a new lexer/parser state object, supplying a context in which to lex and parse from a new source of Perl code. A pointer to the new state object is placed in L. An entry -is made on the save stack so that upon unwinding the new state object +is made on the save stack so that upon unwinding, the new state object will be destroyed and the former value of L will be restored. Nothing else need be done to clean up the parsing context. @@ -695,6 +694,7 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) { const char *s = NULL; yy_parser *parser, *oparser; + if (flags && flags & ~LEX_START_FLAGS) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_start"); @@ -705,8 +705,8 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) PL_parser = parser; parser->stack = NULL; + parser->stack_max1 = NULL; parser->ps = NULL; - parser->stack_size = 0; /* on scope exit, free this parser and restore any outer one */ SAVEPARSER(parser); @@ -720,6 +720,7 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) parser->lex_state = LEX_NORMAL; parser->expect = XSTATE; parser->rsfp = rsfp; + parser->recheck_utf8_validity = FALSE; parser->rsfp_filters = !(flags & LEX_START_SAME_FILTER) || !oparser ? NULL @@ -736,7 +737,22 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) if (line) { STRLEN len; + const U8* first_bad_char_loc; + s = SvPV_const(line, len); + + if ( SvUTF8(line) + && UNLIKELY(! is_utf8_string_loc((U8 *) s, + SvCUR(line), + &first_bad_char_loc))) + { + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) s + SvCUR(line), + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ + } + parser->linestr = flags & LEX_START_COPIED ? SvREFCNT_inc_simple_NN(line) : newSVpvn_flags(s, len, SvUTF8(line)); @@ -745,6 +761,7 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) } else { parser->linestr = newSVpvn("\n;", rsfp ? 1 : 2); } + parser->oldoldbufptr = parser->oldbufptr = parser->bufptr = @@ -923,10 +940,19 @@ Perl_lex_grow_linestr(pTHX_ STRLEN len) char *buf; STRLEN bufend_pos, bufptr_pos, oldbufptr_pos, oldoldbufptr_pos; STRLEN linestart_pos, last_uni_pos, last_lop_pos, re_eval_start_pos; + bool current; + linestr = PL_parser->linestr; buf = SvPVX(linestr); if (len <= SvLEN(linestr)) return buf; + + /* Is the lex_shared linestr SV the same as the current linestr SV? + * Only in this case does re_eval_start need adjusting, since it + * points within lex_shared->ls_linestr's buffer */ + current = ( !PL_parser->lex_shared->ls_linestr + || linestr == PL_parser->lex_shared->ls_linestr); + bufend_pos = PL_parser->bufend - buf; bufptr_pos = PL_parser->bufptr - buf; oldbufptr_pos = PL_parser->oldbufptr - buf; @@ -934,7 +960,7 @@ Perl_lex_grow_linestr(pTHX_ STRLEN len) linestart_pos = PL_parser->linestart - buf; last_uni_pos = PL_parser->last_uni ? PL_parser->last_uni - buf : 0; last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0; - re_eval_start_pos = PL_parser->lex_shared->re_eval_start ? + re_eval_start_pos = (current && PL_parser->lex_shared->re_eval_start) ? PL_parser->lex_shared->re_eval_start - buf : 0; buf = sv_grow(linestr, len); @@ -948,7 +974,7 @@ Perl_lex_grow_linestr(pTHX_ STRLEN len) PL_parser->last_uni = buf + last_uni_pos; if (PL_parser->last_lop) PL_parser->last_lop = buf + last_lop_pos; - if (PL_parser->lex_shared->re_eval_start) + if (current && PL_parser->lex_shared->re_eval_start) PL_parser->lex_shared->re_eval_start = buf + re_eval_start_pos; return buf; } @@ -1024,14 +1050,7 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) { p++; highhalf++; - } else if (! UTF8_IS_INVARIANT(c)) { - /* malformed UTF-8 */ - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)p, e-p, NULL, 0); - LEAVE; - } + } else assert(UTF8_IS_INVARIANT(c)); } if (!highhalf) goto plain_copy; @@ -1245,6 +1264,24 @@ Perl_lex_discard_to(pTHX_ char *ptr) PL_parser->last_lop -= discard_len; } +void +Perl_notify_parser_that_changed_to_utf8(pTHX) +{ + /* Called when $^H is changed to indicate that HINT_UTF8 has changed from + * off to on. At compile time, this has the effect of entering a 'use + * utf8' section. This means that any input was not previously checked for + * UTF-8 (because it was off), but now we do need to check it, or our + * assumptions about the input being sane could be wrong, and we could + * segfault. This routine just sets a flag so that the next time we look + * at the input we do the well-formed UTF-8 check. If we aren't in the + * proper phase, there may not be a parser object, but if there is, setting + * the flag is harmless */ + + if (PL_parser) { + PL_parser->recheck_utf8_validity = TRUE; + } +} + /* =for apidoc Amx|bool|lex_next_chunk|U32 flags @@ -1280,6 +1317,7 @@ Perl_lex_next_chunk(pTHX_ U32 flags) STRLEN linestart_pos, last_uni_pos, last_lop_pos; bool got_some_for_debugger = 0; bool got_some; + if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM)) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk"); if (!(flags & LEX_NO_TERM) && PL_lex_inwhat) @@ -1344,6 +1382,22 @@ Perl_lex_next_chunk(pTHX_ U32 flags) new_bufend_pos = SvCUR(linestr); PL_parser->bufend = buf + new_bufend_pos; PL_parser->bufptr = buf + bufptr_pos; + + if (UTF) { + const U8* first_bad_char_loc; + if (UNLIKELY(! is_utf8_string_loc( + (U8 *) PL_parser->bufptr, + PL_parser->bufend - PL_parser->bufptr, + &first_bad_char_loc))) + { + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_parser->bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ + } + } + PL_parser->oldbufptr = buf + oldbufptr_pos; PL_parser->oldoldbufptr = buf + oldoldbufptr_pos; PL_parser->linestart = buf + linestart_pos; @@ -1420,12 +1474,11 @@ Perl_lex_peek_unichar(pTHX_ U32 flags) } unichar = utf8n_to_uvchr((U8*)s, bufend-s, &retlen, UTF8_CHECK_ONLY); if (retlen == (STRLEN)-1) { - /* malformed UTF-8 */ - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)s, bufend-s, NULL, 0); - LEAVE; + _force_out_malformed_utf8_message((U8 *) s, + (U8 *) bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ } return unichar; } else { @@ -1570,7 +1623,7 @@ Note that C is a valid C and will always return C. */ bool -Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn) +Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn, bool curstash) { STRLEN len, origlen; char *p; @@ -1632,21 +1685,28 @@ Perl_validate_proto(pTHX_ SV *name, SV *proto, bool warn) origlen, UNI_DISPLAY_ISPRINT) : pv_pretty(tmpsv, p, origlen, 60, NULL, NULL, PERL_PV_ESCAPE_NONASCII); + if (curstash && !memchr(SvPVX(name), ':', SvCUR(name))) { + SV *name2 = sv_2mortal(newSVsv(PL_curstname)); + sv_catpvs(name2, "::"); + sv_catsv(name2, (SV *)name); + name = name2; + } + if (proto_after_greedy_proto) Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), - "Prototype after '%c' for %"SVf" : %s", + "Prototype after '%c' for %" SVf " : %s", greedy_proto, SVfARG(name), p); if (in_brackets) Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), - "Missing ']' in prototype for %"SVf" : %s", + "Missing ']' in prototype for %" SVf " : %s", SVfARG(name), p); if (bad_proto) Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), - "Illegal character in prototype for %"SVf" : %s", + "Illegal character in prototype for %" SVf " : %s", SVfARG(name), p); if (bad_proto_after_underscore) Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), - "Illegal character after '_' in prototype for %"SVf" : %s", + "Illegal character after '_' in prototype for %" SVf " : %s", SVfARG(name), p); } @@ -1866,13 +1926,13 @@ S_check_uni(pTHX) while (isSPACE(*PL_last_uni)) PL_last_uni++; s = PL_last_uni; - while (isWORDCHAR_lazy_if(s,UTF) || *s == '-') + while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) || *s == '-') s += UTF ? UTF8SKIP(s) : 1; if ((t = strchr(s, '(')) && t < PL_bufptr) return; Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS), - "Warning: Use of \"%"UTF8f"\" without parentheses is ambiguous", + "Warning: Use of \"%" UTF8f "\" without parentheses is ambiguous", UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni)); } @@ -2036,7 +2096,7 @@ S_force_word(pTHX_ char *start, int token, int check_keyword, int allow_pack) start = skipspace(start); s = start; - if (isIDFIRST_lazy_if(s,UTF) + if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || (allow_pack && *s == ':' && s[1] == ':') ) { s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, allow_pack, &len); @@ -2117,7 +2177,7 @@ Perl_str_to_version(pTHX_ SV *sv) STRLEN len; const char *start = SvPV_const(sv,len); const char * const end = start + len; - const bool utf = SvUTF8(sv) ? TRUE : FALSE; + const bool utf = cBOOL(SvUTF8(sv)); PERL_ARGS_ASSERT_STR_TO_VERSION; @@ -2223,10 +2283,9 @@ S_force_strict_version(pTHX_ char *s) /* * S_tokeq - * Tokenize a quoted string passed in as an SV. It finds the next - * chunk, up to end of string or a backslash. It may make a new - * SV containing that chunk (if HINT_NEW_STRING is on). It also - * turns \\ into \. + * Turns any \\ into \ in a quoted string passed in in 'sv', returning 'sv', + * modified as necessary. However, if HINT_NEW_STRING is on, 'sv' is + * unchanged, and a new SV containing the modified input is returned. */ STATIC SV * @@ -2429,7 +2488,7 @@ S_sublex_push(pTHX) if (is_heredoc) CopLINE_set(PL_curcop, (line_t)PL_multi_start); PL_copline = NOLINE; - + Newxz(shared, 1, LEXSHARED); shared->ls_prev = PL_parser->lex_shared; PL_parser->lex_shared = shared; @@ -2517,7 +2576,7 @@ S_sublex_done(pTHX) } } -PERL_STATIC_INLINE SV* +STATIC SV* S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) { /* points to first character of interior of \N{}, to one beyond the @@ -2531,35 +2590,14 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) SV *cv; SV *rv; HV *stash; - const U8* first_bad_char_loc; const char* backslash_ptr = s - 3; /* Points to the <\> of \N{... */ PERL_ARGS_ASSERT_GET_AND_CHECK_BACKSLASH_N_NAME; if (!SvCUR(res)) { - Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), - "Unknown charname '' is deprecated"); - return res; - } - - if (UTF && ! is_utf8_string_loc((U8 *) backslash_ptr, - e - backslash_ptr, - &first_bad_char_loc)) - { - /* If warnings are on, this will print a more detailed analysis of what - * is wrong than the error message below */ - utf8n_to_uvchr(first_bad_char_loc, - e - ((char *) first_bad_char_loc), - NULL, 0); - - /* We deliberately don't try to print the malformed character, which - * might not print very well; it also may be just the first of many - * malformations, so don't print what comes after it */ - yyerror_pv(Perl_form(aTHX_ - "Malformed UTF-8 character immediately after '%.*s'", - (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr), - SVf_UTF8); - return NULL; + /* diag_listed_as: Unknown charname '%s' */ + yyerror("Unknown charname ''"); + return NULL; } res = new_constant( NULL, 0, "charnames", res, NULL, backslash_ptr, @@ -2671,6 +2709,9 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } } if (*(s-1) == ' ') { + /* diag_listed_as: charnames alias definitions may not contain + trailing white-space; marked by <-- HERE in %s + */ yyerror_pv( Perl_form(aTHX_ "charnames alias definitions may not contain trailing " @@ -2686,16 +2727,15 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) const U8* first_bad_char_loc; STRLEN len; const char* const str = SvPV_const(res, len); - if (! is_utf8_string_loc((U8 *) str, len, &first_bad_char_loc)) { - /* If warnings are on, this will print a more detailed analysis of - * what is wrong than the error message below */ - utf8n_to_uvchr(first_bad_char_loc, - (char *) first_bad_char_loc - str, - NULL, 0); - - /* We deliberately don't try to print the malformed character, - * which might not print very well; it also may be just the first - * of many malformations, so don't print what comes after it */ + if (UNLIKELY(! is_utf8_string_loc((U8 *) str, len, + &first_bad_char_loc))) + { + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_parser->bufend, + 0, + 0 /* 0 means don't die */ ); + /* diag_listed_as: Malformed UTF-8 returned by \N{%s} + immediately after '%s' */ yyerror_pv( Perl_form(aTHX_ "Malformed UTF-8 returned by %.*s immediately after '%.*s'", @@ -2713,6 +2753,8 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) /* The final %.*s makes sure that should the trailing NUL be missing * that this print won't run off the end of the string */ + /* diag_listed_as: Invalid character in \N{...}; marked by <-- HERE + in \N{%s} */ yyerror_pv( Perl_form(aTHX_ "Invalid character in \\N{...}; marked by <-- HERE in %.*s<-- HERE %.*s", @@ -2724,6 +2766,9 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } multi_spaces: + /* diag_listed_as: charnames alias definitions may not contain a + sequence of multiple spaces; marked by <-- HERE + in %s */ yyerror_pv( Perl_form(aTHX_ "charnames alias definitions may not contain a sequence of " @@ -2769,15 +2814,17 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) In transliterations: characters are VERY literal, except for - not at the start or end - of the string, which indicates a range. If the range is in bytes, + of the string, which indicates a range. However some backslash sequences + are recognized: \r, \n, and the like + \007 \o{}, \x{}, \N{} + If all elements in the transliteration are below 256, scan_const expands the range to the full set of intermediate characters. If the range is in utf8, the hyphen is replaced with a certain range mark which will be handled by pmtrans() in op.c. In double-quoted strings: backslashes: - double-quoted style: \r and \n - constants: \x31, etc. + all those recognized in transliterations deprecated backrefs: \1 (in substitution replacements) case and quoting: \U \Q \E stops on @ and $ @@ -2820,7 +2867,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } (end if backslash) handle regular character } (end while character to read) - + */ STATIC char * @@ -2840,10 +2887,23 @@ S_scan_const(pTHX_ char *start) when the source isn't utf8, as for example when it is entirely composed of hex constants */ + STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the + number of characters found so far + that will expand (into 2 bytes) + should we have to convert to + UTF-8) */ SV *res; /* result from charnames */ STRLEN offset_to_max; /* The offset in the output to where the range high-end character is temporarily placed */ + /* Does something require special handling in tr/// ? This avoids extra + * work in a less likely case. As such, khw didn't feel it was worth + * adding any branches to the more mainline code to handle this, which + * means that this doesn't get set in some circumstances when things like + * \x{100} get expanded out. As a result there needs to be extra testing + * done in the tr code */ + bool has_above_latin1 = FALSE; + /* Note on sizing: The scanned constant is placed into sv, which is * initialized by newSV() assuming one byte of output for every byte of * input. This routine expects newSV() to allocate an extra byte for a @@ -2854,7 +2914,7 @@ S_scan_const(pTHX_ char *start) * the needed size, SvGROW() is called. Its size parameter each time is * based on the best guess estimate at the time, namely the length used so * far, plus the length the current construct will occupy, plus room for - * the trailing NUL, plus one byte for every input byte still unscanned */ + * the trailing NUL, plus one byte for every input byte still unscanned */ UV uv = UV_MAX; /* Initialize to weird value to try to catch any uses before set */ @@ -2888,30 +2948,29 @@ S_scan_const(pTHX_ char *start) * range, so for most cases we just drop down and handle the value * as any other. There are two exceptions. * - * 1. A minus sign indicates that we are actually going to have - * a range. In this case, skip the '-', set a flag, then drop + * 1. A hyphen indicates that we are actually going to have a + * range. In this case, skip the '-', set a flag, then drop * down to handle what should be the end range value. * 2. After we've handled that value, the next time through, that * flag is set and we fix up the range. * * Ranges entirely within Latin1 are expanded out entirely, in - * order to avoid the significant overhead of making a swash. - * Ranges that extend above Latin1 have to have a swash, so there - * is no advantage to abbreviating them here, so they are stored - * here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies - * a hyphen without any possible ambiguity. On EBCDIC machines, if - * the range is expressed as Unicode, the Latin1 portion is - * expanded out even if the entire range extends above Latin1. - * This is because each code point in it has to be processed here - * individually to get its native translation */ + * order to make the transliteration a simple table look-up. + * Ranges that extend above Latin1 have to be done differently, so + * there is no advantage to expanding them here, so they are + * stored here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte + * signifies a hyphen without any possible ambiguity. On EBCDIC + * machines, if the range is expressed as Unicode, the Latin1 + * portion is expanded out even if the range extends above + * Latin1. This is because each code point in it has to be + * processed here individually to get its native translation */ if (! dorange) { - /* Here, we don't think we're in a range. If we've processed - * at least one character, then see if this next one is a '-', - * indicating the previous one was the start of a range. But - * don't bother if we're too close to the end for the minus to - * mean that. */ + /* Here, we don't think we're in a range. If the new character + * is not a hyphen; or if it is a hyphen, but it's too close to + * either edge to indicate a range, then it's a regular + * character. */ if (*s != '-' || s >= send - 1 || s == start) { /* A regular character. Process like any other, but first @@ -2922,16 +2981,26 @@ S_scan_const(pTHX_ char *start) non_portable_endpoint = 0; backslash_N = 0; #endif + /* The tests here for being above Latin1 and similar ones + * in the following 'else' suffice to find all such + * occurences in the constant, except those added by a + * backslash escape sequence, like \x{100}. Mostly, those + * set 'has_above_latin1' as appropriate */ + if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { + has_above_latin1 = TRUE; + } + /* Drops down to generic code to process current byte */ } - else { + else { /* Is a '-' in the context where it means a range */ if (didrange) { /* Something like y/A-C-Z// */ - Perl_croak(aTHX_ "Ambiguous range in transliteration operator"); + Perl_croak(aTHX_ "Ambiguous range in transliteration" + " operator"); } dorange = TRUE; - s++; /* Skip past the minus */ + s++; /* Skip past the hyphen */ /* d now points to where the end-range character will be * placed. Save it so won't have to go finding it later, @@ -2941,6 +3010,12 @@ S_scan_const(pTHX_ char *start) * pointer). We'll finish processing the range the next * time through the loop */ offset_to_max = d - SvPVX_const(sv); + + if (this_utf8 && UTF8_IS_ABOVE_LATIN1(*s)) { + has_above_latin1 = TRUE; + } + + /* Drops down to generic code to process current byte */ } } /* End of not a range */ else { @@ -2952,26 +3027,33 @@ S_scan_const(pTHX_ char *start) * 'd' points to just beyond the range end in the 'sv' string, * where we would next place something * 'offset_to_max' is the offset in 'sv' at which the character - * before 'd' begins. + * (the range's maximum end point) before 'd' begins. */ - const char * max_ptr = SvPVX_const(sv) + offset_to_max; - const char * min_ptr; + char * max_ptr = SvPVX(sv) + offset_to_max; + char * min_ptr; IV range_min; IV range_max; /* last character in range */ - STRLEN save_offset; STRLEN grow; + Size_t offset_to_min = 0; + Size_t extras = 0; #ifdef EBCDIC bool convert_unicode; IV real_range_max = 0; #endif - - /* Get the range-ends code point values. */ + /* Get the code point values of the range ends. */ if (has_utf8) { /* We know the utf8 is valid, because we just constructed * it ourselves in previous loop iterations */ min_ptr = (char*) utf8_hop( (U8*) max_ptr, -1); range_min = valid_utf8_to_uvchr( (U8*) min_ptr, NULL); range_max = valid_utf8_to_uvchr( (U8*) max_ptr, NULL); + + /* This compensates for not all code setting + * 'has_above_latin1', so that we don't skip stuff that + * should be executed */ + if (range_max > 255) { + has_above_latin1 = TRUE; + } } else { min_ptr = max_ptr - 1; @@ -2979,23 +3061,40 @@ S_scan_const(pTHX_ char *start) range_max = * (U8*) max_ptr; } + /* If the range is just a single code point, like tr/a-a/.../, + * that code point is already in the output, twice. We can + * just back up over the second instance and avoid all the rest + * of the work. But if it is a variant character, it's been + * counted twice, so decrement. (This unlikely scenario is + * special cased, like the one for a range of 2 code points + * below, only because the main-line code below needs a range + * of 3 or more to work without special casing. Might as well + * get it out of the way now.) */ + if (UNLIKELY(range_max == range_min)) { + d = max_ptr; + if (! has_utf8 && ! UVCHR_IS_INVARIANT(range_max)) { + utf8_variant_count--; + } + goto range_done; + } + #ifdef EBCDIC /* On EBCDIC platforms, we may have to deal with portable * ranges. These happen if at least one range endpoint is a * Unicode value (\N{...}), or if the range is a subset of * [A-Z] or [a-z], and both ends are literal characters, * like 'A', and not like \x{C1} */ - if ((convert_unicode - = cBOOL(backslash_N) /* \N{} forces Unicode, hence - portable range */ - || ( ! non_portable_endpoint - && (( isLOWER_A(range_min) && isLOWER_A(range_max)) - || (isUPPER_A(range_min) && isUPPER_A(range_max)))) - )) { + convert_unicode = + cBOOL(backslash_N) /* \N{} forces Unicode, + hence portable range */ + || ( ! non_portable_endpoint + && (( isLOWER_A(range_min) && isLOWER_A(range_max)) + || (isUPPER_A(range_min) && isUPPER_A(range_max)))); + if (convert_unicode) { /* Special handling is needed for these portable ranges. - * They are defined to all be in Unicode terms, which - * include all Unicode code points between the end points. + * They are defined to be in Unicode terms, which includes + * all the Unicode code points between the end points. * Convert to Unicode to get the Unicode range. Later we * will convert each code point in the range back to * native. */ @@ -3013,7 +3112,6 @@ S_scan_const(pTHX_ char *start) range_max = UNI_TO_NATIVE(range_max); } #endif - /* Use the characters themselves for the error message if * ASCII printables; otherwise some visible representation * of them */ @@ -3024,32 +3122,41 @@ S_scan_const(pTHX_ char *start) } #ifdef EBCDIC else if (convert_unicode) { - /* diag_listed_as: Invalid range "%s" in transliteration operator */ + /* diag_listed_as: Invalid range "%s" in transliteration operator */ Perl_croak(aTHX_ - "Invalid range \"\\N{U+%04"UVXf"}-\\N{U+%04"UVXf"}\"" - " in transliteration operator", - range_min, range_max); + "Invalid range \"\\N{U+%04" UVXf "}-\\N{U+%04" + UVXf "}\" in transliteration operator", + range_min, range_max); } #endif else { - /* diag_listed_as: Invalid range "%s" in transliteration operator */ + /* diag_listed_as: Invalid range "%s" in transliteration operator */ Perl_croak(aTHX_ - "Invalid range \"\\x{%04"UVXf"}-\\x{%04"UVXf"}\"" - " in transliteration operator", - range_min, range_max); + "Invalid range \"\\x{%04" UVXf "}-\\x{%04" UVXf "}\"" + " in transliteration operator", + range_min, range_max); } } + /* If the range is exactly two code points long, they are + * already both in the output */ + if (UNLIKELY(range_min + 1 == range_max)) { + goto range_done; + } + + /* Here the range contains at least 3 code points */ + if (has_utf8) { - /* We try to avoid creating a swash. If the upper end of - * this range is below 256, this range won't force a swash; - * otherwise it does force a swash, and as long as we have - * to have one, we might as well not expand things out. - * But if it's EBCDIC, we may have to look at each - * character below 256 if we have to convert to/from - * Unicode values */ - if (range_max > 255 + /* If everything in the transliteration is below 256, we + * can avoid special handling later. A translation table + * for each of those bytes is created by op.c. So we + * expand out all ranges to their constituent code points. + * But if we've encountered something above 255, the + * expanding won't help, so skip doing that. But if it's + * EBCDIC, we may have to look at each character below 256 + * if we have to convert to/from Unicode values */ + if ( has_above_latin1 #ifdef EBCDIC && (range_min > 255 || ! convert_unicode) #endif @@ -3057,7 +3164,7 @@ S_scan_const(pTHX_ char *start) /* Move the high character one byte to the right; then * insert between it and the range begin, an illegal * byte which serves to indicate this is a range (using - * a '-' could be ambiguous). */ + * a '-' would be ambiguous). */ char *e = d++; while (e-- > max_ptr) { *(e + 1) = *e; @@ -3079,51 +3186,94 @@ S_scan_const(pTHX_ char *start) } /* Here we need to expand out the string to contain each - * character in the range. Grow the output to handle this */ + * character in the range. Grow the output to handle this. + * For non-UTF8, we need a byte for each code point in the + * range, minus the three that we've already allocated for: the + * hyphen, the min, and the max. For UTF-8, we need this + * plus an extra byte for each code point that occupies two + * bytes (is variant) when in UTF-8 (except we've already + * allocated for the end points, including if they are + * variants). For ASCII platforms and Unicode ranges on EBCDIC + * platforms, it's easy to calculate a precise number. To + * start, we count the variants in the range, which we need + * elsewhere in this function anyway. (For the case where it + * isn't easy to calculate, 'extras' has been initialized to 0, + * and the calculation is done in a loop further down.) */ +#ifdef EBCDIC + if (convert_unicode) +#endif + { + /* This is executed unconditionally on ASCII, and for + * Unicode ranges on EBCDIC. Under these conditions, all + * code points above a certain value are variant; and none + * under that value are. We just need to find out how much + * of the range is above that value. We don't count the + * end points here, as they will already have been counted + * as they were parsed. */ + if (range_min >= UTF_CONTINUATION_MARK) { + + /* The whole range is made up of variants */ + extras = (range_max - 1) - (range_min + 1) + 1; + } + else if (range_max >= UTF_CONTINUATION_MARK) { + + /* Only the higher portion of the range is variants */ + extras = (range_max - 1) - UTF_CONTINUATION_MARK + 1; + } - save_offset = min_ptr - SvPVX_const(sv); + utf8_variant_count += extras; + } - /* The base growth is the number of code points in the range */ - grow = range_max - range_min + 1; - if (has_utf8) { + /* The base growth is the number of code points in the range, + * not including the endpoints, which have already been sized + * for (and output). We don't subtract for the hyphen, as it + * has been parsed but not output, and the SvGROW below is + * based only on what's been output plus what's left to parse. + * */ + grow = (range_max - 1) - (range_min + 1) + 1; - /* But if the output is UTF-8, some of those characters may - * need two bytes (since the maximum range value here is - * 255, the max bytes per character is two). On ASCII - * platforms, it's not much trouble to get an accurate - * count of what's needed. But on EBCDIC, the ones that - * need 2 bytes are scattered around, so just use a worst - * case value instead of calculating for that platform. */ + if (has_utf8) { #ifdef EBCDIC - grow *= 2; -#else - /* Only those above 127 require 2 bytes. This may be - * everything in the range, or not */ - if (range_min > 127) { + /* In some cases in EBCDIC, we haven't yet calculated a + * precise amount needed for the UTF-8 variants. Just + * assume the worst case, that everything will expand by a + * byte */ + if (! convert_unicode) { grow *= 2; } - else if (range_max > 127) { - grow += range_max - 127; - } + else #endif + { + /* Otherwise we know exactly how many variants there + * are in the range. */ + grow += extras; + } } - /* Subtract 3 for the bytes that were already accounted for - * (min, max, and the hyphen) */ - d = save_offset + SvGROW(sv, SvLEN(sv) + grow - 3); + /* Grow, but position the output to overwrite the range min end + * point, because in some cases we overwrite that */ + SvCUR_set(sv, d - SvPVX_const(sv)); + offset_to_min = min_ptr - SvPVX_const(sv); + /* See Note on sizing above. */ + d = offset_to_min + SvGROW(sv, SvCUR(sv) + + (send - s) + + grow + + 1 /* Trailing NUL */ ); + + /* Now, we can expand out the range. */ #ifdef EBCDIC - /* Here, we expand out the range. */ if (convert_unicode) { - IV i; + SSize_t i; /* Recall that the min and max are now in Unicode terms, so * we have to convert each character to its native * equivalent */ if (has_utf8) { for (i = range_min; i <= range_max; i++) { - append_utf8_from_native_byte(LATIN1_TO_NATIVE((U8) i), - (U8 **) &d); + append_utf8_from_native_byte( + LATIN1_TO_NATIVE((U8) i), + (U8 **) &d); } } else { @@ -3136,34 +3286,51 @@ S_scan_const(pTHX_ char *start) #endif /* Always gets run for ASCII, and sometimes for EBCDIC. */ { - IV i; - /* Here, no conversions are necessary, which means that the * first character in the range is already in 'd' and * valid, so we can skip overwriting it */ if (has_utf8) { + SSize_t i; d += UTF8SKIP(d); for (i = range_min + 1; i <= range_max; i++) { append_utf8_from_native_byte((U8) i, (U8 **) &d); } } else { + SSize_t i; d++; - for (i = range_min + 1; i <= range_max; i++) { + assert(range_min + 1 <= range_max); + for (i = range_min + 1; i < range_max; i++) { +#ifdef EBCDIC + /* In this case on EBCDIC, we haven't calculated + * the variants. Do it here, as we go along */ + if (! UVCHR_IS_INVARIANT(i)) { + utf8_variant_count++; + } +#endif *d++ = (char)i; } + + /* The range_max is done outside the loop so as to + * avoid having to special case not incrementing + * 'utf8_variant_count' on EBCDIC (it's already been + * counted when originally parsed) */ + *d++ = (char) range_max; } } #ifdef EBCDIC - /* If the original range extended above 255, add in that portion. */ + /* If the original range extended above 255, add in that + * portion. */ if (real_range_max) { *d++ = (char) UTF8_TWO_BYTE_HI(0x100); *d++ = (char) UTF8_TWO_BYTE_LO(0x100); - if (real_range_max > 0x101) - *d++ = (char) ILLEGAL_UTF8_BYTE; - if (real_range_max > 0x100) + if (real_range_max > 0x100) { + if (real_range_max > 0x101) { + *d++ = (char) ILLEGAL_UTF8_BYTE; + } d = (char*)uvchr_to_utf8((U8*)d, real_range_max); + } } #endif @@ -3186,8 +3353,7 @@ S_scan_const(pTHX_ char *start) if (!esc) in_charclass = TRUE; } - - else if (*s == ']' && PL_lex_inpat && in_charclass) { + else if (*s == ']' && PL_lex_inpat && in_charclass) { char *s1 = s-1; int esc = 0; while (s1 >= start && *s1-- == '\\') @@ -3195,11 +3361,9 @@ S_scan_const(pTHX_ char *start) if (!esc) in_charclass = FALSE; } - - /* skip for regexp comments /(?#comment)/, except for the last - * char, which will be done separately. - * Stop on (?{..}) and friends */ - + /* skip for regexp comments /(?#comment)/, except for the last + * char, which will be done separately. Stop on (?{..}) and + * friends */ else if (*s == '(' && PL_lex_inpat && s[1] == '?' && !in_charclass) { if (s[2] == '#') { while (s+1 < send && *s != ')') @@ -3212,36 +3376,36 @@ S_scan_const(pTHX_ char *start) break; } } - - /* likewise skip #-initiated comments in //x patterns */ + /* likewise skip #-initiated comments in //x patterns */ else if (*s == '#' && PL_lex_inpat && !in_charclass && ((PMOP*)PL_lex_inpat)->op_pmflags & RXf_PMf_EXTENDED) { - while (s+1 < send && *s != '\n') + while (s < send && *s != '\n') *d++ = *s++; } - - /* no further processing of single-quoted regex */ + /* no further processing of single-quoted regex */ else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') goto default_action; - /* check for embedded arrays - (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-) - */ + /* check for embedded arrays + * (@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-) + */ else if (*s == '@' && s[1]) { - if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1])) + if (UTF + ? isIDFIRST_utf8_safe(s+1, send) + : isWORDCHAR_A(s[1])) + { break; + } if (strchr(":'{$", s[1])) break; if (!PL_lex_inpat && (s[1] == '+' || s[1] == '-')) break; /* in regexp, neither @+ nor @- are interpolated */ } - - /* check for embedded scalars. only stop if we're sure it's a - variable. - */ + /* check for embedded scalars. only stop if we're sure it's a + * variable. */ else if (*s == '$') { if (!PL_lex_inpat) /* not a regexp, so $ must be var */ break; @@ -3256,6 +3420,11 @@ S_scan_const(pTHX_ char *start) /* End of else if chain - OP_TRANS rejoin rest */ + if (UNLIKELY(s >= send)) { + assert(s == send); + break; + } + /* backslashes */ if (*s == '\\' && s+1 < send) { char* e; /* Can be used for ending '}', etc. */ @@ -3345,7 +3514,7 @@ S_scan_const(pTHX_ char *start) UTF); if (! valid) { yyerror(error); - continue; + uv = 0; /* drop through to ensure range ends are set */ } goto NUM_ESCAPE_INSERT; } @@ -3363,51 +3532,68 @@ S_scan_const(pTHX_ char *start) UTF); if (! valid) { yyerror(error); - continue; + uv = 0; /* drop through to ensure range ends are set */ } } NUM_ESCAPE_INSERT: /* Insert oct or hex escaped character. */ - + /* Here uv is the ordinal of the next character being added */ if (UVCHR_IS_INVARIANT(uv)) { *d++ = (char) uv; } else { if (!has_utf8 && uv > 255) { - /* Might need to recode whatever we have accumulated so - * far if it contains any chars variant in utf8 or - * utf-ebcdic. */ - - SvCUR_set(sv, d - SvPVX_const(sv)); - SvPOK_on(sv); - *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow( - sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE - /* Above-latin1 in string - * implies no encoding */ - |SV_UTF8_NO_ENCODING, - UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); - has_utf8 = TRUE; + + /* Here, 'uv' won't fit unless we convert to UTF-8. + * If we've only seen invariants so far, all we have to + * do is turn on the flag */ + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + } + else { + SvCUR_set(sv, d - SvPVX_const(sv)); + SvPOK_on(sv); + *d = '\0'; + + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + + /* Since we're having to grow here, + * make sure we have enough room for + * this escape and a NUL, so the + * code immediately below won't have + * to actually grow again */ + UVCHR_SKIP(uv) + + (STRLEN)(send - s) + 1); + d = SvPVX(sv) + SvCUR(sv); + } + + has_above_latin1 = TRUE; + has_utf8 = TRUE; } - if (has_utf8) { + if (! has_utf8) { + *d++ = (char)uv; + utf8_variant_count++; + } + else { /* Usually, there will already be enough room in 'sv' * since such escapes are likely longer than any UTF-8 * sequence they can end up as. This isn't the case on * EBCDIC where \x{40000000} contains 12 bytes, and the * UTF-8 for it contains 14. And, we have to allow for * a trailing NUL. It probably can't happen on ASCII - * platforms, but be safe */ - const STRLEN needed = d - SvPVX(sv) + UVCHR_SKIP(uv) + * platforms, but be safe. See Note on sizing above. */ + const STRLEN needed = d - SvPVX(sv) + + UVCHR_SKIP(uv) + + (send - s) + 1; if (UNLIKELY(needed > SvLEN(sv))) { SvCUR_set(sv, d - SvPVX_const(sv)); - d = sv_grow(sv, needed) + SvCUR(sv); + d = SvCUR(sv) + SvGROW(sv, needed); } d = (char*)uvchr_to_utf8((U8*)d, uv); @@ -3418,9 +3604,6 @@ S_scan_const(pTHX_ char *start) (PL_lex_repl ? OPpTRANS_FROM_UTF : OPpTRANS_TO_UTF); } - } - else { - *d++ = (char)uv; } } #ifdef EBCDIC @@ -3468,7 +3651,8 @@ S_scan_const(pTHX_ char *start) * braces */ s++; if (*s != '{') { - yyerror("Missing braces on \\N{}"); + yyerror("Missing braces on \\N{}"); + *d++ = '\0'; continue; } s++; @@ -3480,7 +3664,7 @@ S_scan_const(pTHX_ char *start) } else { yyerror("Missing right brace on \\N{} or unescaped left brace after \\N"); } - continue; + yyquit(); /* Have exhausted the input. */ } /* Here it looks like a named character */ @@ -3499,6 +3683,7 @@ S_scan_const(pTHX_ char *start) "Invalid hexadecimal number in \\N{U+...}" ); s = e + 1; + *d++ = '\0'; continue; } while (++s < e) { @@ -3534,16 +3719,27 @@ S_scan_const(pTHX_ char *start) if (! has_utf8 && ( uv > 0xFF || PL_lex_inwhat != OP_TRANS)) { + /* See Note on sizing above. */ + const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1; + SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow( - sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - OFFUNISKIP(uv) + (STRLEN)(send - e) + 1); - d = SvPVX(sv) + SvCUR(sv); + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra); + } + else { + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + extra); + d = SvPVX(sv) + SvCUR(sv); + } + has_utf8 = TRUE; + has_above_latin1 = TRUE; } /* Add the (Unicode) code point to the output. */ @@ -3686,36 +3882,52 @@ S_scan_const(pTHX_ char *start) " in transliteration operator", /* +1 to include the "}" */ (int) (e + 1 - start), start)); + *d++ = '\0'; goto end_backslash_N; } + + if (SvUTF8(res) && UTF8_IS_ABOVE_LATIN1(*str)) { + has_above_latin1 = TRUE; + } + } else if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8. This is because * \N{} implies Unicode semantics, and scalars have * to be in utf8 to guarantee those semantics; but * not needed in tr/// */ - sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING); + sv_utf8_upgrade_flags(res, 0); str = SvPV_const(res, len); } /* Upgrade destination to be utf8 if this new * component is */ if (! has_utf8 && SvUTF8(res)) { + /* See Note on sizing above. */ + const STRLEN extra = len + (send - s) + 1; + SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra); + } + else { + sv_utf8_upgrade_flags_grow(sv, SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - len + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); + extra); + d = SvPVX(sv) + SvCUR(sv); + } has_utf8 = TRUE; } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */ /* See Note on sizing above. (NOTE: SvCUR() is not * set correctly here). */ + const STRLEN extra = len + (send - e) + 1; const STRLEN off = d - SvPVX_const(sv); - d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1); + d = off + SvGROW(sv, off + extra); } Copy(str, d, len, char); d += len; @@ -3736,15 +3948,16 @@ S_scan_const(pTHX_ char *start) case 'c': s++; if (s < send) { - *d++ = grok_bslash_c(*s++, 1); + *d++ = grok_bslash_c(*s, 1); } else { yyerror("Missing control char name in \\c"); + yyquit(); /* Are at end of input, no sense continuing */ } #ifdef EBCDIC non_portable_endpoint++; #endif - continue; + break; /* printf-style backslashes, formfeeds, newlines, etc */ case 'b': @@ -3775,54 +3988,78 @@ S_scan_const(pTHX_ char *start) } /* end if (backslash) */ default_action: - /* If we started with encoded form, or already know we want it, - then encode the next character */ - if (! NATIVE_BYTE_IS_INVARIANT((U8)(*s)) && (this_utf8 || has_utf8)) { - STRLEN len = 1; - - /* One might think that it is wasted effort in the case of the - * source being utf8 (this_utf8 == TRUE) to take the next character - * in the source, convert it to an unsigned value, and then convert - * it back again. But the source has not been validated here. The - * routine that does the conversion checks for errors like - * malformed utf8 */ + /* Just copy the input to the output, though we may have to convert + * to/from UTF-8. + * + * If the input has the same representation in UTF-8 as not, it will be + * a single byte, and we don't care about UTF8ness; just copy the byte */ + if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) { + *d++ = *s++; + } + else if (! this_utf8 && ! has_utf8) { + /* If neither source nor output is UTF-8, is also a single byte, + * just copy it; but this byte counts should we later have to + * convert to UTF-8 */ + *d++ = *s++; + utf8_variant_count++; + } + else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */ + const STRLEN len = UTF8SKIP(s); + /* We expect the source to have already been checked for + * malformedness */ + assert(isUTF8_CHAR((U8 *) s, (U8 *) send)); + + Copy(s, d, len, U8); + d += len; + s += len; + } + else { /* UTF8ness matters and doesn't match, need to convert */ + STRLEN len = 1; const UV nextuv = (this_utf8) ? utf8n_to_uvchr((U8*)s, send - s, &len, 0) : (UV) ((U8) *s); - const STRLEN need = UVCHR_SKIP(nextuv); + STRLEN need = UVCHR_SKIP(nextuv); + if (!has_utf8) { SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - need + (STRLEN)(send - s) + 1); - d = SvPVX(sv) + SvCUR(sv); + + /* See Note on sizing above. */ + need += (STRLEN)(send - s) + 1; + + if (utf8_variant_count == 0) { + SvUTF8_on(sv); + d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need); + } + else { + sv_utf8_upgrade_flags_grow(sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + need); + d = SvPVX(sv) + SvCUR(sv); + } has_utf8 = TRUE; } else if (need > len) { /* encoded value larger than old, may need extra space (NOTE: * SvCUR() is not set correctly here). See Note on sizing * above. */ + const STRLEN extra = need + (send - s) + 1; const STRLEN off = d - SvPVX_const(sv); - d = SvGROW(sv, off + need + (STRLEN)(send - s) + 1) + off; + d = off + SvGROW(sv, off + extra); } s += len; d = (char*)uvchr_to_utf8((U8*)d, nextuv); } - else { - *d++ = *s++; - } } /* while loop to process each character */ /* terminate the string and set up the sv */ *d = '\0'; SvCUR_set(sv, d - SvPVX_const(sv)); if (SvCUR(sv) >= SvLEN(sv)) - Perl_croak(aTHX_ "panic: constant overflowed allocated space, %"UVuf - " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv)); + Perl_croak(aTHX_ "panic: constant overflowed allocated space, %" UVuf + " >= %" UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv)); SvPOK_on(sv); if (has_utf8) { @@ -3963,12 +4200,9 @@ S_intuit_more(pTHX_ char *s) case '&': case '$': weight -= seen[un_char] * 10; - if (isWORDCHAR_lazy_if(s+1,UTF)) { + if (isWORDCHAR_lazy_if_safe(s+1, PL_bufend, UTF)) { int len; - char *tmp = PL_bufend; - PL_bufend = (char*)send; - scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE); - PL_bufend = tmp; + scan_ident(s, tmpbuf, sizeof tmpbuf, FALSE); len = (int)strlen(tmpbuf); if (len > 1 && gv_fetchpvn_flags(tmpbuf, len, UTF ? SVf_UTF8 : 0, SVt_PV)) @@ -4089,11 +4323,14 @@ S_intuit_method(pTHX_ char *start, SV *ioname, CV *cv) } if (*start == '$') { + SSize_t start_off = start - SvPVX(PL_linestr); if (cv || PL_last_lop_op == OP_PRINT || PL_last_lop_op == OP_SAY || isUPPER(*PL_tokenbuf)) return 0; - s = skipspace(s); - PL_bufptr = start; + /* this could be $# */ + if (isSPACE(*s)) + s = skipspace(s); + PL_bufptr = SvPVX(PL_linestr) + start_off; PL_expect = XREF; return *s == '(' ? FUNCMETH : METHOD; } @@ -4194,7 +4431,7 @@ Perl_filter_add(pTHX_ filter_t funcp, SV *datasv) STRLEN const last_lop_pos = PL_parser->last_lop ? PL_parser->last_lop - buf : 0; av_push(PL_rsfp_filters, linestr); - PL_parser->linestr = + PL_parser->linestr = newSVpvn(SvPVX(linestr), ++s-SvPVX(linestr)); buf = SvPVX(PL_parser->linestr); PL_parser->bufend = buf + SvCUR(PL_parser->linestr); @@ -4393,6 +4630,7 @@ S_tokenize_use(pTHX_ int is_use, char *s) { PERL_ARGS_ASSERT_TOKENIZE_USE; if (PL_expect != XSTATE) + /* diag_listed_as: "use" not allowed in expression */ yyerror(Perl_form(aTHX_ "\"%s\" not allowed in expression", is_use ? "use" : "no")); PL_expect = XTERM; @@ -4438,12 +4676,18 @@ static void S_check_scalar_slice(pTHX_ char *s) { s++; - while (*s == ' ' || *s == '\t') s++; - if (*s == 'q' && s[1] == 'w' - && !isWORDCHAR_lazy_if(s+2,UTF)) + while (SPACE_OR_TAB(*s)) s++; + if (*s == 'q' && s[1] == 'w' && !isWORDCHAR_lazy_if_safe(s+2, + PL_bufend, + UTF)) + { return; - while (*s && (isWORDCHAR_lazy_if(s,UTF) || strchr(" \t$#+-'\"", *s))) - s += UTF ? UTF8SKIP(s) : 1; + } + while ( isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF) + || (*s && strchr(" \t$#+-'\"", *s))) + { + s += UTF ? UTF8SKIP(s) : 1; + } if (*s == '}' || *s == ']') pl_yylval.ival = OPpSLICEWARNING; } @@ -4532,9 +4776,23 @@ Perl_yylex(pTHX) GV *gv = NULL; GV **gvp = NULL; + if (UNLIKELY(PL_parser->recheck_utf8_validity)) { + const U8* first_bad_char_loc; + if (UTF && UNLIKELY(! is_utf8_string_loc((U8 *) PL_bufptr, + PL_bufend - PL_bufptr, + &first_bad_char_loc))) + { + _force_out_malformed_utf8_message(first_bad_char_loc, + (U8 *) PL_bufend, + 0, + 1 /* 1 means die */ ); + NOT_REACHED; /* NOTREACHED */ + } + PL_parser->recheck_utf8_validity = FALSE; + } DEBUG_T( { SV* tmp = newSVpvs(""); - PerlIO_printf(Perl_debug_log, "### %"IVdf":LEX_%s/X%s %s\n", + PerlIO_printf(Perl_debug_log, "### %" IVdf ":LEX_%s/X%s %s\n", (IV)CopLINE(PL_curcop), lex_state_names[PL_lex_state], exp_name[PL_expect], @@ -4621,9 +4879,7 @@ Perl_yylex(pTHX) if (strnEQ(s, "L\\u", 3) || strnEQ(s, "U\\l", 3)) tmp = *s, *s = s[2], s[2] = (char)tmp; /* misordered... */ if ((*s == 'L' || *s == 'U' || *s == 'F') - && (strchr(PL_lex_casestack, 'L') - || strchr(PL_lex_casestack, 'U') - || strchr(PL_lex_casestack, 'F'))) + && (strpbrk(PL_lex_casestack, "LUF"))) { PL_lex_casestack[--PL_lex_casemods] = '\0'; PL_lex_allbrackets--; @@ -4781,7 +5037,16 @@ Perl_yylex(pTHX) s = PL_bufend; } else { + int save_error_count = PL_error_count; + s = scan_const(PL_bufptr); + + /* Set flag if this was a pattern and there were errors. op.c will + * refuse to compile a pattern with this flag set. Otherwise, we + * could get segfaults, etc. */ + if (PL_lex_inpat && PL_error_count > save_error_count) { + ((PMOP*)PL_lex_inpat)->op_pmflags |= PMf_HAS_ERROR; + } if (*s == '\\') PL_lex_state = LEX_INTERPCASEMOD; else @@ -4832,7 +5097,7 @@ Perl_yylex(pTHX) * as a var; e.g. ($, ...) would be seen as the var '$,' */ - char sigil; + U8 sigil; s = skipspace(s); sigil = *s++; @@ -4852,7 +5117,7 @@ Perl_yylex(pTHX) break; } s = skipspace(s); - if (isIDFIRST_lazy_if(s, UTF)) { + if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { char *dest = PL_tokenbuf + 1; /* read var name, including sigil, into PL_tokenbuf */ PL_tokenbuf[0] = sigil; @@ -4860,12 +5125,43 @@ Perl_yylex(pTHX) 0, cBOOL(UTF), FALSE); *dest = '\0'; assert(PL_tokenbuf[1]); /* we have a variable name */ + } + else { + *PL_tokenbuf = 0; + PL_in_my = 0; + } + + s = skipspace(s); + /* parse the = for the default ourselves to avoid '+=' etc being accepted here + * as the ASSIGNOP, and exclude other tokens that start with = + */ + if (*s == '=' && (!s[1] || strchr("=~>", s[1]) == 0)) { + /* save now to report with the same context as we did when + * all ASSIGNOPS were accepted */ + PL_oldbufptr = s; + + ++s; + NEXTVAL_NEXTTOKE.ival = 0; + force_next(ASSIGNOP); + PL_expect = XTERM; + } + else if (*s == ',' || *s == ')') { + PL_expect = XOPERATOR; + } + else { + /* make sure the context shows the unexpected character and + * hopefully a bit more */ + if (*s) ++s; + while (*s && *s != '$' && *s != '@' && *s != '%' && *s != ')') + s++; + PL_bufptr = s; /* for error reporting */ + yyerror("Illegal operator following parameter in a subroutine signature"); + PL_in_my = 0; + } + if (*PL_tokenbuf) { NEXTVAL_NEXTTOKE.ival = sigil; force_next('p'); /* force a signature pending identifier */ } - else - PL_in_my = 0; - PL_expect = XOPERATOR; break; case ')': @@ -4890,14 +5186,7 @@ Perl_yylex(pTHX) switch (*s) { default: if (UTF) { - if (! isUTF8_CHAR((U8 *) s, (U8 *) PL_bufend)) { - ENTER; - SAVESPTR(PL_warnhook); - PL_warnhook = PERL_WARNHOOK_FATAL; - utf8n_to_uvchr((U8*)s, PL_bufend-s, NULL, 0); - LEAVE; - } - if (isIDFIRST_utf8((U8*)s)) { + if (isIDFIRST_utf8_safe(s, PL_bufend)) { goto keylookup; } } @@ -4918,13 +5207,24 @@ Perl_yylex(pTHX) else { c = Perl_form(aTHX_ "\\x%02X", (unsigned char)*s); } - len = UTF ? Perl_utf8_length(aTHX_ (U8 *) PL_linestart, (U8 *) s) : (STRLEN) (s - PL_linestart); - if (len > UNRECOGNIZED_PRECEDE_COUNT) { - d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)PL_linestart) : s - UNRECOGNIZED_PRECEDE_COUNT; - } else { + + if (s >= PL_linestart) { d = PL_linestart; } - Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %"UTF8f"<-- HERE near column %d", c, + else { + /* somehow (probably due to a parse failure), PL_linestart has advanced + * pass PL_bufptr, get a reasonable beginning of line + */ + d = s; + while (d > SvPVX(PL_linestr) && d[-1] && d[-1] != '\n') + --d; + } + len = UTF ? Perl_utf8_length(aTHX_ (U8 *) d, (U8 *) s) : (STRLEN) (s - d); + if (len > UNRECOGNIZED_PRECEDE_COUNT) { + d = UTF ? (char *) utf8_hop_back((U8 *) s, -UNRECOGNIZED_PRECEDE_COUNT, (U8 *)d) : s - UNRECOGNIZED_PRECEDE_COUNT; + } + + Perl_croak(aTHX_ "Unrecognized character %s; marked by <-- HERE after %" UTF8f "<-- HERE near column %d", c, UTF8fARG(UTF, (s - d), d), (int) len + 1); } @@ -5027,7 +5327,7 @@ Perl_yylex(pTHX) } do { fake_eof = 0; - bof = PL_rsfp ? TRUE : FALSE; + bof = cBOOL(PL_rsfp); if (0) { fake_eof: fake_eof = LEX_FAKE_EOF; @@ -5150,8 +5450,6 @@ Perl_yylex(pTHX) d = instr(s,"perl -"); if (!d) { d = instr(s,"perl"); - if (d && d[4] == '6') - d = NULL; #if defined(DOSISH) /* avoid getting into infinite loops when shebang * line contains "Perl" rather than "perl" */ @@ -5430,7 +5728,7 @@ Perl_yylex(pTHX) PL_expect = XPOSTDEREF; TOKEN(ARROW); } - if (isIDFIRST_lazy_if(s,UTF)) { + if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { s = force_word(s,METHOD,FALSE,TRUE); TOKEN(ARROW); } @@ -5529,8 +5827,7 @@ Perl_yylex(pTHX) } else if (PL_expect == XPOSTDEREF) POSTDEREF('%'); PL_tokenbuf[0] = '%'; - s = scan_ident(s, PL_tokenbuf + 1, - sizeof PL_tokenbuf - 1, FALSE); + s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE); pl_yylval.ival = 0; if (!PL_tokenbuf[1]) { PREREF('%'); @@ -5615,7 +5912,7 @@ Perl_yylex(pTHX) grabattrs: s = skipspace(s); attrs = NULL; - while (isIDFIRST_lazy_if(s,UTF)) { + while (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { I32 tmp; SV *sv; d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &len); @@ -5654,25 +5951,12 @@ Perl_yylex(pTHX) PL_lex_stuff = NULL; } else { - if (len == 6 && strnEQ(SvPVX(sv), "unique", len)) { - sv_free(sv); - if (PL_in_my == KEY_our) { - deprecate(":unique"); - } - else - Perl_croak(aTHX_ "The 'unique' attribute may only be applied to 'our' variables"); - } - /* NOTE: any CV attrs applied here need to be part of the CVf_BUILTIN_ATTRS define in cv.h! */ - else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "lvalue", len)) { + if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "lvalue", len)) { sv_free(sv); CvLVALUE_on(PL_compcv); } - else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "locked", len)) { - sv_free(sv); - deprecate(":locked"); - } else if (!PL_in_my && len == 6 && strnEQ(SvPVX(sv), "method", len)) { sv_free(sv); CvMETHOD_on(PL_compcv); @@ -5821,7 +6105,7 @@ Perl_yylex(pTHX) while (d < PL_bufend && SPACE_OR_TAB(*d)) d++; } - if (d < PL_bufend && isIDFIRST_lazy_if(d,UTF)) { + if (d < PL_bufend && isIDFIRST_lazy_if_safe(d, PL_bufend, UTF)) { d = scan_word(d, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE, &len); while (d < PL_bufend && SPACE_OR_TAB(*d)) @@ -5941,13 +6225,19 @@ Perl_yylex(pTHX) } else /* skip plain q word */ - while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF)) + while ( t < PL_bufend + && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + { t += UTF ? UTF8SKIP(t) : 1; + } } - else if (isWORDCHAR_lazy_if(t,UTF)) { + else if (isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) { t += UTF ? UTF8SKIP(t) : 1; - while (t < PL_bufend && isWORDCHAR_lazy_if(t,UTF)) + while ( t < PL_bufend + && isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF)) + { t += UTF ? UTF8SKIP(t) : 1; + } } while (t < PL_bufend && isSPACE(*t)) t++; @@ -5970,8 +6260,10 @@ Perl_yylex(pTHX) break; } if (strEQs(s, "sub")) { + PL_bufptr = s; d = s + 3; d = skipspace(d); + s = PL_bufptr; if (*d == ':') { PL_expect = XTERM; break; @@ -6042,8 +6334,9 @@ Perl_yylex(pTHX) } s--; if (PL_expect == XOPERATOR) { - if (PL_bufptr == PL_linestart && ckWARN(WARN_SEMICOLON) - && isIDFIRST_lazy_if(s,UTF)) + if ( PL_bufptr == PL_linestart + && ckWARN(WARN_SEMICOLON) + && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { CopLINE_dec(PL_curcop); Perl_warner(aTHX_ packWARN(WARN_SEMICOLON), "%s", PL_warn_nosemi); @@ -6067,8 +6360,7 @@ Perl_yylex(pTHX) } PL_tokenbuf[0] = '&'; - s = scan_ident(s - 1, PL_tokenbuf + 1, - sizeof PL_tokenbuf - 1, TRUE); + s = scan_ident(s - 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, TRUE); pl_yylval.ival = (OPpENTERSUB_AMPER<<8); if (PL_tokenbuf[1]) { force_ident_maybe_lex('&'); @@ -6312,12 +6604,7 @@ Perl_yylex(pTHX) case '$': CLINE; - if (PL_expect == XOPERATOR) { - if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) { - return deprecate_commaless_var_list(); - } - } - else if (PL_expect == XPOSTDEREF) { + if (PL_expect == XPOSTDEREF) { if (s[1] == '#') { s++; POSTDEREF(DOLSHARP); @@ -6325,7 +6612,10 @@ Perl_yylex(pTHX) POSTDEREF('$'); } - if (s[1] == '#' && (isIDFIRST_lazy_if(s+2,UTF) || strchr("{$:+-@", s[2]))) { + if ( s[1] == '#' + && ( isIDFIRST_lazy_if_safe(s+2, PL_bufend, UTF) + || strchr("{$:+-@", s[2]))) + { PL_tokenbuf[0] = '@'; s = scan_ident(s + 1, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE); @@ -6345,8 +6635,7 @@ Perl_yylex(pTHX) } PL_tokenbuf[0] = '$'; - s = scan_ident(s, PL_tokenbuf + 1, - sizeof PL_tokenbuf - 1, FALSE); + s = scan_ident(s, PL_tokenbuf + 1, sizeof PL_tokenbuf - 1, FALSE); if (PL_expect == XOPERATOR) { d = s; if (PL_bufptr > s) { @@ -6374,14 +6663,18 @@ Perl_yylex(pTHX) if (ckWARN(WARN_SYNTAX)) { char *t = s+1; - while (isSPACE(*t) || isWORDCHAR_lazy_if(t,UTF) || *t == '$') + while ( isSPACE(*t) + || isWORDCHAR_lazy_if_safe(t, PL_bufend, UTF) + || *t == '$') + { t += UTF ? UTF8SKIP(t) : 1; + } if (*t++ == ',') { PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */ while (t < PL_bufend && *t != ']') t++; Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "Multidimensional syntax %"UTF8f" not supported", + "Multidimensional syntax %" UTF8f " not supported", UTF8fARG(UTF,(int)((t - PL_bufptr) + 1), PL_bufptr)); } } @@ -6396,17 +6689,21 @@ Perl_yylex(pTHX) do { t++; } while (isSPACE(*t)); - if (isIDFIRST_lazy_if(t,UTF)) { + if (isIDFIRST_lazy_if_safe(t, PL_bufend, UTF)) { STRLEN len; t = scan_word(t, tmpbuf, sizeof tmpbuf, TRUE, &len); while (isSPACE(*t)) t++; - if (*t == ';' - && get_cvn_flags(tmpbuf, len, UTF ? SVf_UTF8 : 0)) + if ( *t == ';' + && get_cvn_flags(tmpbuf, len, UTF + ? SVf_UTF8 + : 0)) + { Perl_warner(aTHX_ packWARN(WARN_SYNTAX), - "You need to quote \"%"UTF8f"\"", + "You need to quote \"%" UTF8f "\"", UTF8fARG(UTF, len, tmpbuf)); + } } } } @@ -6419,9 +6716,12 @@ Perl_yylex(pTHX) PL_expect = XOPERATOR; else if (strchr("$@\"'`q", *s)) PL_expect = XTERM; /* e.g. print $fh "foo" */ - else if (strchr("&*<%", *s) && isIDFIRST_lazy_if(s+1,UTF)) + else if ( strchr("&*<%", *s) + && isIDFIRST_lazy_if_safe(s+1, PL_bufend, UTF)) + { PL_expect = XTERM; /* e.g. print $fh &sub */ - else if (isIDFIRST_lazy_if(s,UTF)) { + } + else if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { char tmpbuf[sizeof PL_tokenbuf]; int t2; scan_word(s, tmpbuf, sizeof tmpbuf, TRUE, &len); @@ -6520,10 +6820,10 @@ Perl_yylex(pTHX) } else { /* Disable warning on "study /blah/" */ - if (PL_oldoldbufptr == PL_last_uni - && (*PL_last_uni != 's' || s - PL_last_uni < 5 - || memNE(PL_last_uni, "study", 5) - || isWORDCHAR_lazy_if(PL_last_uni+5,UTF) + if ( PL_oldoldbufptr == PL_last_uni + && ( *PL_last_uni != 's' || s - PL_last_uni < 5 + || memNE(PL_last_uni, "study", 5) + || isWORDCHAR_lazy_if_safe(PL_last_uni+5, PL_bufend, UTF) )) check_uni(); s = scan_pat(s,OP_MATCH); @@ -6594,10 +6894,6 @@ Perl_yylex(pTHX) TERM(THING); case '\'': - if ( PL_expect == XOPERATOR - && (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack)) - return deprecate_commaless_var_list(); - s = scan_str(s,FALSE,FALSE,FALSE,NULL); if (!s) missingterm(NULL); @@ -6610,10 +6906,6 @@ Perl_yylex(pTHX) TERM(sublex_start()); case '"': - if ( PL_expect == XOPERATOR - && (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack)) - return deprecate_commaless_var_list(); - s = scan_str(s,FALSE,FALSE,FALSE,NULL); DEBUG_T( { if (s) @@ -6893,8 +7185,10 @@ Perl_yylex(pTHX) else { /* no override */ tmp = -tmp; if (tmp == KEY_dump) { - Perl_ck_warner(aTHX_ packWARN(WARN_MISC), - "dump() better written as CORE::dump()"); + Perl_ck_warner_d(aTHX_ packWARN2(WARN_MISC,WARN_DEPRECATED), + "dump() better written as CORE::dump(). " + "dump() will no longer be available " + "in Perl 5.30"); } gv = NULL; gvp = 0; @@ -6951,7 +7245,7 @@ Perl_yylex(pTHX) s = scan_word(s, PL_tokenbuf + len, sizeof PL_tokenbuf - len, TRUE, &morelen); if (!morelen) - Perl_croak(aTHX_ "Bad name after %"UTF8f"%s", + Perl_croak(aTHX_ "Bad name after %" UTF8f "%s", UTF8fARG(UTF, len, PL_tokenbuf), *s == '\'' ? "'" : "::"); len += morelen; @@ -6979,8 +7273,9 @@ Perl_yylex(pTHX) if (ckWARN(WARN_BAREWORD) && ! gv_fetchpvn_flags(PL_tokenbuf, len, UTF ? SVf_UTF8 : 0, SVt_PVHV)) Perl_warner(aTHX_ packWARN(WARN_BAREWORD), - "Bareword \"%"UTF8f"\" refers to nonexistent package", - UTF8fARG(UTF, len, PL_tokenbuf)); + "Bareword \"%" UTF8f + "\" refers to nonexistent package", + UTF8fARG(UTF, len, PL_tokenbuf)); len -= 2; PL_tokenbuf[len] = '\0'; gv = NULL; @@ -7044,17 +7339,24 @@ Perl_yylex(pTHX) == OA_FILEREF)) { bool immediate_paren = *s == '('; + SSize_t s_off; /* (Now we can afford to cross potential line boundary.) */ s = skipspace(s); + /* intuit_method() can indirectly call lex_next_chunk(), + * invalidating s + */ + s_off = s - SvPVX(PL_linestr); /* Two barewords in a row may indicate method call. */ - - if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') + if ( ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) + || *s == '$') && (tmp = intuit_method(s, lex ? NULL : sv, cv))) { + /* the code at method: doesn't use s */ goto method; } + s = SvPVX(PL_linestr) + s_off; /* If not a declared subroutine, it's an indirect object. */ /* (But it's an indir obj regardless for sort.) */ @@ -7135,9 +7437,11 @@ Perl_yylex(pTHX) /* If followed by a bareword, see if it looks like indir obj. */ - if (tmp == 1 && !orig_keyword - && (isIDFIRST_lazy_if(s,UTF) || *s == '$') - && (tmp = intuit_method(s, lex ? NULL : sv, cv))) { + if ( tmp == 1 + && !orig_keyword + && (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '$') + && (tmp = intuit_method(s, lex ? NULL : sv, cv))) + { method: if (lex && !off) { assert(cSVOPx(pl_yylval.opval)->op_sv == sv); @@ -7281,7 +7585,7 @@ Perl_yylex(pTHX) if ((lastchar == '*' || lastchar == '%' || lastchar == '&') && saw_infix_sigil) { Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS), - "Operator or semicolon missing before %c%"UTF8f, + "Operator or semicolon missing before %c%" UTF8f, lastchar, UTF8fARG(UTF, strlen(PL_tokenbuf), PL_tokenbuf)); @@ -7300,7 +7604,7 @@ Perl_yylex(pTHX) case KEY___LINE__: FUN0OP( newSVOP(OP_CONST, 0, - Perl_newSVpvf(aTHX_ "%"IVdf, (IV)CopLINE(PL_curcop))) + Perl_newSVpvf(aTHX_ "%" IVdf, (IV)CopLINE(PL_curcop))) ); case KEY___PACKAGE__: @@ -7405,7 +7709,7 @@ Perl_yylex(pTHX) goto just_a_word; } if (!tmp) - Perl_croak(aTHX_ "CORE::%"UTF8f" is not a keyword", + Perl_croak(aTHX_ "CORE::%" UTF8f " is not a keyword", UTF8fARG(UTF, len, PL_tokenbuf)); if (tmp < 0) tmp = -tmp; @@ -7576,7 +7880,7 @@ Perl_yylex(pTHX) case KEY_exists: UNI(OP_EXISTS); - + case KEY_exit: UNI(OP_EXIT); @@ -7631,8 +7935,11 @@ Perl_yylex(pTHX) return REPORT(0); pl_yylval.ival = CopLINE(PL_curcop); s = skipspace(s); - if (PL_expect == XSTATE && isIDFIRST_lazy_if(s,UTF)) { + if ( PL_expect == XSTATE + && isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) + { char *p = s; + SSize_t s_off = s - SvPVX(PL_linestr); if ((PL_bufend - p) >= 3 && strEQs(p, "my") && isSPACE(*(p + 2))) @@ -7644,12 +7951,15 @@ Perl_yylex(pTHX) p += 3; p = skipspace(p); /* skip optional package name, as in "for my abc $x (..)" */ - if (isIDFIRST_lazy_if(p,UTF)) { + if (isIDFIRST_lazy_if_safe(p, PL_bufend, UTF)) { p = scan_word(p, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len); p = skipspace(p); } if (*p != '$' && *p != '\\') Perl_croak(aTHX_ "Missing $ on loop variable"); + + /* The buffer may have been reallocated, update s */ + s = SvPVX(PL_linestr) + s_off; } OPERATOR(FOR); @@ -7810,7 +8120,7 @@ Perl_yylex(pTHX) case KEY_last: LOOPX(OP_LAST); - + case KEY_lc: UNI(OP_LC); @@ -7887,7 +8197,7 @@ Perl_yylex(pTHX) } PL_in_my = (U16)tmp; s = skipspace(s); - if (isIDFIRST_lazy_if(s,UTF)) { + if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { s = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, TRUE, &len); if (len == 3 && strEQs(PL_tokenbuf, "sub")) goto really_sub; @@ -7937,10 +8247,10 @@ Perl_yylex(pTHX) case KEY_open: s = skipspace(s); - if (isIDFIRST_lazy_if(s,UTF)) { - const char *t; - d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, - &len); + if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { + const char *t; + d = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, + &len); for (t=d; isSPACE(*t);) t++; if ( *t && strchr("|&*+-=!?:.", *t) && ckWARN_d(WARN_PRECEDENCE) @@ -7950,7 +8260,7 @@ Perl_yylex(pTHX) && !keyword(s, d-s, 0) ) { Perl_warner(aTHX_ packWARN(WARN_PRECEDENCE), - "Precedence problem: open %"UTF8f" should be open(%"UTF8f")", + "Precedence problem: open %" UTF8f " should be open(%" UTF8f ")", UTF8fARG(UTF, d-s, s), UTF8fARG(UTF, d-s, s)); } } @@ -7990,7 +8300,7 @@ Perl_yylex(pTHX) case KEY_pos: UNIDOR(OP_POS); - + case KEY_pack: LOP(OP_PACK,XTERM); @@ -8097,9 +8407,13 @@ Perl_yylex(pTHX) { *PL_tokenbuf = '\0'; s = force_word(s,BAREWORD,TRUE,TRUE); - if (isIDFIRST_lazy_if(PL_tokenbuf,UTF)) + if (isIDFIRST_lazy_if_safe(PL_tokenbuf, + PL_tokenbuf + sizeof(PL_tokenbuf), + UTF)) + { gv_stashpvn(PL_tokenbuf, strlen(PL_tokenbuf), GV_ADD | (UTF ? SVf_UTF8 : 0)); + } else if (*s == '<') yyerror("<> at require-statement should be quotes"); } @@ -8107,7 +8421,7 @@ Perl_yylex(pTHX) orig_keyword = 0; pl_yylval.ival = 1; } - else + else pl_yylval.ival = 0; PL_expect = PL_nexttoke ? XOPERATOR : XTERM; PL_bufptr = s; @@ -8174,7 +8488,7 @@ Perl_yylex(pTHX) case KEY_chomp: UNI(OP_CHOMP); - + case KEY_scalar: UNI(OP_SCALAR); @@ -8301,7 +8615,7 @@ Perl_yylex(pTHX) s = skipspace(s); d = SvPVX(PL_linestr)+off; - if (isIDFIRST_lazy_if(s,UTF) + if ( isIDFIRST_lazy_if_safe(s, PL_bufend, UTF) || *s == '\'' || (*s == ':' && s[1] == ':')) { @@ -8360,7 +8674,8 @@ Perl_yylex(pTHX) COPLINE_SET_FROM_MULTI_END; if (!s) Perl_croak(aTHX_ "Prototype not terminated"); - (void)validate_proto(PL_subname, PL_lex_stuff, ckWARN(WARN_ILLEGALPROTO)); + (void)validate_proto(PL_subname, PL_lex_stuff, + ckWARN(WARN_ILLEGALPROTO), 0); have_proto = TRUE; s = skipspace(s); @@ -8380,7 +8695,7 @@ Perl_yylex(pTHX) if (!have_name) Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine"); else if (*s != ';' && *s != '}') - Perl_croak(aTHX_ "Illegal declaration of subroutine %"SVf, SVfARG(PL_subname)); + Perl_croak(aTHX_ "Illegal declaration of subroutine %" SVf, SVfARG(PL_subname)); } if (have_proto) { @@ -8595,8 +8910,11 @@ S_pending_ident(pTHX) if (PL_in_my) { if (PL_in_my == KEY_our) { /* "our" is merely analogous to "my" */ if (has_colon) + /* diag_listed_as: No package name allowed for variable %s + in "our" */ yyerror_pv(Perl_form(aTHX_ "No package name allowed for " - "variable %s in \"our\"", + "%se %s in \"our\"", + *PL_tokenbuf=='&' ?"subroutin":"variabl", PL_tokenbuf), UTF ? SVf_UTF8 : 0); tmp = allocmy(PL_tokenbuf, tokenbuf_len, UTF ? SVf_UTF8 : 0); } @@ -8693,7 +9011,7 @@ S_pending_ident(pTHX) { /* Downgraded from fatal to warning 20000522 mjd */ Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), - "Possible unintended interpolation of %"UTF8f + "Possible unintended interpolation of %" UTF8f " in string", UTF8fARG(UTF, tokenbuf_len, PL_tokenbuf)); } @@ -8747,16 +9065,15 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) s++; while (s < PL_bufend && isSPACE(*s)) s++; - if (isIDFIRST_lazy_if(s,UTF)) { + if (isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) { const char * const w = s; s += UTF ? UTF8SKIP(s) : 1; - while (isWORDCHAR_lazy_if(s,UTF)) + while (isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) s += UTF ? UTF8SKIP(s) : 1; while (s < PL_bufend && isSPACE(*s)) s++; if (*s == ',') { GV* gv; - PADOFFSET off; if (keyword(w, s - w, 0)) return; @@ -8764,6 +9081,7 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) if (gv && GvCVu(gv)) return; if (s - w <= 254) { + PADOFFSET off; char tmpbuf[256]; Copy(w, tmpbuf+1, s - w, char); *tmpbuf = '&'; @@ -8814,7 +9132,7 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, || ! SvOK(*cvp)) { char *msg; - + /* Here haven't found what we're looking for. If it is charnames, * perhaps it needs to be loaded. Try doing that before giving up */ if (*key == 'c') { @@ -8923,21 +9241,23 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, PERL_STATIC_INLINE void S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, - bool is_utf8, bool check_dollar) { + bool is_utf8, bool check_dollar) +{ PERL_ARGS_ASSERT_PARSE_IDENT; - for (;;) { + while (*s < PL_bufend) { if (*d >= e) Perl_croak(aTHX_ "%s", ident_too_long); - if (is_utf8 && isIDFIRST_utf8((U8*)*s)) { + if (is_utf8 && isIDFIRST_utf8_safe(*s, PL_bufend)) { /* The UTF-8 case must come first, otherwise things * like c\N{COMBINING TILDE} would start failing, as the * isWORDCHAR_A case below would gobble the 'c' up. */ char *t = *s + UTF8SKIP(*s); - while (isIDCONT_utf8((U8*)t)) + while (isIDCONT_utf8_safe((const U8*) t, (const U8*) PL_bufend)) { t += UTF8SKIP(t); + } if (*d + (t - *s) > e) Perl_croak(aTHX_ "%s", ident_too_long); Copy(*s, *d, t - *s, char); @@ -8949,7 +9269,10 @@ S_parse_ident(pTHX_ char **s, char **d, char * const e, int allow_package, *(*d)++ = *(*s)++; } while (isWORDCHAR_A(**s) && *d < e); } - else if (allow_package && **s == '\'' && isIDFIRST_lazy_if(*s+1,is_utf8)) { + else if ( allow_package + && **s == '\'' + && isIDFIRST_lazy_if_safe((*s)+1, PL_bufend, is_utf8)) + { *(*d)++ = ':'; *(*d)++ = ':'; (*s)++; @@ -9000,10 +9323,10 @@ S_scan_word(pTHX_ char *s, char *dest, STRLEN destlen, int allow_package, STRLEN * Because all ASCII characters have the same representation whether * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and * '{' without knowing if is UTF-8 or not. */ -#define VALID_LEN_ONE_IDENT(s, is_utf8) \ - (isGRAPH_A(*(s)) || ((is_utf8) \ - ? isIDFIRST_utf8((U8*) (s)) \ - : (isGRAPH_L1(*s) \ +#define VALID_LEN_ONE_IDENT(s, e, is_utf8) \ + (isGRAPH_A(*(s)) || ((is_utf8) \ + ? isIDFIRST_utf8_safe(s, e) \ + : (isGRAPH_L1(*s) \ && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD))))) STATIC char * @@ -9044,7 +9367,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) /* Here, it is not a run-of-the-mill identifier name */ if (*s == '$' && s[1] - && (isIDFIRST_lazy_if(s+1,is_utf8) + && ( isIDFIRST_lazy_if_safe(s+1, PL_bufend, is_utf8) || isDIGIT_A((U8)s[1]) || s[1] == '$' || s[1] == '{' @@ -9067,7 +9390,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) if ((s <= PL_bufend - (is_utf8) ? UTF8SKIP(s) : 1) - && VALID_LEN_ONE_IDENT(s, is_utf8)) + && VALID_LEN_ONE_IDENT(s, PL_bufend, is_utf8)) { if (is_utf8) { const STRLEN skip = UTF8SKIP(s); @@ -9095,19 +9418,36 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) bool skip; char *s2; /* If we were processing {...} notation then... */ - if (isIDFIRST_lazy_if(d,is_utf8)) { - /* if it starts as a valid identifier, assume that it is one. - (the later check for } being at the expected point will trap - cases where this doesn't pan out.) */ - d += is_utf8 ? UTF8SKIP(d) : 1; - parse_ident(&s, &d, e, 1, is_utf8, TRUE); - *d = '\0'; + if (isIDFIRST_lazy_if_safe(d, e, is_utf8) + || (!isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ + && isWORDCHAR(*s)) + ) { + /* note we have to check for a normal identifier first, + * as it handles utf8 symbols, and only after that has + * been ruled out can we look at the caret words */ + if (isIDFIRST_lazy_if_safe(d, e, is_utf8) ) { + /* if it starts as a valid identifier, assume that it is one. + (the later check for } being at the expected point will trap + cases where this doesn't pan out.) */ + d += is_utf8 ? UTF8SKIP(d) : 1; + parse_ident(&s, &d, e, 1, is_utf8, TRUE); + *d = '\0'; + } + else { /* caret word: ${^Foo} ${^CAPTURE[0]} */ + d++; + while (isWORDCHAR(*s) && d < e) { + *d++ = *s++; + } + if (d >= e) + Perl_croak(aTHX_ "%s", ident_too_long); + *d = '\0'; + } tmp_copline = CopLINE(PL_curcop); if (s < PL_bufend && isSPACE(*s)) { s = skipspace(s); } if ((*s == '[' || (*s == '{' && strNE(dest, "sub")))) { - /* ${foo[0]} and ${foo{bar}} notation. */ + /* ${foo[0]} and ${foo{bar}} and ${^CAPTURE[0]} notation. */ if (ckWARN(WARN_AMBIGUOUS) && keyword(dest, d - dest, 0)) { const char * const brack = (const char *) @@ -9126,29 +9466,19 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) return s; } } - /* Handle extended ${^Foo} variables - * 1999-02-27 mjd-perl-patch@plover.com */ - else if (! isPRINT(*d) /* isCNTRL(d), plus all non-ASCII */ - && isWORDCHAR(*s)) - { - d++; - while (isWORDCHAR(*s) && d < e) { - *d++ = *s++; - } - if (d >= e) - Perl_croak(aTHX_ "%s", ident_too_long); - *d = '\0'; - } if ( !tmp_copline ) tmp_copline = CopLINE(PL_curcop); - if ((skip = s < PL_bufend && isSPACE(*s))) + if ((skip = s < PL_bufend && isSPACE(*s))) { /* Avoid incrementing line numbers or resetting PL_linestart, in case we have to back up. */ + STRLEN s_off = s - SvPVX(PL_linestr); s2 = peekspace(s); + s = SvPVX(PL_linestr) + s_off; + } else s2 = s; - + /* Expect to find a closing } after consuming any trailing whitespace. */ if (*s2 == '}') { @@ -9174,7 +9504,7 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) orig_copline = CopLINE(PL_curcop); CopLINE_set(PL_curcop, tmp_copline); Perl_warner(aTHX_ packWARN(WARN_AMBIGUOUS), - "Ambiguous use of %c{%"SVf"} resolved to %c%"SVf, + "Ambiguous use of %c{%" SVf "} resolved to %c%" SVf, funny, SVfARG(tmp), funny, SVfARG(tmp)); CopLINE_set(PL_curcop, orig_copline); } @@ -9211,7 +9541,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse STRLEN charlen = UTF ? UTF8SKIP(*s) : 1; if ( charlen != 1 || ! strchr(valid_flags, c) ) { - if (isWORDCHAR_lazy_if(*s, UTF)) { + if (isWORDCHAR_lazy_if_safe( *s, PL_bufend, UTF)) { yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", (int)charlen, *s), UTF ? SVf_UTF8 : 0); (*s) += charlen; @@ -9355,14 +9685,10 @@ S_scan_pat(pTHX_ char *start, I32 type) /* issue a warning if /c is specified,but /g is not */ if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL)) { - Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), + Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless without /g" ); } - if (UNLIKELY((x_mod_count) > 1)) { - yyerror("Only one /x regex modifier is allowed"); - } - PL_lex_op = (OP*)pm; pl_yylval.ival = OP_MATCH; return s; @@ -9417,10 +9743,6 @@ S_scan_subst(pTHX_ char *start) } } - if (UNLIKELY((x_mod_count) > 1)) { - yyerror("Only one /x regex modifier is allowed"); - } - if ((pm->op_pmflags & PMf_CONTINUE)) { Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" ); } @@ -9430,18 +9752,14 @@ S_scan_subst(pTHX_ char *start) PL_multi_end = 0; pm->op_pmflags |= PMf_EVAL; - while (es-- > 0) { - if (es) - sv_catpvs(repl, "eval "); - else - sv_catpvs(repl, "do "); - } - sv_catpvs(repl, "{"); + for (; es > 1; es--) { + sv_catpvs(repl, "eval "); + } + sv_catpvs(repl, "do {"); sv_catsv(repl, PL_parser->lex_sub_repl); sv_catpvs(repl, "}"); SvREFCNT_dec(PL_parser->lex_sub_repl); PL_parser->lex_sub_repl = repl; - es = 1; } @@ -9455,7 +9773,8 @@ S_scan_subst(pTHX_ char *start) * spreads over */ sv_upgrade(PL_parser->lex_sub_repl, SVt_PVNV); ((XPVNV*)SvANY(PL_parser->lex_sub_repl))->xnv_u.xnv_lines = 0; - ((XPVIV*)SvANY(PL_parser->lex_sub_repl))->xiv_u.xivu_eval_seen = es; + ((XPVIV*)SvANY(PL_parser->lex_sub_repl))->xiv_u.xivu_eval_seen = + cBOOL(es); } PL_lex_op = (OP*)pm; @@ -9595,10 +9914,12 @@ S_scan_heredoc(pTHX_ char *s) s++, term = '\''; else term = '"'; - if (!isWORDCHAR_lazy_if(s,UTF)) - deprecate("bare << to mean <<\"\""); + if (! isWORDCHAR_lazy_if_safe(s, PL_bufend, UTF)) + Perl_croak(aTHX_ "Use of bare << to mean <<\"\" is forbidden"); peek = s; - while (isWORDCHAR_lazy_if(peek,UTF)) { + while ( + isWORDCHAR_lazy_if_safe(peek, PL_bufend, UTF)) + { peek += UTF ? UTF8SKIP(peek) : 1; } len = (peek - s >= e - d) ? (e - d) : (peek - s); @@ -9713,7 +10034,7 @@ S_scan_heredoc(pTHX_ char *s) /* Only valid if it's preceded by whitespace only */ while (backup != myolds && --backup >= myolds) { - if (*backup != ' ' && *backup != '\t') { + if (! SPACE_OR_TAB(*backup)) { break; } @@ -9849,14 +10170,14 @@ S_scan_heredoc(pTHX_ char *s) /* Only valid if it's preceded by whitespace only */ while (backup != s && --backup >= s) { - if (*backup != ' ' && *backup != '\t') { + if (! SPACE_OR_TAB(*backup)) { break; } indent_len++; } /* All whitespace or none! */ - if (backup == found || *backup == ' ' || *backup == '\t') { + if (backup == found || SPACE_OR_TAB(*backup)) { Newxz(indent, indent_len + 1, char); memcpy(indent, backup, indent_len); SvREFCNT_dec(PL_linestr); @@ -9896,8 +10217,8 @@ S_scan_heredoc(pTHX_ char *s) STRLEN herelen = SvCUR(tmpstr); char *ss = SvPVX(tmpstr); char *se = ss + herelen; - SV *newstr = newSVpvs(""); - SvGROW(newstr, herelen); + SV *newstr = newSV(herelen+1); + SvPOK_on(newstr); /* Trim leading whitespace */ while (ss < se) { @@ -9905,6 +10226,7 @@ S_scan_heredoc(pTHX_ char *s) if (*ss == '\n') { sv_catpv(newstr,"\n"); ss++; + linecount++; /* Found our indentation? Strip it */ } else if (se - ss >= indent_len @@ -9928,12 +10250,9 @@ S_scan_heredoc(pTHX_ char *s) (int)linecount ); } - - linecount++; } - - sv_setsv(tmpstr,newstr); - + /* avoid sv_setsv() as we dont wan't to COW here */ + sv_setpvn(tmpstr,SvPVX(newstr),SvCUR(newstr)); Safefree(indent); SvREFCNT_dec_NN(newstr); } @@ -10017,8 +10336,9 @@ S_scan_inputsymbol(pTHX_ char *start) if (*d == '$' && d[1]) d++; /* allow or */ - while (*d && (isWORDCHAR_lazy_if(d,UTF) || *d == '\'' || *d == ':')) + while (isWORDCHAR_lazy_if_safe(d, e, UTF) || *d == '\'' || *d == ':') { d += UTF ? UTF8SKIP(d) : 1; + } /* If we've tried to read what we allow filehandles to look like, and there's still text left, then it must be a glob() and not a getline. @@ -10145,7 +10465,7 @@ S_scan_inputsymbol(pTHX_ char *start) ($*@) sub prototypes sub foo ($) (stuff) sub attr parameters sub foo : attr(stuff) <> readline or globs , <>, <$fh>, or <*.c> - + In most of these cases (all but <>, patterns and transliterate) yylex() calls scan_str(). m// makes yylex() call scan_pat() which calls scan_str(). s/// makes yylex() call scan_subst() which calls @@ -10182,6 +10502,18 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re STRLEN termlen; /* length of terminating string */ line_t herelines; + /* The delimiters that have a mirror-image closing one */ + const char * opening_delims = "([{<"; + const char * closing_delims = ")]}>"; + + const char * non_grapheme_msg = "Use of unassigned code point or" + " non-standalone grapheme for a delimiter" + " will be a fatal error starting in Perl" + " 5.30"; + /* The only non-UTF character that isn't a stand alone grapheme is + * white-space, hence can't be a delimiter. So can skip for non-UTF-8 */ + bool check_grapheme = UTF && ckWARN_d(WARN_DEPRECATED); + PERL_ARGS_ASSERT_SCAN_STR; /* skip space before the delimiter */ @@ -10194,15 +10526,35 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re /* after skipping whitespace, the next character is the terminator */ term = *s; - if (!UTF) { + if (!UTF || UTF8_IS_INVARIANT(term)) { termcode = termstr[0] = term; termlen = 1; } else { termcode = utf8_to_uvchr_buf((U8*)s, (U8*)PL_bufend, &termlen); + if (check_grapheme) { + if ( UNLIKELY(UNICODE_IS_SUPER(termcode)) + || UNLIKELY(UNICODE_IS_NONCHAR(termcode))) + { + /* These are considered graphemes, and since the ending + * delimiter will be the same, we don't have to check the other + * end */ + check_grapheme = FALSE; + } + else if (UNLIKELY(! _is_grapheme((U8 *) start, + (U8 *) s, + (U8 *) PL_bufend, + termcode))) + { + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), "%s", non_grapheme_msg); + + /* Don't have to check the other end, as have already warned at + * this one */ + check_grapheme = FALSE; + } + } + Copy(s, termstr, termlen, U8); - if (!UTF8_IS_INVARIANT(term)) - has_utf8 = TRUE; } /* mark where we are */ @@ -10210,9 +10562,10 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re PL_multi_open = termcode; herelines = PL_parser->herelines; - /* find corresponding closing delimiter */ - if (term && (tmps = strchr("([{< )]}> )]}>",term))) - termcode = termstr[0] = term = tmps[5]; + /* If the delimiter has a mirror-image closing one, get it */ + if (term && (tmps = strchr(opening_delims, term))) { + termcode = termstr[0] = term = closing_delims[tmps - opening_delims]; + } PL_multi_close = termcode; @@ -10255,18 +10608,35 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re } /* terminate when run out of buffer (the for() condition), or have found the terminator */ - else if (*s == term) { - if (termlen == 1) + else if (*s == term) { /* First byte of terminator matches */ + if (termlen == 1) /* If is the only byte, are done */ break; - if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen)) + + /* If the remainder of the terminator matches, also are + * done, after checking that is a separate grapheme */ + if ( s + termlen <= PL_bufend + && memEQ(s + 1, (char*)termstr + 1, termlen - 1)) + { + if ( check_grapheme + && UNLIKELY(! _is_grapheme((U8 *) start, + (U8 *) s, + (U8 *) PL_bufend, + termcode))) + { + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), + "%s", non_grapheme_msg); + } break; + } } - else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) + else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF) { has_utf8 = TRUE; + } + *to = *s; } } - + /* if the terminator isn't the same as the start character (e.g., matched brackets), we have to allow more in the quoting, and be prepared for nested brackets. @@ -10324,7 +10694,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re else if (to - SvPVX_const(sv) == 1 && to[-1] == '\r') to[-1] = '\n'; #endif - + /* if we're out of file, or a read fails, bail and reset the current line marker so we can report where the unterminated string began */ @@ -10335,7 +10705,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re CopLINE_set(PL_curcop, (line_t)PL_multi_start); return NULL; } - s = PL_bufptr; + s = start = PL_bufptr; } /* at this point, we have successfully read the delimited string */ @@ -10403,6 +10773,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) bool floatit; /* boolean: int or float? */ const char *lastub = NULL; /* position of last underbar */ static const char* const number_too_long = "Number too long"; + bool warned_about_underscore = 0; +#define WARN_ABOUT_UNDERSCORE() \ + do { \ + if (!warned_about_underscore) { \ + warned_about_underscore = 1; \ + Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), \ + "Misplaced _ in number"); \ + } \ + } while(0) /* Hexadecimal floating point. * * In many places (where we have quads and NV is IEEE 754 double) @@ -10487,8 +10866,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) } if (*s == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; } @@ -10511,8 +10889,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) /* _ are ignored -- but warned about if consecutive */ case '_': if (lastub && s == lastub + 1) - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; break; @@ -10597,9 +10974,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) out: /* final misplaced underbar check */ - if (s[-1] == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number"); - } + if (s[-1] == '_') + WARN_ABOUT_UNDERSCORE(); if (UNLIKELY(HEXFP_PEEK(s))) { /* Do sloppy (on the underbars) but quick detection @@ -10808,8 +11184,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) */ if (*s == '_') { if (lastub && s == lastub + 1) - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; } else { @@ -10822,9 +11197,8 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) } /* final misplaced underbar check */ - if (lastub && s == lastub + 1) { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), "Misplaced _ in number"); - } + if (lastub && s == lastub + 1) + WARN_ABOUT_UNDERSCORE(); /* read a decimal portion if there is one. avoid 3..5 being interpreted as the number 3. followed @@ -10835,8 +11209,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) *d++ = *s++; if (*s == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s; } @@ -10852,18 +11225,15 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) Perl_croak(aTHX_ "%s", number_too_long); if (*s == '_') { if (lastub && s == lastub + 1) - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s; } else *d++ = *s; } /* fractional part ending in underbar? */ - if (s[-1] == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); - } + if (s[-1] == '_') + WARN_ABOUT_UNDERSCORE(); if (*s == '.' && isDIGIT(s[1])) { /* oops, it's really a v-string, but without the "v" */ s = start; @@ -10876,9 +11246,11 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) || UNLIKELY(hexfp && isALPHA_FOLD_EQ(*s, 'p'))) && strchr("+-0123456789_", s[1])) { - floatit = TRUE; + int exp_digits = 0; + const char *save_s = s; + char * save_d = d; - /* regardless of whether user said 3E5 or 3e5, use lower 'e', + /* regardless of whether user said 3E5 or 3e5, use lower 'e', ditto for p (hexfloats) */ if ((isALPHA_FOLD_EQ(*s, 'e'))) { /* At least some Mach atof()s don't grok 'E' */ @@ -10893,8 +11265,7 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) /* stray preinitial _ */ if (*s == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; } @@ -10904,14 +11275,14 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) /* stray initial _ */ if (*s == '_') { - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; } /* read digits of exponent */ while (isDIGIT(*s) || *s == '_') { if (isDIGIT(*s)) { + ++exp_digits; if (d >= e) Perl_croak(aTHX_ "%s", number_too_long); *d++ = *s++; @@ -10919,11 +11290,24 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) else { if (((lastub && s == lastub + 1) || (!isDIGIT(s[1]) && s[1] != '_'))) - Perl_ck_warner(aTHX_ packWARN(WARN_SYNTAX), - "Misplaced _ in number"); + WARN_ABOUT_UNDERSCORE(); lastub = s++; } } + + if (!exp_digits) { + /* no exponent digits, the [eEpP] could be for something else, + * though in practice we don't get here for p since that's preparsed + * earlier, and results in only the 0xX being consumed, so behave similarly + * for decimal floats and consume only the D.DD, leaving the [eE] to the + * next token. + */ + s = save_s; + d = save_d; + } + else { + floatit = TRUE; + } } @@ -11005,8 +11389,6 @@ Perl_scan_num(pTHX_ const char *start, YYSTYPE* lvalp) STATIC char * S_scan_formline(pTHX_ char *s) { - char *eol; - char *t; SV * const stuff = newSVpvs(""); bool needargs = FALSE; bool eofmt = FALSE; @@ -11014,8 +11396,9 @@ S_scan_formline(pTHX_ char *s) PERL_ARGS_ASSERT_SCAN_FORMLINE; while (!needargs) { + char *eol; if (*s == '.') { - t = s+1; + char *t = s+1; #ifdef PERL_STRICT_CR while (SPACE_OR_TAB(*t)) t++; @@ -11032,6 +11415,7 @@ S_scan_formline(pTHX_ char *s) if (!eol++) eol = PL_bufend; if (*s != '#') { + char *t; for (t = s; t < eol; t++) { if (*t == '~' && t[1] == '~' && SvCUR(stuff)) { needargs = FALSE; @@ -11075,8 +11459,7 @@ S_scan_formline(pTHX_ char *s) PL_expect = XSTATE; if (needargs) { const char *s2 = s; - while (*s2 == '\r' || *s2 == ' ' || *s2 == '\t' || *s2 == '\f' - || *s2 == '\v') + while (isSPACE(*s2) && *s2 != '\n') s2++; if (*s2 == '{') { PL_expect = XTERMBLOCK; @@ -11134,6 +11517,29 @@ S_yywarn(pTHX_ const char *const s, U32 flags) return 0; } +void +Perl_abort_execution(pTHX_ const char * const msg, const char * const name) +{ + PERL_ARGS_ASSERT_ABORT_EXECUTION; + + if (PL_minus_c) + Perl_croak(aTHX_ "%s%s had compilation errors.\n", msg, name); + else { + Perl_croak(aTHX_ + "%sExecution of %s aborted due to compilation errors.\n", msg, name); + } + NOT_REACHED; /* NOTREACHED */ +} + +void +Perl_yyquit(pTHX) +{ + /* Called, after at least one error has been found, to abort the parse now, + * instead of trying to forge ahead */ + + yyerror_pvn(NULL, 0, 0); +} + int Perl_yyerror(pTHX_ const char *const s) { @@ -11157,100 +11563,120 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags) SV * const where_sv = newSVpvs_flags("", SVs_TEMP); int yychar = PL_parser->yychar; - PERL_ARGS_ASSERT_YYERROR_PVN; - - if (!yychar || (yychar == ';' && !PL_rsfp)) - sv_catpvs(where_sv, "at EOF"); - else if ( PL_oldoldbufptr - && PL_bufptr > PL_oldoldbufptr - && PL_bufptr - PL_oldoldbufptr < 200 - && PL_oldoldbufptr != PL_oldbufptr - && PL_oldbufptr != PL_bufptr) - { - /* - Only for NetWare: - The code below is removed for NetWare because it abends/crashes on NetWare - when the script has error such as not having the closing quotes like: - if ($var eq "value) - Checking of white spaces is anyway done in NetWare code. - */ + /* Output error message 's' with length 'len'. 'flags' are SV flags that + * apply. If the number of errors found is large enough, it abandons + * parsing. If 's' is NULL, there is no message, and it abandons + * processing unconditionally */ + + if (s != NULL) { + if (!yychar || (yychar == ';' && !PL_rsfp)) + sv_catpvs(where_sv, "at EOF"); + else if ( PL_oldoldbufptr + && PL_bufptr > PL_oldoldbufptr + && PL_bufptr - PL_oldoldbufptr < 200 + && PL_oldoldbufptr != PL_oldbufptr + && PL_oldbufptr != PL_bufptr) + { + /* + Only for NetWare: + The code below is removed for NetWare because it + abends/crashes on NetWare when the script has error such as + not having the closing quotes like: + if ($var eq "value) + Checking of white spaces is anyway done in NetWare code. + */ #ifndef NETWARE - while (isSPACE(*PL_oldoldbufptr)) - PL_oldoldbufptr++; + while (isSPACE(*PL_oldoldbufptr)) + PL_oldoldbufptr++; #endif - context = PL_oldoldbufptr; - contlen = PL_bufptr - PL_oldoldbufptr; - } - else if ( PL_oldbufptr - && PL_bufptr > PL_oldbufptr - && PL_bufptr - PL_oldbufptr < 200 - && PL_oldbufptr != PL_bufptr) { - /* - Only for NetWare: - The code below is removed for NetWare because it abends/crashes on NetWare - when the script has error such as not having the closing quotes like: - if ($var eq "value) - Checking of white spaces is anyway done in NetWare code. - */ + context = PL_oldoldbufptr; + contlen = PL_bufptr - PL_oldoldbufptr; + } + else if ( PL_oldbufptr + && PL_bufptr > PL_oldbufptr + && PL_bufptr - PL_oldbufptr < 200 + && PL_oldbufptr != PL_bufptr) { + /* + Only for NetWare: + The code below is removed for NetWare because it + abends/crashes on NetWare when the script has error such as + not having the closing quotes like: + if ($var eq "value) + Checking of white spaces is anyway done in NetWare code. + */ #ifndef NETWARE - while (isSPACE(*PL_oldbufptr)) - PL_oldbufptr++; + while (isSPACE(*PL_oldbufptr)) + PL_oldbufptr++; #endif - context = PL_oldbufptr; - contlen = PL_bufptr - PL_oldbufptr; - } - else if (yychar > 255) - sv_catpvs(where_sv, "next token ???"); - else if (yychar == YYEMPTY) { - if (PL_lex_state == LEX_NORMAL) - sv_catpvs(where_sv, "at end of line"); - else if (PL_lex_inpat) - sv_catpvs(where_sv, "within pattern"); - else - sv_catpvs(where_sv, "within string"); - } - else { - sv_catpvs(where_sv, "next char "); - if (yychar < 32) - Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar)); - else if (isPRINT_LC(yychar)) { - const char string = yychar; - sv_catpvn(where_sv, &string, 1); - } - else - Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255); - } - msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP); - Perl_sv_catpvf(aTHX_ msg, " at %s line %"IVdf", ", - OutCopFILE(PL_curcop), - (IV)(PL_parser->preambling == NOLINE - ? CopLINE(PL_curcop) - : PL_parser->preambling)); - if (context) - Perl_sv_catpvf(aTHX_ msg, "near \"%"UTF8f"\"\n", - UTF8fARG(UTF, contlen, context)); - else - Perl_sv_catpvf(aTHX_ msg, "%"SVf"\n", SVfARG(where_sv)); - if (PL_multi_start < PL_multi_end && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1) { - Perl_sv_catpvf(aTHX_ msg, - " (Might be a runaway multi-line %c%c string starting on line %"IVdf")\n", - (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start); - PL_multi_end = 0; - } - if (PL_in_eval & EVAL_WARNONLY) { - PL_in_eval &= ~EVAL_WARNONLY; - Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%"SVf, SVfARG(msg)); + context = PL_oldbufptr; + contlen = PL_bufptr - PL_oldbufptr; + } + else if (yychar > 255) + sv_catpvs(where_sv, "next token ???"); + else if (yychar == YYEMPTY) { + if (PL_lex_state == LEX_NORMAL) + sv_catpvs(where_sv, "at end of line"); + else if (PL_lex_inpat) + sv_catpvs(where_sv, "within pattern"); + else + sv_catpvs(where_sv, "within string"); + } + else { + sv_catpvs(where_sv, "next char "); + if (yychar < 32) + Perl_sv_catpvf(aTHX_ where_sv, "^%c", toCTRL(yychar)); + else if (isPRINT_LC(yychar)) { + const char string = yychar; + sv_catpvn(where_sv, &string, 1); + } + else + Perl_sv_catpvf(aTHX_ where_sv, "\\%03o", yychar & 255); + } + msg = newSVpvn_flags(s, len, (flags & SVf_UTF8) | SVs_TEMP); + Perl_sv_catpvf(aTHX_ msg, " at %s line %" IVdf ", ", + OutCopFILE(PL_curcop), + (IV)(PL_parser->preambling == NOLINE + ? CopLINE(PL_curcop) + : PL_parser->preambling)); + if (context) + Perl_sv_catpvf(aTHX_ msg, "near \"%" UTF8f "\"\n", + UTF8fARG(UTF, contlen, context)); + else + Perl_sv_catpvf(aTHX_ msg, "%" SVf "\n", SVfARG(where_sv)); + if ( PL_multi_start < PL_multi_end + && (U32)(CopLINE(PL_curcop) - PL_multi_end) <= 1) + { + Perl_sv_catpvf(aTHX_ msg, + " (Might be a runaway multi-line %c%c string starting on" + " line %" IVdf ")\n", + (int)PL_multi_open,(int)PL_multi_close,(IV)PL_multi_start); + PL_multi_end = 0; + } + if (PL_in_eval & EVAL_WARNONLY) { + PL_in_eval &= ~EVAL_WARNONLY; + Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%" SVf, SVfARG(msg)); + } + else { + qerror(msg); + } } - else - qerror(msg); - if (PL_error_count >= 10) { - SV * errsv; - if (PL_in_eval && ((errsv = ERRSV), SvCUR(errsv))) - Perl_croak(aTHX_ "%"SVf"%s has too many errors.\n", - SVfARG(errsv), OutCopFILE(PL_curcop)); - else - Perl_croak(aTHX_ "%s has too many errors.\n", - OutCopFILE(PL_curcop)); + if (s == NULL || PL_error_count >= 10) { + const char * msg = ""; + const char * const name = OutCopFILE(PL_curcop); + + if (PL_in_eval) { + SV * errsv = ERRSV; + if (SvCUR(errsv)) { + msg = Perl_form(aTHX_ "%" SVf, SVfARG(errsv)); + } + } + + if (s == NULL) { + abort_execution(msg, name); + } + else { + Perl_croak(aTHX_ "%s%s has too many errors.\n", msg, name); + } } PL_in_my = 0; PL_in_my_stash = NULL; @@ -11369,10 +11795,10 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) Perl_croak(aTHX_ "panic: utf16_textfilter called in block mode (for %d characters)", maxlen); } if (status < 0) { - Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%"IVdf")", status); + Perl_croak(aTHX_ "panic: utf16_textfilter called after error (status=%" IVdf ")", status); } DEBUG_P(PerlIO_printf(Perl_debug_log, - "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", + "utf16_textfilter(%p,%ce): idx=%d maxlen=%d status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n", FPTR2DPTR(void *, S_utf16_textfilter), reverse ? 'l' : 'b', idx, maxlen, status, (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); @@ -11427,7 +11853,7 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) status = FILTER_READ(idx + 1, utf16_buffer, 160 + (SvCUR(utf16_buffer) & 1)); - DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%"IVdf" SvCUR(sv)=%"UVuf"\n", status, (UV)SvCUR(utf16_buffer))); + DEBUG_P(PerlIO_printf(Perl_debug_log, "utf16_textfilter status=%" IVdf " SvCUR(sv)=%" UVuf "\n", status, (UV)SvCUR(utf16_buffer))); DEBUG_P({ sv_dump(utf16_buffer); sv_dump(utf8_buffer);}); if (status < 0) { /* Error */ @@ -11463,7 +11889,7 @@ S_utf16_textfilter(pTHX_ int idx, SV *sv, int maxlen) } } DEBUG_P(PerlIO_printf(Perl_debug_log, - "utf16_textfilter: returns, status=%"IVdf" utf16=%"UVuf" utf8=%"UVuf"\n", + "utf16_textfilter: returns, status=%" IVdf " utf16=%" UVuf " utf8=%" UVuf "\n", status, (UV)SvCUR(utf16_buffer), (UV)SvCUR(utf8_buffer))); DEBUG_P({ sv_dump(utf8_buffer); sv_dump(sv);}); @@ -11887,7 +12313,7 @@ Perl_parse_label(pTHX_ U32 flags) STRLEN wlen, bufptr_pos; lex_read_space(0); t = s = PL_bufptr; - if (!isIDFIRST_lazy_if(s, UTF)) + if (!isIDFIRST_lazy_if_safe(s, PL_bufend, UTF)) goto no_label; t = scan_word(s, PL_tokenbuf, sizeof PL_tokenbuf, FALSE, &wlen); if (word_takes_any_delimiter(s, wlen))