X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/10c25cd94120c0e509e5ce54480c24d08281e090..4e96da834c8a37737d5de382697fd3646ba68673:/toke.c diff --git a/toke.c b/toke.c index 9c9731a..ae832c0 100644 --- a/toke.c +++ b/toke.c @@ -99,9 +99,9 @@ static const char* const ident_too_long = "Identifier too long"; #define XFAKEBRACK 0x80 #ifdef USE_UTF8_SCRIPTS -# define UTF (!IN_BYTES) +# define UTF cBOOL(!IN_BYTES) #else -# define UTF ((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8))) +# define UTF cBOOL((PL_linestr && DO_UTF8(PL_linestr)) || ( !(PL_parser->lex_flags & LEX_IGNORE_UTF8_HINTS) && (PL_hints & HINT_UTF8))) #endif /* The maximum number of characters preceding the unrecognized one to display */ @@ -206,7 +206,7 @@ static const char* const lex_state_names[] = { #define TOKEN(retval) return ( PL_bufptr = s, REPORT(retval)) #define OPERATOR(retval) return (PL_expect = XTERM, PL_bufptr = s, REPORT(retval)) -#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, REPORT(retval))) +#define AOPERATOR(retval) return ao((PL_expect = XTERM, PL_bufptr = s, retval)) #define PREBLOCK(retval) return (PL_expect = XBLOCK,PL_bufptr = s, REPORT(retval)) #define PRETERMBLOCK(retval) return (PL_expect = XTERMBLOCK,PL_bufptr = s, REPORT(retval)) #define PREREF(retval) return (PL_expect = XREF,PL_bufptr = s, REPORT(retval)) @@ -220,14 +220,14 @@ static const char* const lex_state_names[] = { #define FUN0(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0)) #define FUN0OP(f) return (pl_yylval.opval=f, CLINE, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC0OP)) #define FUN1(f) return (pl_yylval.ival=f, PL_expect=XOPERATOR, PL_bufptr=s, REPORT((int)FUNC1)) -#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITOROP))) -#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)BITANDOP))) -#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)SHIFTOP))) -#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)POWOP))) +#define BOop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITOROP)) +#define BAop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)BITANDOP)) +#define SHop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)SHIFTOP)) +#define PWop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)POWOP)) #define PMop(f) return(pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MATCHOP)) -#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)ADDOP))) +#define Aop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)ADDOP)) #define AopNOASSIGN(f) return (pl_yylval.ival=f, PL_bufptr=s, REPORT((int)ADDOP)) -#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)MULOP))) +#define Mop(f) return ao((pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, (int)MULOP)) #define Eop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)EQOP)) #define Rop(f) return (pl_yylval.ival=f, PL_expect=XTERM, PL_bufptr=s, REPORT((int)RELOP)) @@ -486,7 +486,7 @@ S_ao(pTHX_ int toketype) pl_yylval.ival = OP_DORASSIGN; toketype = ASSIGNOP; } - return toketype; + return REPORT(toketype); } /* @@ -730,7 +730,7 @@ Perl_lex_start(pTHX_ SV *line, PerlIO *rsfp, U32 flags) parser->bufend = parser->bufptr + SvCUR(parser->linestr); parser->last_lop = parser->last_uni = NULL; - assert(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES + STATIC_ASSERT_STMT(FITS_IN_8_BITS(LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES |LEX_DONT_CLOSE_RSFP)); parser->lex_flags = (U8) (flags & (LEX_IGNORE_UTF8_HINTS|LEX_EVALBYTES |LEX_DONT_CLOSE_RSFP)); @@ -1968,7 +1968,7 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len) SV * const sv = newSVpvn_utf8(start, len, !IN_BYTES && UTF - && !is_ascii_string((const U8*)start, len) + && !is_invariant_string((const U8*)start, len) && is_utf8_string((const U8*)start, len)); return sv; } @@ -1987,7 +1987,6 @@ S_newSV_maybe_utf8(pTHX_ const char *const start, STRLEN len) * a keyword (do this if the word is a label, e.g. goto FOO) * int allow_pack : if true, : characters will also be allowed (require, * use, etc. do this) - * int allow_initial_tick : used by the "sub" lexer only. */ STATIC char * @@ -2054,7 +2053,7 @@ S_force_ident(pTHX_ const char *s, int kind) warnings if the symbol must be introduced in an eval. GSAR 96-10-12 */ gv_fetchpvn_flags(s, len, - (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL) + (PL_in_eval ? GV_ADDMULTI : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ), kind == '$' ? SVt_PV : kind == '@' ? SVt_PVAV : @@ -3085,6 +3084,7 @@ S_scan_const(pTHX_ char *start) * symbol meaning, e.g. \x{2E} would be confused with a dot. But * in spite of this, we do have to process \N here while the proper * charnames handler is in scope. See bugs #56444 and #62056. + * * There is a complication because \N in a pattern may also stand * for 'match a non-nl', and not mean a charname, in which case its * processing should be deferred to the regex compiler. To be a @@ -3189,9 +3189,13 @@ S_scan_const(pTHX_ char *start) SvPOK_on(sv); *d = '\0'; /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - UNISKIP(uv) + (STRLEN)(send - s) + 1); + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE + /* Above-latin1 in string + * implies no encoding */ + |SV_UTF8_NO_ENCODING, + UNISKIP(uv) + (STRLEN)(send - s) + 1); d = SvPVX(sv) + SvCUR(sv); has_utf8 = TRUE; } @@ -3219,31 +3223,44 @@ S_scan_const(pTHX_ char *start) continue; case 'N': - /* In a non-pattern \N must be a named character, like \N{LATIN - * SMALL LETTER A} or \N{U+0041}. For patterns, it also can - * mean to match a non-newline. For non-patterns, named - * characters are converted to their string equivalents. In - * patterns, named characters are not converted to their - * ultimate forms for the same reasons that other escapes - * aren't. Instead, they are converted to the \N{U+...} form - * to get the value from the charnames that is in effect right - * now, while preserving the fact that it was a named character - * so that the regex compiler knows this */ - - /* The structure of this section of code (besides checking for + /* In a non-pattern \N must be like \N{U+0041}, or it can be a + * named character, like \N{LATIN SMALL LETTER A}, or a named + * sequence, like \N{LATIN CAPITAL LETTER A WITH MACRON AND + * GRAVE}. For convenience all three forms are referred to as + * "named characters" below. + * + * For patterns, \N also can mean to match a non-newline. Code + * before this 'switch' statement should already have handled + * this situation, and hence this code only has to deal with + * the named character cases. + * + * For non-patterns, the named characters are converted to + * their string equivalents. In patterns, named characters are + * not converted to their ultimate forms for the same reasons + * that other escapes aren't. Instead, they are converted to + * the \N{U+...} form to get the value from the charnames that + * is in effect right now, while preserving the fact that it + * was a named character, so that the regex compiler knows + * this. + * + * The structure of this section of code (besides checking for * errors and upgrading to utf8) is: - * Further disambiguate between the two meanings of \N, and if - * not a charname, go process it elsewhere - * If of form \N{U+...}, pass it through if a pattern; - * otherwise convert to utf8 - * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a - * pattern; otherwise convert to utf8 */ - - /* Here, s points to the 'N'; the test below is guaranteed to - * succeed if we are being called on a pattern as we already - * know from a test above that the next character is a '{'. - * On a non-pattern \N must mean 'named sequence, which - * requires braces */ + * If the named character is of the form \N{U+...}, pass it + * through if a pattern; otherwise convert the code point + * to utf8 + * Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...} + * if a pattern; otherwise convert to utf8 + * + * If the regex compiler should ever need to differentiate + * between the \N{U+...} and \N{name} forms, that could easily + * be done here by stripping any leading zeros from the + * \N{U+...} case, and adding them to the other one. */ + + /* Here, 's' points to the 'N'; the test below is guaranteed to + * succeed if we are being called on a pattern, as we already + * know from a test above that the next character is a '{'. A + * non-pattern \N must mean 'named character', which requires + * braces */ s++; if (*s != '{') { yyerror("Missing braces on \\N{}"); @@ -3268,8 +3285,6 @@ S_scan_const(pTHX_ char *start) | PERL_SCAN_DISALLOW_PREFIX; STRLEN len; - /* For \N{U+...}, the '...' is a unicode value even on - * EBCDIC machines */ s += 2; /* Skip to next char after the 'U+' */ len = e - s; uv = grok_hex(s, &len, &flags, NULL); @@ -3280,27 +3295,26 @@ S_scan_const(pTHX_ char *start) } if (PL_lex_inpat) { - - /* On non-EBCDIC platforms, pass through to the regex - * compiler unchanged. The reason we evaluated the - * number above is to make sure there wasn't a syntax - * error. But on EBCDIC we convert to native so - * downstream code can continue to assume it's native - */ s -= 5; /* Include the '\N{U+' */ #ifdef EBCDIC - d += my_snprintf(d, e - s + 1 + 1, /* includes the } + /* On EBCDIC platforms, in \N{U+...}, the '...' is a + * Unicode value, so convert to native so downstream + * code can continue to assume it's native */ + d += my_snprintf(d, e - s + 1 + 1, /* includes the '}' and the \0 */ - "\\N{U+%X}", - (unsigned int) UNI_TO_NATIVE(uv)); + "\\N{U+%X}", + (unsigned int) UNI_TO_NATIVE(uv)); #else - Copy(s, d, e - s + 1, char); /* 1 = include the } */ + /* On non-EBCDIC platforms, pass it through unchanged. + * The reason we evaluated the number above is to make + * sure there wasn't a syntax error. */ + Copy(s, d, e - s + 1, char); /* +1 is for the '}' */ d += e - s + 1; #endif } else { /* Not a pattern: convert the hex to string */ - /* If destination is not in utf8, unconditionally + /* If the destination is not in utf8, unconditionally * recode it to be so. This is because \N{} implies * Unicode semantics, and scalars have to be in utf8 * to guarantee those semantics */ @@ -3353,13 +3367,18 @@ S_scan_const(pTHX_ char *start) * through the string. Each character takes up * 2 hex digits plus either a trailing dot or * the "}" */ + const char initial_text[] = "\\N{U+"; + const STRLEN initial_len = sizeof(initial_text) + - 1; d = off + SvGROW(sv, off + 3 * len - + 6 /* For the "\N{U+", and - trailing NUL */ + + /* +1 for trailing NUL */ + + initial_len + 1 + + (STRLEN)(send - e)); - Copy("\\N{U+", d, 5, char); - d += 5; + Copy(initial_text, d, initial_len, char); + d += initial_len; while (str < str_end) { char hex_string[4]; int len = @@ -3371,7 +3390,7 @@ S_scan_const(pTHX_ char *start) d += 3; str++; } - d--; /* We will overwrite below the final + d--; /* Below, we will overwrite the final dot with a right brace */ } else { @@ -3456,8 +3475,8 @@ S_scan_const(pTHX_ char *start) const STRLEN off = d - SvPVX_const(sv); d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1); } - if (! SvUTF8(res)) { /* Make sure is \N{} return is UTF-8 */ - sv_utf8_upgrade(res); + if (! SvUTF8(res)) { /* Make sure \N{} return is UTF-8 */ + sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING); str = SvPV_const(res, len); } Copy(str, d, len, char); @@ -3573,8 +3592,8 @@ S_scan_const(pTHX_ char *start) " >= %"UVuf, (UV)SvCUR(sv), (UV)SvLEN(sv)); SvPOK_on(sv); - if (PL_encoding && !has_utf8) { - sv_recode_to_utf8(sv, PL_encoding); + if (IN_ENCODING && !has_utf8) { + sv_recode_to_utf8(sv, _get_encoding()); if (SvUTF8(sv)) has_utf8 = TRUE; } @@ -4128,7 +4147,7 @@ S_find_in_my_stash(pTHX_ const char *pkgname, STRLEN len) if (gv && GvCV(gv)) { SV * const sv = cv_const_sv(GvCV(gv)); if (sv) - pkgname = SvPV_const(sv, len); + return gv_stashsv(sv, 0); } return gv_stashpvn(pkgname, len, UTF ? SVf_UTF8 : 0); @@ -5486,9 +5505,10 @@ Perl_yylex(pTHX) OPERATOR(HASHBRACK); } if (PL_expect == XREF && PL_oldoldbufptr != PL_last_lop) { - /* ${...} or @{...} etc., but not print {...} */ - PL_expect = XTERM; - break; + /* ${...} or @{...} etc., but not print {...} + * Skip the disambiguation and treat this as a block. + */ + goto block_expectation; } /* This hack serves to disambiguate a pair of curlies * as being a block or an anon hash. Normally, expectation @@ -5572,7 +5592,28 @@ Perl_yylex(pTHX) || (*t == '=' && t[1] == '>'))) OPERATOR(HASHBRACK); if (PL_expect == XREF) - PL_expect = XTERM; + { + block_expectation: + /* If there is an opening brace or 'sub:', treat it + as a term to make ${{...}}{k} and &{sub:attr...} + dwim. Otherwise, treat it as a statement, so + map {no strict; ...} works. + */ + s = skipspace(s); + if (*s == '{') { + PL_expect = XTERM; + break; + } + if (strnEQ(s, "sub", 3)) { + d = s + 3; + d = skipspace(d); + if (*d == ':') { + PL_expect = XTERM; + break; + } + } + PL_expect = XSTATE; + } else { PL_lex_brackstack[PL_lex_brackets-1] = XSTATE; PL_expect = XSTATE; @@ -5581,8 +5622,7 @@ Perl_yylex(pTHX) break; } pl_yylval.ival = CopLINE(PL_curcop); - if (isSPACE(*s) || *s == '#') - PL_copline = NOLINE; /* invalidate current command line number */ + PL_copline = NOLINE; /* invalidate current command line number */ TOKEN(formbrack ? '=' : '{'); case '}': if (PL_lex_brackets && PL_lex_brackstack[PL_lex_brackets-1] == XFAKEEOF) @@ -5797,7 +5837,7 @@ Perl_yylex(pTHX) if (PL_expect != XOPERATOR) { if (s[1] != '<' && !strchr(s,'>')) check_uni(); - if (s[1] == '<') + if (s[1] == '<' && s[2] != '>') s = scan_heredoc(s); else s = scan_inputsymbol(s); @@ -6352,7 +6392,7 @@ Perl_yylex(pTHX) char tmpbuf[sizeof PL_tokenbuf + 1]; *tmpbuf = '&'; Copy(PL_tokenbuf, tmpbuf+1, len, char); - off = pad_findmy_pvn(tmpbuf, len+1, UTF ? SVf_UTF8 : 0); + off = pad_findmy_pvn(tmpbuf, len+1, 0); if (off != NOT_IN_PAD) { assert(off); /* we assume this is boolean-true below */ if (PAD_COMPNAME_FLAGS_isOUR(off)) { @@ -6474,10 +6514,6 @@ Perl_yylex(pTHX) just_a_word: { int pkgname = 0; const char lastchar = (PL_bufptr == PL_oldoldbufptr ? 0 : PL_bufptr[-1]); - const char penultchar = - lastchar && PL_bufptr - 2 >= PL_linestart - ? PL_bufptr[-2] - : 0; bool safebw; @@ -6556,7 +6592,11 @@ Perl_yylex(pTHX) rv2cv_op = newCVREF(OPpMAY_RETURN_CONSTANT<<8, const_op); cv = lex - ? isGV(gv) ? GvCV(gv) : (CV *)gv + ? isGV(gv) + ? GvCV(gv) + : SvROK(gv) && SvTYPE(SvRV(gv)) == SVt_PVCV + ? (CV *)SvRV(gv) + : ((CV *)gv) : rv2cv_op_cv(rv2cv_op, RV2CVOPCV_RETURN_STUB); } @@ -6583,11 +6623,7 @@ Perl_yylex(pTHX) if ((isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s, lex ? NULL : sv, cv))) { - op_free(rv2cv_op); - if (tmp == METHOD && !PL_lex_allbrackets && - PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) - PL_lex_fakeeof = LEX_FAKEEOF_LOWLOGIC; - return REPORT(tmp); + goto method; } /* If not a declared subroutine, it's an indirect object. */ @@ -6615,13 +6651,17 @@ Perl_yylex(pTHX) if (*s == '=' && s[1] == '>' && !pkgname) { op_free(rv2cv_op); CLINE; - /* This is our own scalar, created a few lines above, - so this is safe. */ - SvREADONLY_off(cSVOPx(pl_yylval.opval)->op_sv); - sv_setpv(((SVOP*)pl_yylval.opval)->op_sv, PL_tokenbuf); - if (UTF && !IN_BYTES && is_utf8_string((U8*)PL_tokenbuf, len)) - SvUTF8_on(((SVOP*)pl_yylval.opval)->op_sv); - SvREADONLY_on(cSVOPx(pl_yylval.opval)->op_sv); + if (gvp || (lex && !off)) { + assert (cSVOPx(pl_yylval.opval)->op_sv == sv); + /* This is our own scalar, created a few lines + above, so this is safe. */ + SvREADONLY_off(sv); + sv_setpv(sv, PL_tokenbuf); + if (UTF && !IN_BYTES + && is_utf8_string((U8*)PL_tokenbuf, len)) + SvUTF8_on(sv); + SvREADONLY_on(sv); + } TERM(WORD); } @@ -6665,6 +6705,16 @@ Perl_yylex(pTHX) if (tmp == 1 && !orig_keyword && (isIDFIRST_lazy_if(s,UTF) || *s == '$') && (tmp = intuit_method(s, lex ? NULL : sv, cv))) { + method: + if (lex && !off) { + assert(cSVOPx(pl_yylval.opval)->op_sv == sv); + SvREADONLY_off(sv); + sv_setpvn(sv, PL_tokenbuf, len); + if (UTF && !IN_BYTES + && is_utf8_string((U8*)PL_tokenbuf, len)) + SvUTF8_on (sv); + else SvUTF8_off(sv); + } op_free(rv2cv_op); if (tmp == METHOD && !PL_lex_allbrackets && PL_lex_fakeeof > LEX_FAKEEOF_LOWLOGIC) @@ -6675,14 +6725,6 @@ Perl_yylex(pTHX) /* Not a method, so call it a subroutine (if defined) */ if (cv) { - OP *gvop; - if (lastchar == '-' && penultchar != '-') { - const STRLEN l = len ? len : strlen(PL_tokenbuf); - Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS), - "Ambiguous use of -%"UTF8f" resolved as -&%"UTF8f"()", - UTF8fARG(UTF, l, PL_tokenbuf), - UTF8fARG(UTF, l, PL_tokenbuf)); - } /* Check for a constant sub */ if ((sv = cv_const_sv_or_av(cv))) { its_constant: @@ -6700,20 +6742,6 @@ Perl_yylex(pTHX) TOKEN(WORD); } - /* Resolve to GV now if this is a placeholder. */ - if (!off && (gvop = cUNOPx(rv2cv_op)->op_first) - && gvop->op_type == OP_GV) { - GV *gv2 = cGVOPx_gv(gvop); - if (gv2 && !isGV(gv2)) { - gv = gv_fetchpv(PL_tokenbuf, 0, SVt_PVCV); - assert (SvTYPE(gv) == SVt_PVGV); - /* cv must have been some sort of placeholder, - so now needs replacing with a real code - reference. */ - cv = GvCV(gv); - } - } - op_free(pl_yylval.opval); pl_yylval.opval = off ? (OP *)newCVREF(0, rv2cv_op) : rv2cv_op; @@ -6896,13 +6924,13 @@ Perl_yylex(pTHX) if (!IN_BYTES) { if (UTF) PerlIO_apply_layers(aTHX_ PL_rsfp, NULL, ":utf8"); - else if (PL_encoding) { + else if (IN_ENCODING) { SV *name; dSP; ENTER; SAVETMPS; PUSHMARK(sp); - XPUSHs(PL_encoding); + XPUSHs(_get_encoding()); PUTBACK; call_method("name", G_SCALAR); SPAGAIN; @@ -6922,7 +6950,9 @@ Perl_yylex(pTHX) } case KEY___SUB__: - FUN0OP(newPVOP(OP_RUNCV,0,NULL)); + FUN0OP(CvCLONE(PL_compcv) + ? newOP(OP_RUNCV, 0) + : newPVOP(OP_RUNCV,0,NULL)); case KEY_AUTOLOAD: case KEY_DESTROY: @@ -7851,7 +7881,7 @@ Perl_yylex(pTHX) *PL_tokenbuf = '&'; if (memchr(tmpbuf, ':', len) || key != KEY_sub || pad_findmy_pvn( - PL_tokenbuf, len + 1, UTF ? SVf_UTF8 : 0 + PL_tokenbuf, len + 1, 0 ) != NOT_IN_PAD) sv_setpvn(PL_subname, tmpbuf, len); else { @@ -8127,10 +8157,13 @@ S_pending_ident(pTHX) } else { if (has_colon) { + /* "my" variable %s can't be in a package */ /* PL_no_myglob is constant */ GCC_DIAG_IGNORE(-Wformat-nonliteral); yyerror_pv(Perl_form(aTHX_ PL_no_myglob, - PL_in_my == KEY_my ? "my" : "state", PL_tokenbuf), + PL_in_my == KEY_my ? "my" : "state", + *PL_tokenbuf == '&' ? "subroutin" : "variabl", + PL_tokenbuf), UTF ? SVf_UTF8 : 0); GCC_DIAG_RESTORE; } @@ -8149,7 +8182,7 @@ S_pending_ident(pTHX) if (!has_colon) { if (!PL_in_my) tmp = pad_findmy_pvn(PL_tokenbuf, tokenbuf_len, - UTF ? SVf_UTF8 : 0); + 0); if (tmp != NOT_IN_PAD) { /* might be an "our" variable" */ if (PAD_COMPNAME_FLAGS_isOUR(tmp)) { @@ -8163,10 +8196,7 @@ S_pending_ident(pTHX) pl_yylval.opval->op_private = OPpCONST_ENTERED; if (pit != '&') gv_fetchsv(sym, - (PL_in_eval - ? (GV_ADDMULTI | GV_ADDINEVAL) - : GV_ADDMULTI - ), + GV_ADDMULTI, ((PL_tokenbuf[0] == '$') ? SVt_PV : (PL_tokenbuf[0] == '@') ? SVt_PVAV : SVt_PVHV)); @@ -8210,7 +8240,7 @@ S_pending_ident(pTHX) pl_yylval.opval->op_private = OPpCONST_ENTERED; if (pit != '&') gv_fetchpvn_flags(PL_tokenbuf+1, tokenbuf_len - 1, - (PL_in_eval ? (GV_ADDMULTI | GV_ADDINEVAL) : GV_ADD) + (PL_in_eval ? GV_ADDMULTI : GV_ADD) | ( UTF ? SVf_UTF8 : 0 ), ((PL_tokenbuf[0] == '$') ? SVt_PV : (PL_tokenbuf[0] == '@') ? SVt_PVAV @@ -8259,12 +8289,20 @@ S_checkcomma(pTHX_ const char *s, const char *name, const char *what) s++; if (*s == ',') { GV* gv; + PADOFFSET off; if (keyword(w, s - w, 0)) return; gv = gv_fetchpvn_flags(w, s - w, ( UTF ? SVf_UTF8 : 0 ), SVt_PVCV); if (gv && GvCVu(gv)) return; + if (s - w <= 254) { + char tmpbuf[256]; + Copy(w, tmpbuf+1, s - w, char); + *tmpbuf = '&'; + off = pad_findmy_pvn(tmpbuf, s-w+1, 0); + if (off != NOT_IN_PAD) return; + } Perl_croak(aTHX_ "No comma allowed after %s", what); } } @@ -8539,25 +8577,54 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) /* Is the byte 'd' a legal single character identifier name? 'u' is true * iff Unicode semantics are to be used. The legal ones are any of: - * a) ASCII digits - * b) ASCII punctuation + * a) all ASCII characters except: + * 1) space-type ones, like \t and SPACE; + 2) NUL; + * 3) '{' + * The final case currently doesn't get this far in the program, so we + * don't test for it. If that were to change, it would be ok to allow it. * c) When not under Unicode rules, any upper Latin1 character - * d) \c?, \c\, \c^, \c_, and \cA..\cZ, minus the ones that have traditionally - * been matched by \s on ASCII platforms. That is: \c?, plus 1-32, minus - * the \s ones. */ -#define VALID_LEN_ONE_IDENT(d, u) (isPUNCT_A((U8)(d)) \ - || isDIGIT_A((U8)(d)) \ - || (!(u) && !isASCII((U8)(d))) \ - || ((((U8)(d)) < 32) \ - && (((((U8)(d)) >= 14) \ - || (((U8)(d)) <= 8 && (d) != 0) \ - || (((U8)(d)) == 13)))) \ - || (((U8)(d)) == toCTRL('?'))) - if (s < PL_bufend - && (isIDFIRST_lazy_if(s, is_utf8) || VALID_LEN_ONE_IDENT(*s, is_utf8))) + * d) Otherwise, when unicode rules are used, all XIDS characters. + * + * Because all ASCII characters have the same representation whether + * encoded in UTF-8 or not, we can use the foo_A macros below and '\0' and + * '{' without knowing if is UTF-8 or not. + * EBCDIC already uses the rules that ASCII platforms will use after the + * deprecation cycle; see comment below about the deprecation. */ +#ifdef EBCDIC +# define VALID_LEN_ONE_IDENT(s, is_utf8) \ + (isGRAPH_A(*(s)) || ((is_utf8) \ + ? isIDFIRST_utf8((U8*) (s)) \ + : (isGRAPH_L1(*s) \ + && LIKELY((U8) *(s) != LATIN1_TO_NATIVE(0xAD))))) +#else +# define VALID_LEN_ONE_IDENT(s, is_utf8) (! isSPACE_A(*(s)) \ + && LIKELY(*(s) != '\0') \ + && (! is_utf8 \ + || isASCII_utf8((U8*) (s)) \ + || isIDFIRST_utf8((U8*) (s)))) +#endif + if ((s <= PL_bufend - (is_utf8) + ? UTF8SKIP(s) + : 1) + && VALID_LEN_ONE_IDENT(s, is_utf8)) { - if ( isCNTRL_A((U8)*s) ) { - deprecate("literal control characters in variable names"); + /* Deprecate all non-graphic characters. Include SHY as a non-graphic, + * because often it has no graphic representation. (We can't get to + * here with SHY when 'is_utf8' is true, so no need to include a UTF-8 + * test for it.) */ + if ((is_utf8) + ? ! isGRAPH_utf8( (U8*) s) + : (! isGRAPH_L1( (U8) *s) + || UNLIKELY((U8) *(s) == LATIN1_TO_NATIVE(0xAD)))) + { + /* Split messages for back compat */ + if (isCNTRL_A( (U8) *s)) { + deprecate("literal control characters in variable names"); + } + else { + deprecate("literal non-graphic characters in variable names"); + } } if (is_utf8) { @@ -8676,14 +8743,14 @@ S_scan_ident(pTHX_ char *s, char *dest, STRLEN destlen, I32 ck_uni) } static bool -S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset) { - - /* Adds, subtracts to/from 'pmfl' based on regex modifier flags found in - * the parse starting at 's', based on the subset that are valid in this - * context input to this routine in 'valid_flags'. Advances s. Returns - * TRUE if the input should be treated as a valid flag, so the next char - * may be as well; otherwise FALSE. 'charset' should point to a NUL upon - * first call on the current regex. This routine will set it to any +S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charset, unsigned int * x_mod_count) { + + /* Adds, subtracts to/from 'pmfl' based on the next regex modifier flag + * found in the parse starting at 's', based on the subset that are valid + * in this context input to this routine in 'valid_flags'. Advances s. + * Returns TRUE if the input should be treated as a valid flag, so the next + * char may be as well; otherwise FALSE. 'charset' should point to a NUL + * upon first call on the current regex. This routine will set it to any * charset modifier found. The caller shouldn't change it. This way, * another charset modifier encountered in the parse can be detected as an * error, as we have decided to allow only one */ @@ -8705,7 +8772,7 @@ S_pmflag(pTHX_ const char* const valid_flags, U32 * pmfl, char** s, char* charse switch (c) { - CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl); + CASE_STD_PMMOD_FLAGS_PARSE_SET(pmfl, *x_mod_count); case GLOBAL_PAT_MOD: *pmfl |= PMf_GLOBAL; break; case CONTINUE_PAT_MOD: *pmfl |= PMf_CONTINUE; break; case ONCE_PAT_MOD: *pmfl |= PMf_KEEP; break; @@ -8780,6 +8847,7 @@ S_scan_pat(pTHX_ char *start, I32 type) const char * const valid_flags = (const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS); char charset = '\0'; /* character set modifier */ + unsigned int x_mod_count = 0; PERL_ARGS_ASSERT_SCAN_PAT; @@ -8829,7 +8897,9 @@ S_scan_pat(pTHX_ char *start, I32 type) pm->op_pmflags |= PMf_IS_QR; } - while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), &s, &charset)) {}; + while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), + &s, &charset, &x_mod_count)) + {}; /* issue a warning if /c is specified,but /g is not */ if ((pm->op_pmflags & PMf_CONTINUE) && !(pm->op_pmflags & PMf_GLOBAL)) { @@ -8837,6 +8907,8 @@ S_scan_pat(pTHX_ char *start, I32 type) "Use of /c modifier is meaningless without /g" ); } + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + PL_lex_op = (OP*)pm; pl_yylval.ival = OP_MATCH; return s; @@ -8851,6 +8923,7 @@ S_scan_subst(pTHX_ char *start) line_t first_line; I32 es = 0; char charset = '\0'; /* character set modifier */ + unsigned int x_mod_count = 0; char *t; PERL_ARGS_ASSERT_SCAN_SUBST; @@ -8884,12 +8957,15 @@ S_scan_subst(pTHX_ char *start) s++; es++; } - else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags), &s, &charset)) + else if (! S_pmflag(aTHX_ S_PAT_MODS, &(pm->op_pmflags), + &s, &charset, &x_mod_count)) { break; } } + STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + if ((pm->op_pmflags & PMf_CONTINUE)) { Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), "Use of /c modifier is meaningless in s///" ); } @@ -9207,7 +9283,14 @@ S_scan_heredoc(pTHX_ char *s) origline + 1 + PL_parser->herelines); if (!lex_next_chunk(LEX_NO_TERM) && (!SvCUR(tmpstr) || SvEND(tmpstr)[-1] != '\n')) { - SvREFCNT_dec(linestr_save); + /* Simply freeing linestr_save might seem simpler here, as it + does not matter what PL_linestr points to, since we are + about to croak; but in a quote-like op, linestr_save + will have been prospectively freed already, via + SAVEFREESV(PL_linestr) in sublex_push, so it’s easier to + restore PL_linestr. */ + SvREFCNT_dec_NN(PL_linestr); + PL_linestr = linestr_save; goto interminable; } CopLINE_set(PL_curcop, origline); @@ -9236,7 +9319,8 @@ S_scan_heredoc(pTHX_ char *s) else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r') PL_bufend[-1] = '\n'; #endif - if (*s == term && memEQ(s,PL_tokenbuf + 1,len)) { + if (*s == term && PL_bufend-s >= len + && memEQ(s,PL_tokenbuf + 1,len)) { SvREFCNT_dec(PL_linestr); PL_linestr = linestr_save; PL_linestart = SvPVX(linestr_save); @@ -9256,8 +9340,8 @@ S_scan_heredoc(pTHX_ char *s) if (!IN_BYTES) { if (UTF && is_utf8_string((U8*)SvPVX_const(tmpstr), SvCUR(tmpstr))) SvUTF8_on(tmpstr); - else if (PL_encoding) - sv_recode_to_utf8(tmpstr, PL_encoding); + else if (IN_ENCODING) + sv_recode_to_utf8(tmpstr, _get_encoding()); } PL_lex_stuff = tmpstr; pl_yylval.ival = op_type; @@ -9277,6 +9361,7 @@ S_scan_heredoc(pTHX_ char *s) This code handles: <> read from ARGV + <<>> read from ARGV without magic open read from filehandle read from package qualified filehandle read from package qualified filehandle @@ -9291,6 +9376,7 @@ S_scan_inputsymbol(pTHX_ char *start) char *s = start; /* current position in buffer */ char *end; I32 len; + bool nomagicopen = FALSE; char *d = PL_tokenbuf; /* start of temp holding space */ const char * const e = PL_tokenbuf + sizeof PL_tokenbuf; /* end of temp holding space */ @@ -9299,7 +9385,14 @@ S_scan_inputsymbol(pTHX_ char *start) end = strchr(s, '\n'); if (!end) end = PL_bufend; - s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */ + if (s[1] == '<' && s[2] == '>' && s[3] == '>') { + nomagicopen = TRUE; + *d = '\0'; + len = 0; + s += 3; + } + else + s = delimcpy(d, e, s + 1, end, '>', &len); /* extract until > */ /* die if we didn't have space for the contents of the <>, or if it didn't end, or if we see a newline @@ -9359,7 +9452,7 @@ S_scan_inputsymbol(pTHX_ char *start) /* try to find it in the pad for this block, otherwise find add symbol table ops */ - const PADOFFSET tmp = pad_findmy_pvn(d, len, UTF ? SVf_UTF8 : 0); + const PADOFFSET tmp = pad_findmy_pvn(d, len, 0); if (tmp != NOT_IN_PAD) { if (PAD_COMPNAME_FLAGS_isOUR(tmp)) { HV * const stash = PAD_COMPNAME_OURSTASH(tmp); @@ -9385,9 +9478,7 @@ S_scan_inputsymbol(pTHX_ char *start) ++d; intro_sym: gv = gv_fetchpv(d, - (PL_in_eval - ? (GV_ADDMULTI | GV_ADDINEVAL) - : GV_ADDMULTI) | ( UTF ? SVf_UTF8 : 0 ), + GV_ADDMULTI | ( UTF ? SVf_UTF8 : 0 ), SVt_PV); PL_lex_op = readline_overriden ? (OP*)newUNOP(OP_ENTERSUB, OPf_STACKED, @@ -9411,7 +9502,7 @@ intro_sym: op_append_elem(OP_LIST, newGVOP(OP_GV, 0, gv), newCVREF(0, newGVOP(OP_GV, 0, gv_readline)))) - : (OP*)newUNOP(OP_READLINE, 0, newGVOP(OP_GV, 0, gv)); + : (OP*)newUNOP(OP_READLINE, nomagicopen ? OPf_SPECIAL : 0, newGVOP(OP_GV, 0, gv)); pl_yylval.ival = OP_NULL; } } @@ -9540,12 +9631,12 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re sv_catpvn(sv, s, termlen); s += termlen; for (;;) { - if (PL_encoding && !UTF && !re_reparse) { + if (IN_ENCODING && !UTF && !re_reparse) { bool cont = TRUE; while (cont) { int offset = s - SvPVX_const(PL_linestr); - const bool found = sv_cat_decode(sv, PL_encoding, PL_linestr, + const bool found = sv_cat_decode(sv, _get_encoding(), PL_linestr, &offset, (char*)termstr, termlen); const char *ns; char *svlast; @@ -9758,13 +9849,13 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re /* at this point, we have successfully read the delimited string */ - if (!PL_encoding || UTF || re_reparse) { + if (!IN_ENCODING || UTF || re_reparse) { if (keep_delims) sv_catpvn(sv, s, termlen); s += termlen; } - if (has_utf8 || (PL_encoding && !re_reparse)) + if (has_utf8 || (IN_ENCODING && !re_reparse)) SvUTF8_on(sv); PL_multi_end = CopLINE(PL_curcop); @@ -10434,8 +10525,8 @@ S_scan_formline(pTHX_ char *s) if (!IN_BYTES) { if (UTF && is_utf8_string((U8*)SvPVX_const(stuff), SvCUR(stuff))) SvUTF8_on(stuff); - else if (PL_encoding) - sv_recode_to_utf8(stuff, PL_encoding); + else if (IN_ENCODING) + sv_recode_to_utf8(stuff, _get_encoding()); } NEXTVAL_NEXTTOKE.opval = (OP*)newSVOP(OP_CONST, 0, stuff); force_next(THING); @@ -10462,12 +10553,11 @@ Perl_start_subparse(pTHX_ I32 is_format, U32 flags) CvFLAGS(PL_compcv) |= flags; PL_subline = CopLINE(PL_curcop); - CvPADLIST(PL_compcv) = pad_new(padnew_SAVE|padnew_SAVESUB); + CvPADLIST_set(PL_compcv, pad_new(padnew_SAVE|padnew_SAVESUB)); CvOUTSIDE(PL_compcv) = MUTABLE_CV(SvREFCNT_inc_simple(outsidecv)); CvOUTSIDE_SEQ(PL_compcv) = PL_cop_seqmax; if (outsidecv && CvPADLIST(outsidecv)) - CvPADLIST(PL_compcv)->xpadl_outid = - PadlistNAMES(CvPADLIST(outsidecv)); + CvPADLIST(PL_compcv)->xpadl_outid = CvPADLIST(outsidecv)->xpadl_id; return oldsavestack_ix; } @@ -10479,7 +10569,6 @@ S_yywarn(pTHX_ const char *const s, U32 flags) PL_in_eval |= EVAL_WARNONLY; yyerror_pv(s, flags); - PL_in_eval &= ~EVAL_WARNONLY; return 0; } @@ -10583,6 +10672,7 @@ Perl_yyerror_pvn(pTHX_ const char *const s, STRLEN len, U32 flags) PL_multi_end = 0; } if (PL_in_eval & EVAL_WARNONLY) { + PL_in_eval &= ~EVAL_WARNONLY; Perl_ck_warner_d(aTHX_ packWARN(WARN_SYNTAX), "%"SVf, SVfARG(msg)); } else @@ -11373,7 +11463,6 @@ S_parse_opt_lexvar(pTHX) PL_bufptr = s; if (d == PL_tokenbuf+1) return NULL; - *d = 0; var = newOP(sigil == '$' ? OP_PADSV : sigil == '@' ? OP_PADAV : OP_PADHV, OPf_MOD | (OPpLVAL_INTRO<<8)); var->op_targ = allocmy(PL_tokenbuf, d - PL_tokenbuf, UTF ? SVf_UTF8 : 0); @@ -11454,10 +11543,16 @@ Perl_parse_subsignature(pTHX) scalar(newUNOP(OP_RV2AV, 0, newGVOP(OP_GV, 0, PL_defgv))), newSVOP(OP_CONST, 0, newSViv(1))), - newLISTOP(OP_DIE, 0, newOP(OP_PUSHMARK, 0), - newSVOP(OP_CONST, 0, - newSVpvs("Odd name/value argument " - "for subroutine")))); + op_convert_list(OP_DIE, 0, + op_convert_list(OP_SPRINTF, 0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, + newSVpvs("Odd name/value argument for subroutine at %s line %d.\n")), + newSLICEOP(0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, newSViv(1)), + newSVOP(OP_CONST, 0, newSViv(2))), + newOP(OP_CALLER, 0)))))); if (pos != min_arity) chkop = newLOGOP(OP_AND, 0, newBINOP(OP_GT, 0, @@ -11520,9 +11615,16 @@ Perl_parse_subsignature(pTHX) scalar(newUNOP(OP_RV2AV, 0, newGVOP(OP_GV, 0, PL_defgv))), newSVOP(OP_CONST, 0, newSViv(min_arity))), - newLISTOP(OP_DIE, 0, newOP(OP_PUSHMARK, 0), - newSVOP(OP_CONST, 0, - newSVpvs("Too few arguments for subroutine"))))), + op_convert_list(OP_DIE, 0, + op_convert_list(OP_SPRINTF, 0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, + newSVpvs("Too few arguments for subroutine at %s line %d.\n")), + newSLICEOP(0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, newSViv(1)), + newSVOP(OP_CONST, 0, newSViv(2))), + newOP(OP_CALLER, 0))))))), initops); } if (max_arity != -1) { @@ -11533,9 +11635,16 @@ Perl_parse_subsignature(pTHX) scalar(newUNOP(OP_RV2AV, 0, newGVOP(OP_GV, 0, PL_defgv))), newSVOP(OP_CONST, 0, newSViv(max_arity))), - newLISTOP(OP_DIE, 0, newOP(OP_PUSHMARK, 0), - newSVOP(OP_CONST, 0, - newSVpvs("Too many arguments for subroutine"))))), + op_convert_list(OP_DIE, 0, + op_convert_list(OP_SPRINTF, 0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, + newSVpvs("Too many arguments for subroutine at %s line %d.\n")), + newSLICEOP(0, + op_append_list(OP_LIST, + newSVOP(OP_CONST, 0, newSViv(1)), + newSVOP(OP_CONST, 0, newSViv(2))), + newOP(OP_CALLER, 0))))))), initops); } return initops;