X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/580561a3a5dd3bfef87627781ac362004e3d87b5..6f0017eefb47e6ee83d853f73e7f91de5b7bed3a:/toke.c?ds=sidebyside diff --git a/toke.c b/toke.c index a4e9471..7167004 100644 --- a/toke.c +++ b/toke.c @@ -583,7 +583,7 @@ S_missingterm(pTHX_ char *s) ((0 != (PL_hints & HINT_LOCALIZE_HH)) \ && S_feature_is_enabled(aTHX_ STR_WITH_LEN(name))) /* The longest string we pass in. */ -#define MAX_FEATURE_LEN (sizeof("switch")-1) +#define MAX_FEATURE_LEN (sizeof("unicode_strings")-1) /* * S_feature_is_enabled @@ -938,6 +938,7 @@ function is more convenient. void Perl_lex_stuff_pvn(pTHX_ char *pv, STRLEN len, U32 flags) { + dVAR; char *bufptr; PERL_ARGS_ASSERT_LEX_STUFF_PVN; if (flags & ~(LEX_STUFF_UTF8)) @@ -1197,12 +1198,10 @@ Perl_lex_next_chunk(pTHX_ U32 flags) STRLEN old_bufend_pos, new_bufend_pos; STRLEN bufptr_pos, oldbufptr_pos, oldoldbufptr_pos; STRLEN linestart_pos, last_uni_pos, last_lop_pos; + bool got_some_for_debugger = 0; bool got_some; if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF)) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk"); -#ifdef PERL_MAD - flags |= LEX_KEEP_PREVIOUS; -#endif /* PERL_MAD */ linestr = PL_parser->linestr; buf = SvPVX(linestr); if (!(flags & LEX_KEEP_PREVIOUS) && @@ -1231,6 +1230,7 @@ Perl_lex_next_chunk(pTHX_ U32 flags) got_some = 0; } else if (filter_gets(linestr, old_bufend_pos)) { got_some = 1; + got_some_for_debugger = 1; } else { if (!SvPOK(linestr)) /* can get undefined by filter_gets */ sv_setpvs(linestr, ""); @@ -1270,7 +1270,7 @@ Perl_lex_next_chunk(pTHX_ U32 flags) PL_parser->last_uni = buf + last_uni_pos; if (PL_parser->last_lop) PL_parser->last_lop = buf + last_lop_pos; - if (got_some && (PERLDB_LINE || PERLDB_SAVESRC) && + if (got_some_for_debugger && (PERLDB_LINE || PERLDB_SAVESRC) && PL_curstash != PL_debstash) { /* debugger active and we're not compiling the debugger code, * so store the line into the debugger's array of lines @@ -1303,6 +1303,7 @@ is encountered, an exception is generated. I32 Perl_lex_peek_unichar(pTHX_ U32 flags) { + dVAR; char *s, *bufend; if (flags & ~(LEX_KEEP_PREVIOUS)) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_peek_unichar"); @@ -1402,12 +1403,14 @@ chunk will not be discarded. =cut */ +#define LEX_NO_NEXT_CHUNK 0x80000000 + void Perl_lex_read_space(pTHX_ U32 flags) { char *s, *bufend; bool need_incline = 0; - if (flags & ~(LEX_KEEP_PREVIOUS)) + if (flags & ~(LEX_KEEP_PREVIOUS|LEX_NO_NEXT_CHUNK)) Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_read_space"); #ifdef PERL_MAD if (PL_skipwhite) { @@ -1440,6 +1443,8 @@ Perl_lex_read_space(pTHX_ U32 flags) if (PL_madskills) sv_catpvn(PL_skipwhite, PL_parser->bufptr, s-PL_parser->bufptr); #endif /* PERL_MAD */ + if (flags & LEX_NO_NEXT_CHUNK) + break; PL_parser->bufptr = s; CopLINE_inc(PL_curcop); got_more = lex_next_chunk(flags); @@ -1715,20 +1720,12 @@ S_skipspace(pTHX_ register char *s) if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) { while (s < PL_bufend && SPACE_OR_TAB(*s)) s++; - } else if (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE) { - while (isSPACE(*s) && *s != '\n') - s++; - if (*s == '#') { - do { - s++; - } while (s != PL_bufend && *s != '\n'); - } - if (*s == '\n') - s++; } else { STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr); PL_bufptr = s; - lex_read_space(LEX_KEEP_PREVIOUS); + lex_read_space(LEX_KEEP_PREVIOUS | + (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ? + LEX_NO_NEXT_CHUNK : 0)); s = PL_bufptr; PL_bufptr = SvPVX(PL_linestr) + bufptr_pos; if (PL_linestart > PL_bufptr) @@ -2098,7 +2095,13 @@ S_force_version(pTHX_ char *s, int guessing) #endif if (*d == ';' || isSPACE(*d) || *d == '}' || !*d) { SV *ver; +#ifdef USE_LOCALE_NUMERIC + char *loc = setlocale(LC_NUMERIC, "C"); +#endif s = scan_num(s, &pl_yylval); +#ifdef USE_LOCALE_NUMERIC + setlocale(LC_NUMERIC, loc); +#endif version = pl_yylval.opval; ver = cSVOPx(version)->op_sv; if (SvPOK(ver) && !SvNIOK(ver)) { @@ -2135,6 +2138,53 @@ S_force_version(pTHX_ char *s, int guessing) } /* + * S_force_strict_version + * Forces the next token to be a version number using strict syntax rules. + */ + +STATIC char * +S_force_strict_version(pTHX_ char *s) +{ + dVAR; + OP *version = NULL; +#ifdef PERL_MAD + I32 startoff = s - SvPVX(PL_linestr); +#endif + const char *errstr = NULL; + + PERL_ARGS_ASSERT_FORCE_STRICT_VERSION; + + while (isSPACE(*s)) /* leading whitespace */ + s++; + + if (is_STRICT_VERSION(s,&errstr)) { + SV *ver = newSV(0); + s = (char *)scan_version(s, ver, 0); + version = newSVOP(OP_CONST, 0, ver); + } + else if ( (*s != ';' && *s != '}' ) && (s = SKIPSPACE1(s), (*s != ';' && *s !='}' ))) { + PL_bufptr = s; + if (errstr) + yyerror(errstr); /* version required */ + return s; + } + +#ifdef PERL_MAD + if (PL_madskills && !version) { + sv_free(PL_nextwhite); /* let next token collect whitespace */ + PL_nextwhite = 0; + s = SvPVX(PL_linestr) + startoff; + } +#endif + /* NOTE: The parser sees the package name and the VERSION swapped */ + start_force(PL_curforce); + NEXTVAL_NEXTTOKE.opval = version; + force_next(WORD); + + return s; +} + +/* * S_tokeq * Tokenize a quoted string passed in as an SV. It finds the next * chunk, up to end of string or a backslash. It may make a new @@ -2423,10 +2473,7 @@ S_sublex_done(pTHX) In patterns: backslashes: - double-quoted style: \r and \n - regexp special ones: \D \s - constants: \x31 - backrefs: \1 + constants: \N{NAME} only case and quoting: \U \Q \E stops on @ and $, but not for $ as tail anchor @@ -2440,7 +2487,7 @@ S_sublex_done(pTHX) In double-quoted strings: backslashes: double-quoted style: \r and \n - constants: \x31 + constants: \x31, etc. deprecated backrefs: \1 (in substitution replacements) case and quoting: \U \Q \E stops on @ and $ @@ -2468,14 +2515,14 @@ S_sublex_done(pTHX) check for embedded arrays check for embedded scalars if (backslash) { - leave intact backslashes from leaveit (below) deprecate \1 in substitution replacements handle string-changing backslashes \l \U \Q \E, etc. switch (what was escaped) { handle \- in a transliteration (becomes a literal -) + if a pattern and not \N{, go treat as regular character handle \132 (octal characters) handle \x15 and \x{1234} (hex characters) - handle \N{name} (named characters) + handle \N{name} (named characters, also \N{3,5} in a pattern) handle \cV (control characters) handle printf-style backslashes (\f, \r, \n, etc) } (end switch) @@ -2533,6 +2580,7 @@ S_scan_const(pTHX_ char *start) while (s < send || dorange) { + /* get transliterations out of the way (they're most literal) */ if (PL_lex_inwhat == OP_TRANS) { /* expand a range A-Z to the full set of characters. AIE! */ @@ -2752,6 +2800,8 @@ S_scan_const(pTHX_ char *start) /* backslashes */ if (*s == '\\' && s+1 < send) { + char* e; /* Can be used for ending '}', etc. */ + s++; /* deprecate \1 in strings and substitution replacements */ @@ -2768,13 +2818,28 @@ S_scan_const(pTHX_ char *start) --s; break; } - /* skip any other backslash escapes in a pattern */ - else if (PL_lex_inpat) { + /* In a pattern, process \N, but skip any other backslash escapes. + * This is because we don't want to translate an escape sequence + * into a meta symbol and have the regex compiler use the meta + * symbol meaning, e.g. \x{2E} would be confused with a dot. But + * in spite of this, we do have to process \N here while the proper + * charnames handler is in scope. See bugs #56444 and #62056. + * There is a complication because \N in a pattern may also stand + * for 'match a non-nl', and not mean a charname, in which case its + * processing should be deferred to the regex compiler. To be a + * charname it must be followed immediately by a '{', and not look + * like \N followed by a curly quantifier, i.e., not something like + * \N{3,}. regcurly returns a boolean indicating if it is a legal + * quantifier */ + else if (PL_lex_inpat + && (*s != 'N' + || s[1] != '{' + || regcurly(s + 1))) + { *d++ = NATIVE_TO_NEED(has_utf8,'\\'); goto default_action; } - /* if we get here, it's either a quoted -, or a digit */ switch (*s) { /* quoted - in transliterations */ @@ -2833,15 +2898,13 @@ S_scan_const(pTHX_ char *start) } NUM_ESCAPE_INSERT: - /* Insert oct, hex, or \N{U+...} escaped character. There will - * always be enough room in sv since such escapes will be - * longer than any UTF-8 sequence they can end up as, except if - * they force us to recode the rest of the string into utf8 */ + /* Insert oct or hex escaped character. There will always be + * enough room in sv since such escapes will be longer than any + * UTF-8 sequence they can end up as, except if they force us + * to recode the rest of the string into utf8 */ /* Here uv is the ordinal of the next character being added in - * unicode (converted from native). (It has to be done before - * here because \N is interpreted as unicode, and oct and hex - * as native.) */ + * unicode (converted from native). */ if (!UNI_IS_INVARIANT(uv)) { if (!has_utf8 && uv > 255) { /* Might need to recode whatever we have accumulated so @@ -2881,92 +2944,337 @@ S_scan_const(pTHX_ char *start) } continue; - /* \N{LATIN SMALL LETTER A} is a named character, and so is - * \N{U+0041} */ case 'N': - ++s; - if (*s == '{') { - char* e = strchr(s, '}'); - SV *res; - STRLEN len; - const char *str; - - if (!e) { + /* In a non-pattern \N must be a named character, like \N{LATIN + * SMALL LETTER A} or \N{U+0041}. For patterns, it also can + * mean to match a non-newline. For non-patterns, named + * characters are converted to their string equivalents. In + * patterns, named characters are not converted to their + * ultimate forms for the same reasons that other escapes + * aren't. Instead, they are converted to the \N{U+...} form + * to get the value from the charnames that is in effect right + * now, while preserving the fact that it was a named character + * so that the regex compiler knows this */ + + /* This section of code doesn't generally use the + * NATIVE_TO_NEED() macro to transform the input. I (khw) did + * a close examination of this macro and determined it is a + * no-op except on utfebcdic variant characters. Every + * character generated by this that would normally need to be + * enclosed by this macro is invariant, so the macro is not + * needed, and would complicate use of copy(). There are other + * parts of this file where the macro is used inconsistently, + * but are saved by it being a no-op */ + + /* The structure of this section of code (besides checking for + * errors and upgrading to utf8) is: + * Further disambiguate between the two meanings of \N, and if + * not a charname, go process it elsewhere + * If of form \N{U+...}, pass it through if a pattern; + * otherwise convert to utf8 + * Otherwise must be \N{NAME}: convert to \N{U+c1.c2...} if a + * pattern; otherwise convert to utf8 */ + + /* Here, s points to the 'N'; the test below is guaranteed to + * succeed if we are being called on a pattern as we already + * know from a test above that the next character is a '{'. + * On a non-pattern \N must mean 'named sequence, which + * requires braces */ + s++; + if (*s != '{') { + yyerror("Missing braces on \\N{}"); + continue; + } + s++; + + /* If there is no matching '}', it is an error. */ + if (! (e = strchr(s, '}'))) { + if (! PL_lex_inpat) { yyerror("Missing right brace on \\N{}"); - e = s - 1; - goto cont_scan; - } - if (e > s + 2 && s[1] == 'U' && s[2] == '+') { - /* \N{U+...} The ... is a unicode value even on EBCDIC - * machines */ - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES | - PERL_SCAN_DISALLOW_PREFIX; - s += 3; - len = e - s; - uv = grok_hex(s, &len, &flags, NULL); - if ( e > s && len != (STRLEN)(e - s) ) { - uv = 0xFFFD; - } - s = e + 1; - goto NUM_ESCAPE_INSERT; + } else { + yyerror("Missing right brace on \\N{} or unescaped left brace after \\N."); } - res = newSVpvn(s + 1, e - s - 1); - res = new_constant( NULL, 0, "charnames", - res, NULL, s - 2, e - s + 3 ); - if (has_utf8) - sv_utf8_upgrade(res); - str = SvPV_const(res,len); -#ifdef EBCDIC_NEVER_MIND - /* charnames uses pack U and that has been - * recently changed to do the below uni->native - * mapping, so this would be redundant (and wrong, - * the code point would be doubly converted). - * But leave this in just in case the pack U change - * gets revoked, but the semantics is still - * desireable for charnames. --jhi */ - { - UV uv = utf8_to_uvchr((const U8*)str, 0); + continue; + } - if (uv < 0x100) { - U8 tmpbuf[UTF8_MAXBYTES+1], *d; + /* Here it looks like a named character */ - d = uvchr_to_utf8(tmpbuf, UNI_TO_NATIVE(uv)); - sv_setpvn(res, (char *)tmpbuf, d - tmpbuf); - str = SvPV_const(res, len); - } - } -#endif - /* If destination is not in utf8 but this new character is, - * recode the dest to utf8 */ - if (!has_utf8 && SvUTF8(res)) { + if (PL_lex_inpat) { + + /* XXX This block is temporary code. \N{} implies that the + * pattern is to have Unicode semantics, and therefore + * currently has to be encoded in utf8. By putting it in + * utf8 now, we save a whole pass in the regular expression + * compiler. Once that code is changed so Unicode + * semantics doesn't necessarily have to be in utf8, this + * block should be removed */ + if (!has_utf8) { SvCUR_set(sv, d - SvPVX_const(sv)); SvPOK_on(sv); *d = '\0'; /* See Note on sizing above. */ sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - len + (STRLEN)(send - s) + 1); + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + /* 5 = '\N{' + cur char + NUL */ + (STRLEN)(send - s) + 5); d = SvPVX(sv) + SvCUR(sv); has_utf8 = TRUE; - } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */ + } + } - /* See Note on sizing above. (NOTE: SvCUR() is not set - * correctly here). */ - const STRLEN off = d - SvPVX_const(sv); - d = SvGROW(sv, off + len + (STRLEN)(send - s) + 1) + off; + if (*s == 'U' && s[1] == '+') { /* \N{U+...} */ + I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + | PERL_SCAN_DISALLOW_PREFIX; + STRLEN len; + + /* For \N{U+...}, the '...' is a unicode value even on + * EBCDIC machines */ + s += 2; /* Skip to next char after the 'U+' */ + len = e - s; + uv = grok_hex(s, &len, &flags, NULL); + if (len == 0 || len != (STRLEN)(e - s)) { + yyerror("Invalid hexadecimal number in \\N{U+...}"); + s = e + 1; + continue; + } + + if (PL_lex_inpat) { + + /* Pass through to the regex compiler unchanged. The + * reason we evaluated the number above is to make sure + * there wasn't a syntax error. */ + s -= 5; /* Include the '\N{U+' */ + Copy(s, d, e - s + 1, char); /* 1 = include the } */ + d += e - s + 1; + } + else { /* Not a pattern: convert the hex to string */ + + /* If destination is not in utf8, unconditionally + * recode it to be so. This is because \N{} implies + * Unicode semantics, and scalars have to be in utf8 + * to guarantee those semantics */ + if (! has_utf8) { + SvCUR_set(sv, d - SvPVX_const(sv)); + SvPOK_on(sv); + *d = '\0'; + /* See Note on sizing above. */ + sv_utf8_upgrade_flags_grow( + sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + UNISKIP(uv) + (STRLEN)(send - e) + 1); + d = SvPVX(sv) + SvCUR(sv); + has_utf8 = TRUE; + } + + /* Add the string to the output */ + if (UNI_IS_INVARIANT(uv)) { + *d++ = (char) uv; + } + else d = (char*)uvuni_to_utf8((U8*)d, uv); + } + } + else { /* Here is \N{NAME} but not \N{U+...}. */ + + SV *res; /* result from charnames */ + const char *str; /* the string in 'res' */ + STRLEN len; /* its length */ + + /* Get the value for NAME */ + res = newSVpvn(s, e - s); + res = new_constant( NULL, 0, "charnames", + /* includes all of: \N{...} */ + res, NULL, s - 3, e - s + 4 ); + + /* Most likely res will be in utf8 already since the + * standard charnames uses pack U, but a custom translator + * can leave it otherwise, so make sure. XXX This can be + * revisited to not have charnames use utf8 for characters + * that don't need it when regexes don't have to be in utf8 + * for Unicode semantics. If doing so, remember EBCDIC */ + sv_utf8_upgrade(res); + str = SvPV_const(res, len); + + /* Don't accept malformed input */ + if (! is_utf8_string((U8 *) str, len)) { + yyerror("Malformed UTF-8 returned by \\N"); + } + else if (PL_lex_inpat) { + + if (! len) { /* The name resolved to an empty string */ + Copy("\\N{}", d, 4, char); + d += 4; + } + else { + /* In order to not lose information for the regex + * compiler, pass the result in the specially made + * syntax: \N{U+c1.c2.c3...}, where c1 etc. are + * the code points in hex of each character + * returned by charnames */ + + const char *str_end = str + len; + STRLEN char_length; /* cur char's byte length */ + STRLEN output_length; /* and the number of bytes + after this is translated + into hex digits */ + const STRLEN off = d - SvPVX_const(sv); + + /* 2 hex per byte; 2 chars for '\N'; 2 chars for + * max('U+', '.'); and 1 for NUL */ + char hex_string[2 * UTF8_MAXBYTES + 5]; + + /* Get the first character of the result. */ + U32 uv = utf8n_to_uvuni((U8 *) str, + len, + &char_length, + UTF8_ALLOW_ANYUV); + + /* The call to is_utf8_string() above hopefully + * guarantees that there won't be an error. But + * it's easy here to make sure. The function just + * above warns and returns 0 if invalid utf8, but + * it can also return 0 if the input is validly a + * NUL. Disambiguate */ + if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { + uv = UNICODE_REPLACEMENT; + } + + /* Convert first code point to hex, including the + * boiler plate before it */ + sprintf(hex_string, "\\N{U+%X", (unsigned int) uv); + output_length = strlen(hex_string); + + /* Make sure there is enough space to hold it */ + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + /* And output it */ + Copy(hex_string, d, output_length, char); + d += output_length; + + /* For each subsequent character, append dot and + * its ordinal in hex */ + while ((str += char_length) < str_end) { + const STRLEN off = d - SvPVX_const(sv); + U32 uv = utf8n_to_uvuni((U8 *) str, + str_end - str, + &char_length, + UTF8_ALLOW_ANYUV); + if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { + uv = UNICODE_REPLACEMENT; + } + + sprintf(hex_string, ".%X", (unsigned int) uv); + output_length = strlen(hex_string); + + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + Copy(hex_string, d, output_length, char); + d += output_length; + } + + *d++ = '}'; /* Done. Add the trailing brace */ + } + } + else { /* Here, not in a pattern. Convert the name to a + * string. */ + + /* If destination is not in utf8, unconditionally + * recode it to be so. This is because \N{} implies + * Unicode semantics, and scalars have to be in utf8 + * to guarantee those semantics */ + if (! has_utf8) { + SvCUR_set(sv, d - SvPVX_const(sv)); + SvPOK_on(sv); + *d = '\0'; + /* See Note on sizing above. */ + sv_utf8_upgrade_flags_grow(sv, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + len + (STRLEN)(send - s) + 1); + d = SvPVX(sv) + SvCUR(sv); + has_utf8 = TRUE; + } else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */ + + /* See Note on sizing above. (NOTE: SvCUR() is not + * set correctly here). */ + const STRLEN off = d - SvPVX_const(sv); + d = off + SvGROW(sv, off + len + (STRLEN)(send - s) + 1); + } + Copy(str, d, len, char); + d += len; } + SvREFCNT_dec(res); + + /* Deprecate non-approved name syntax */ + if (ckWARN_d(WARN_DEPRECATED)) { + bool problematic = FALSE; + char* i = s; + + /* For non-ut8 input, look to see that the first + * character is an alpha, then loop through the rest + * checking that each is a continuation */ + if (! this_utf8) { + if (! isALPHAU(*i)) problematic = TRUE; + else for (i = s + 1; i < e; i++) { + if (isCHARNAME_CONT(*i)) continue; + problematic = TRUE; + break; + } + } + else { + /* Similarly for utf8. For invariants can check + * directly. We accept anything above the latin1 + * range because it is immaterial to Perl if it is + * correct or not, and is expensive to check. But + * it is fairly easy in the latin1 range to convert + * the variants into a single character and check + * those */ + if (UTF8_IS_INVARIANT(*i)) { + if (! isALPHAU(*i)) problematic = TRUE; + } else if (UTF8_IS_DOWNGRADEABLE_START(*i)) { + if (! isALPHAU(UNI_TO_NATIVE(UTF8_ACCUMULATE(*i, + *(i+1))))) + { + problematic = TRUE; + } + } + if (! problematic) for (i = s + UTF8SKIP(s); + i < e; + i+= UTF8SKIP(i)) + { + if (UTF8_IS_INVARIANT(*i)) { + if (isCHARNAME_CONT(*i)) continue; + } else if (! UTF8_IS_DOWNGRADEABLE_START(*i)) { + continue; + } else if (isCHARNAME_CONT( + UNI_TO_NATIVE( + UTF8_ACCUMULATE(*i, *(i+1))))) + { + continue; + } + problematic = TRUE; + break; + } + } + if (problematic) { + char *string; + Newx(string, e - i + 1, char); + Copy(i, string, e - i, char); + string[e - i] = '\0'; + Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), + "Deprecated character(s) in \\N{...} starting at '%s'", + string); + Safefree(string); + } + } + } /* End \N{NAME} */ #ifdef EBCDIC - if (!dorange) - native_range = FALSE; /* \N{} is guessed to be Unicode */ + if (!dorange) + native_range = FALSE; /* \N{} is defined to be Unicode */ #endif - Copy(str, d, len, char); - d += len; - SvREFCNT_dec(res); - cont_scan: - s = e + 1; - } - else - yyerror("Missing braces on \\N{}"); + s = e + 1; /* Point to just after the '}' */ continue; /* \c is a control character */ @@ -3808,7 +4116,8 @@ S_tokenize_use(pTHX_ int is_use, char *s) { s = SKIPSPACE1(s); if (isDIGIT(*s) || (*s == 'v' && isDIGIT(s[1]))) { s = force_version(s, TRUE); - if (*s == ';' || (s = SKIPSPACE1(s), *s == ';')) { + if (*s == ';' || *s == '}' + || (s = SKIPSPACE1(s), (*s == ';' || *s == '}'))) { start_force(PL_curforce); NEXTVAL_NEXTTOKE.opval = NULL; force_next(WORD); @@ -4324,10 +4633,13 @@ Perl_yylex(pTHX) fake_eof = LEX_FAKE_EOF; } PL_bufptr = PL_bufend; + CopLINE_inc(PL_curcop); if (!lex_next_chunk(fake_eof)) { + CopLINE_dec(PL_curcop); s = PL_bufptr; TOKEN(';'); /* not infinite loop because rsfp is NULL now */ } + CopLINE_dec(PL_curcop); #ifdef PERL_MAD if (!PL_rsfp) PL_realtokenstart = -1; @@ -4360,11 +4672,10 @@ Perl_yylex(pTHX) PL_doextract = FALSE; } } - incline(s); + if (PL_rsfp) + incline(s); } while (PL_doextract); PL_oldoldbufptr = PL_oldbufptr = PL_bufptr = PL_linestart = s; - if ((PERLDB_LINE || PERLDB_SAVESRC) && PL_curstash != PL_debstash) - update_debugger_info(PL_linestr, NULL, 0); PL_bufend = SvPVX(PL_linestr) + SvCUR(PL_linestr); PL_last_lop = PL_last_uni = NULL; if (CopLINE(PL_curcop) == 1) { @@ -5441,7 +5752,7 @@ Perl_yylex(pTHX) d = s; { const char tmp = *s; - if (PL_lex_state == LEX_NORMAL) + if (PL_lex_state == LEX_NORMAL || PL_lex_brackets) s = SKIPSPACE1(s); if ((PL_expect != XREF || PL_oldoldbufptr == PL_last_lop) @@ -5640,8 +5951,6 @@ Perl_yylex(pTHX) pl_yylval.ival = 0; OPERATOR(DOTDOT); } - if (PL_expect != XOPERATOR) - check_uni(); Aop(OP_CONCAT); } /* FALL THROUGH */ @@ -6587,8 +6896,14 @@ Perl_yylex(pTHX) case KEY_eval: s = SKIPSPACE1(s); - PL_expect = (*s == '{') ? XTERMBLOCK : XTERM; - UNIBRACK(OP_ENTEREVAL); + if (*s == '{') { /* block eval */ + PL_expect = XTERMBLOCK; + UNIBRACK(OP_ENTERTRY); + } + else { /* string eval */ + PL_expect = XTERM; + UNIBRACK(OP_ENTEREVAL); + } case KEY_eof: UNI(OP_EOF); @@ -6959,7 +7274,8 @@ Perl_yylex(pTHX) case KEY_package: s = force_word(s,WORD,FALSE,TRUE,FALSE); - s = force_version(s, FALSE); + s = SKIPSPACE1(s); + s = force_strict_version(s); OPERATOR(PACKAGE); case KEY_pipe: @@ -7346,7 +7662,7 @@ Perl_yylex(pTHX) bool must_be_last = FALSE; bool underscore = FALSE; bool seen_underscore = FALSE; - const bool warnsyntax = ckWARN(WARN_SYNTAX); + const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO); s = scan_str(s,!!PL_madskills,FALSE); if (!s) @@ -7358,7 +7674,7 @@ Perl_yylex(pTHX) if (!isSPACE(*p)) { d[tmp++] = *p; - if (warnsyntax) { + if (warnillegalproto) { if (must_be_last) proto_after_greedy_proto = TRUE; if (!strchr("$@%*;[]&\\_", *p)) { @@ -7391,11 +7707,11 @@ Perl_yylex(pTHX) } d[tmp] = '\0'; if (proto_after_greedy_proto) - Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), "Prototype after '%c' for %"SVf" : %s", greedy_proto, SVfARG(PL_subname), d); if (bad_proto) - Perl_warner(aTHX_ packWARN(WARN_SYNTAX), + Perl_warner(aTHX_ packWARN(WARN_ILLEGALPROTO), "Illegal character %sin prototype for %"SVf" : %s", seen_underscore ? "after '_' " : "", SVfARG(PL_subname), d); @@ -7426,7 +7742,7 @@ Perl_yylex(pTHX) else if (*s != '{' && key == KEY_sub) { if (!have_name) Perl_croak(aTHX_ "Illegal declaration of anonymous subroutine"); - else if (*s != ';') + else if (*s != ';' && *s != '}') Perl_croak(aTHX_ "Illegal declaration of subroutine %"SVf, SVfARG(PL_subname)); } @@ -11252,6 +11568,11 @@ S_new_constant(pTHX_ const char *s, STRLEN len, const char *key, STRLEN keylen, SvREFCNT_dec(msg); return sv; } + + /* charnames doesn't work well if there have been errors found */ + if (PL_error_count > 0 && strEQ(key,"charnames")) + return &PL_sv_undef; + cvp = hv_fetch(table, key, keylen, FALSE); if (!cvp || !SvOK(*cvp)) { why1 = "$^H{"; @@ -11365,7 +11686,7 @@ S_scan_ident(pTHX_ register char *s, register const char *send, char *dest, STRL char *bracket = NULL; char funny = *s++; register char *d = dest; - register char * const e = d + destlen + 3; /* two-character token, ending NUL */ + register char * const e = d + destlen - 3; /* two-character token, ending NUL */ PERL_ARGS_ASSERT_SCAN_IDENT; @@ -12018,10 +12339,12 @@ S_scan_heredoc(pTHX_ register char *s) } #endif PL_bufptr = s; + CopLINE_inc(PL_curcop); if (!outer || !lex_next_chunk(0)) { CopLINE_set(PL_curcop, (line_t)PL_multi_start); missingterm(PL_tokenbuf); } + CopLINE_dec(PL_curcop); s = PL_bufptr; #ifdef PERL_MAD stuffstart = s - SvPVX(PL_linestr); @@ -12044,8 +12367,6 @@ S_scan_heredoc(pTHX_ register char *s) else if (PL_bufend - PL_linestart == 1 && PL_bufend[-1] == '\r') PL_bufend[-1] = '\n'; #endif - if ((PERLDB_LINE || PERLDB_SAVESRC) && PL_curstash != PL_debstash) - update_debugger_info(PL_linestr, NULL, 0); if (*s == term && memEQ(s,PL_tokenbuf,len)) { STRLEN off = PL_bufend - 1 - SvPVX_const(PL_linestr); *(SvPVX(PL_linestr) + off ) = ' '; @@ -13286,17 +13607,17 @@ S_swallow_bom(pTHX_ U8 *s) switch (s[0]) { case 0xFF: if (s[1] == 0xFE) { - /* UTF-16 little-endian? (or UTF32-LE?) */ + /* UTF-16 little-endian? (or UTF-32LE?) */ if (s[2] == 0 && s[3] == 0) /* UTF-32 little-endian */ - Perl_croak(aTHX_ "Unsupported script encoding UTF32-LE"); + Perl_croak(aTHX_ "Unsupported script encoding UTF-32LE"); #ifndef PERL_NO_UTF16_FILTER - if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF16-LE script encoding (BOM)\n"); + if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (BOM)\n"); s += 2; if (PL_bufend > (char*)s) { s = add_utf16_textfilter(s, TRUE); } #else - Perl_croak(aTHX_ "Unsupported script encoding UTF16-LE"); + Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE"); #endif } break; @@ -13309,7 +13630,7 @@ S_swallow_bom(pTHX_ U8 *s) s = add_utf16_textfilter(s, FALSE); } #else - Perl_croak(aTHX_ "Unsupported script encoding UTF16-BE"); + Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE"); #endif } break; @@ -13324,15 +13645,19 @@ S_swallow_bom(pTHX_ U8 *s) if (s[1] == 0) { if (s[2] == 0xFE && s[3] == 0xFF) { /* UTF-32 big-endian */ - Perl_croak(aTHX_ "Unsupported script encoding UTF32-BE"); + Perl_croak(aTHX_ "Unsupported script encoding UTF-32BE"); } } else if (s[2] == 0 && s[3] != 0) { /* Leading bytes * 00 xx 00 xx * are a good indicator of UTF-16BE. */ +#ifndef PERL_NO_UTF16_FILTER if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16BE script encoding (no BOM)\n"); - s = add_utf16_textfilter(s, FALSE); + s = add_utf16_textfilter(s, FALSE); +#else + Perl_croak(aTHX_ "Unsupported script encoding UTF-16BE"); +#endif } } #ifdef EBCDIC @@ -13349,8 +13674,12 @@ S_swallow_bom(pTHX_ U8 *s) /* Leading bytes * xx 00 xx 00 * are a good indicator of UTF-16LE. */ +#ifndef PERL_NO_UTF16_FILTER if (DEBUG_p_TEST || DEBUG_T_TEST) PerlIO_printf(Perl_debug_log, "UTF-16LE script encoding (no BOM)\n"); s = add_utf16_textfilter(s, TRUE); +#else + Perl_croak(aTHX_ "Unsupported script encoding UTF-16LE"); +#endif } } return (char*)s;