* It prints "Missing operator before end of line" if there's nothing
* after the missing operator, or "... before <...>" if there is something
* after the missing operator.
+ *
+ * PL_bufptr is expected to point to the start of the thing that was found,
+ * and s after the next token or partial token.
*/
STATIC void
*/
#define LEX_FAKE_EOF 0x80000000
-#define LEX_NO_TERM 0x40000000
+#define LEX_NO_TERM 0x40000000 /* here-doc */
bool
Perl_lex_next_chunk(pTHX_ U32 flags)
bool got_some;
if (flags & ~(LEX_KEEP_PREVIOUS|LEX_FAKE_EOF|LEX_NO_TERM))
Perl_croak(aTHX_ "Lexing code internal error (%s)", "lex_next_chunk");
+ if (!(flags & LEX_NO_TERM) && PL_lex_inwhat)
+ return FALSE;
linestr = PL_parser->linestr;
buf = SvPVX(linestr);
if (!(flags & LEX_KEEP_PREVIOUS) &&
incline(s);
need_incline = 0;
}
+ } else if (!c) {
+ s++;
} else {
break;
}
const char *n;
const char *e;
line_t line_num;
+ UV uv;
PERL_ARGS_ASSERT_INCLINE;
if (*e != '\n' && *e != '\0')
return; /* false alarm */
- line_num = grok_atou(n, &e) - 1;
+ if (!grok_atoUV(n, &uv, &e))
+ return;
+ line_num = ((line_t)uv) - 1;
if (t - s > 0) {
const STRLEN len = t - s;
{
PERL_ARGS_ASSERT_SKIPSPACE_FLAGS;
if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
- while (s < PL_bufend && SPACE_OR_TAB(*s))
+ while (s < PL_bufend && (SPACE_OR_TAB(*s) || !*s))
s++;
} else {
STRLEN bufptr_pos = PL_bufptr - SvPVX(PL_linestr);
PL_bufptr = s;
lex_read_space(flags | LEX_KEEP_PREVIOUS |
- (PL_sublex_info.sub_inwhat || PL_lex_state == LEX_FORMLINE ?
+ (PL_lex_inwhat || PL_lex_state == LEX_FORMLINE ?
LEX_NO_NEXT_CHUNK : 0));
s = PL_bufptr;
PL_bufptr = SvPVX(PL_linestr) + bufptr_pos;
PL_last_uni++;
s = PL_last_uni;
while (isWORDCHAR_lazy_if(s,UTF) || *s == '-')
- s++;
+ s += UTF ? UTF8SKIP(s) : 1;
if ((t = strchr(s, '(')) && t < PL_bufptr)
return;
Perl_ck_warner_d(aTHX_ packWARN(WARN_AMBIGUOUS),
- "Warning: Use of \"%.*s\" without parentheses is ambiguous",
- (int)(s - PL_last_uni), PL_last_uni);
+ "Warning: Use of \"%"UTF8f"\" without parentheses is ambiguous",
+ UTF8fARG(UTF, (int)(s - PL_last_uni), PL_last_uni));
}
/*
tokereport(type, &NEXTVAL_NEXTTOKE);
}
#endif
+ assert(PL_nexttoke < C_ARRAY_LENGTH(PL_nexttype));
PL_nexttype[PL_nexttoke] = type;
PL_nexttoke++;
if (PL_lex_state != LEX_KNOWNEXT) {
return THING;
}
if (op_type == OP_CONST) {
- SV *sv = tokeq(PL_lex_stuff);
+ SV *sv = PL_lex_stuff;
+ PL_lex_stuff = NULL;
+ sv = tokeq(sv);
if (SvTYPE(sv) == SVt_PVIV) {
/* Overloaded constants, nothing fancy: Convert to SVt_PV: */
sv = nsv;
}
pl_yylval.opval = (OP*)newSVOP(op_type, 0, sv);
- PL_lex_stuff = NULL;
return THING;
}
SAVEI32(PL_lex_casemods);
SAVEI32(PL_lex_starts);
SAVEI8(PL_lex_state);
+ SAVEI8(PL_lex_defer);
SAVESPTR(PL_lex_repl);
SAVEVPTR(PL_lex_inpat);
SAVEI16(PL_lex_inwhat);
PL_lex_stuff = NULL;
PL_sublex_info.repl = NULL;
+ /* Arrange for PL_lex_stuff to be freed on scope exit, in case it gets
+ set for an inner quote-like operator and then an error causes scope-
+ popping. We must not have a PL_lex_stuff value left dangling, as
+ that breaks assumptions elsewhere. See bug #123617. */
+ SAVEGENERICSV(PL_lex_stuff);
+ SAVEGENERICSV(PL_sublex_info.repl);
+
PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
= SvPVX(PL_linestr);
PL_bufend += SvCUR(PL_linestr);
PL_bufend = SvPVX(PL_linestr);
PL_bufend += SvCUR(PL_linestr);
PL_expect = XOPERATOR;
- PL_sublex_info.sub_inwhat = 0;
return ')';
}
}
/* We deliberately don't try to print the malformed character, which
* might not print very well; it also may be just the first of many
* malformations, so don't print what comes after it */
- yyerror(Perl_form(aTHX_
+ yyerror_pv(Perl_form(aTHX_
"Malformed UTF-8 character immediately after '%.*s'",
- (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr));
+ (int) (first_bad_char_loc - (U8 *) backslash_ptr), backslash_ptr),
+ SVf_UTF8);
return NULL;
}
(@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
*/
else if (*s == '@' && s[1]) {
- if (isWORDCHAR_lazy_if(s+1,UTF))
+ if (UTF ? isIDFIRST_utf8((U8*)s+1) : isWORDCHAR_A(s[1]))
break;
if (strchr(":'{$", s[1]))
break;
* Otherwise must be some \N{NAME}: convert to \N{U+c1.c2...}
* if a pattern; otherwise convert to utf8
*
- * If the regex compiler should ever need to differentiate
- * between the \N{U+...} and \N{name} forms, that could easily
- * be done here by stripping any leading zeros from the
- * \N{U+...} case, and adding them to the other one. */
-
- /* Here, 's' points to the 'N'; the test below is guaranteed to
+ * Here, 's' points to the 'N'; the test below is guaranteed to
* succeed if we are being called on a pattern, as we already
* know from a test above that the next character is a '{'. A
* non-pattern \N must mean 'named character', which requires
/* Here it looks like a named character */
if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX;
- STRLEN len;
-
s += 2; /* Skip to next char after the 'U+' */
- len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
- if (len == 0
- || ( len != (STRLEN)(e - s) && s[len] != '.'
- && PL_lex_inpat))
- {
- bad_NU:
- yyerror("Invalid hexadecimal number in \\N{U+...}");
- s = e + 1;
- continue;
- }
-
if (PL_lex_inpat) {
-#ifdef EBCDIC
- s -= 5; /* Include the '\N{U+' */
- /* On EBCDIC platforms, in \N{U+...}, the '...' is a
- * Unicode value, so convert to native so downstream
- * code can continue to assume it's native */
- /* XXX This should be in the regexp parser,
- because doing it here makes /\N{U+41}/ and
- =~ '\N{U+41}' do different things. */
- d += my_snprintf(d, e - s + 1 + 1, /* includes the '}'
- and the \0 */
- "\\N{U+%X",
- (unsigned int) UNI_TO_NATIVE(uv));
- s += 5 + len;
- while (*s == '.') {
- s++;
- len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
- if (!len
- || (len != (STRLEN)(e - s) && s[len] != '.'))
- goto bad_NU;
- s--;
- d += my_snprintf(
- d, e - s + 1 + 1, ".%X",
- (unsigned int)UNI_TO_NATIVE(uv)
- );
- s += len + 1;
+
+ /* In patterns, we can have \N{U+xxxx.yyyy.zzzz...} */
+ /* Check the syntax. */
+ const char *orig_s;
+ orig_s = s - 5;
+ if (!isXDIGIT(*s)) {
+ bad_NU:
+ yyerror(
+ "Invalid hexadecimal number in \\N{U+...}"
+ );
+ s = e + 1;
+ continue;
}
- *(d++) = '}';
-#else
- /* On non-EBCDIC platforms, pass it through unchanged.
- * The reason we evaluate the numbers is to make
- * sure there wasn't a syntax error. */
- const char * const orig_s = s - 5;
- while (*s == '.') {
- s++;
- len = e - s;
- uv = grok_hex(s, &len, &flags, NULL);
- if (!len
- || (len != (STRLEN)(e - s) && s[len] != '.'))
- goto bad_NU;
+ while (++s < e) {
+ if (isXDIGIT(*s))
+ continue;
+ else if ((*s == '.' || *s == '_')
+ && isXDIGIT(s[1]))
+ continue;
+ goto bad_NU;
}
- /* +1 is for the '}' */
+
+ /* Pass everything through unchanged.
+ * +1 is for the '}' */
Copy(orig_s, d, e - orig_s + 1, char);
d += e - orig_s + 1;
-#endif
}
else { /* Not a pattern: convert the hex to string */
+ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ | PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_DISALLOW_PREFIX;
+ STRLEN len = e - s;
+ uv = grok_hex(s, &len, &flags, NULL);
+ if (len == 0 || (len != (STRLEN)(e - s)))
+ goto bad_NU;
/* If the destination is not in utf8, unconditionally
* recode it to be so. This is because \N{} implies
char hex_string[4];
int len =
my_snprintf(hex_string,
- sizeof(hex_string),
- "%02X.", (U8) *str);
- PERL_MY_SNPRINTF_POST_GUARD(len, sizeof(hex_string));
+ sizeof(hex_string),
+ "%02X.",
+
+ /* The regex compiler is
+ * expecting Unicode, not
+ * native */
+ (U8) NATIVE_TO_LATIN1(*str));
+ PERL_MY_SNPRINTF_POST_GUARD(len,
+ sizeof(hex_string));
Copy(hex_string, d, 3, char);
d += 3;
str++;
len,
&char_length,
UTF8_ALLOW_ANYUV);
- /* Convert first code point to hex, including
- * the boiler plate before it. */
+ /* Convert first code point to Unicode hex,
+ * including the boiler plate before it. */
output_length =
my_snprintf(hex_string, sizeof(hex_string),
- "\\N{U+%X",
- (unsigned int) uv);
+ "\\N{U+%X",
+ (unsigned int) NATIVE_TO_UNI(uv));
/* Make sure there is enough space to hold it */
d = off + SvGROW(sv, off
d += output_length;
/* For each subsequent character, append dot and
- * its ordinal in hex */
+ * its Unicode code point in hex */
while ((str += char_length) < str_end) {
const STRLEN off = d - SvPVX_const(sv);
U32 uv = utf8n_to_uvchr((U8 *) str,
UTF8_ALLOW_ANYUV);
output_length =
my_snprintf(hex_string,
- sizeof(hex_string),
- ".%X",
- (unsigned int) uv);
+ sizeof(hex_string),
+ ".%X",
+ (unsigned int) NATIVE_TO_UNI(uv));
d = off + SvGROW(sv, off
+ output_length
SvREFCNT_dec(tmp);
} );
- switch (PL_lex_state) {
- case LEX_NORMAL:
- case LEX_INTERPNORMAL:
- break;
-
/* when we've already built the next token, just pull it out of the queue */
- case LEX_KNOWNEXT:
+ if (PL_nexttoke) {
PL_nexttoke--;
pl_yylval = PL_nextval[PL_nexttoke];
if (!PL_nexttoke) {
}
return REPORT(next_type == 'p' ? pending_ident() : next_type);
}
+ }
+
+ switch (PL_lex_state) {
+ case LEX_NORMAL:
+ case LEX_INTERPNORMAL:
+ break;
/* interpolated case modifiers like \L \U, including \Q and \E.
when we get here, PL_bufptr is at the \
/* FALLTHROUGH */
case LEX_INTERPEND:
+ /* Treat state as LEX_NORMAL if we have no inner lexing scope.
+ XXX This hack can be removed if we stop setting PL_lex_state to
+ LEX_KNOWNEXT, as can the hack under LEX_INTREPCONCAT below. */
+ if (UNLIKELY(!PL_lex_inwhat)) {
+ PL_lex_state = LEX_NORMAL;
+ break;
+ }
+
if (PL_lex_dojoin) {
const U8 dojoin_was = PL_lex_dojoin;
PL_lex_dojoin = FALSE;
Perl_croak(aTHX_ "panic: INTERPCONCAT, lex_brackets=%ld",
(long) PL_lex_brackets);
#endif
+ /* Treat state as LEX_NORMAL when not in an inner lexing scope.
+ XXX This hack can be removed if we stop setting PL_lex_state to
+ LEX_KNOWNEXT. */
+ if (UNLIKELY(!PL_lex_inwhat)) {
+ PL_lex_state = LEX_NORMAL;
+ break;
+ }
+
if (PL_bufptr == PL_bufend)
return REPORT(sublex_done());
case 26:
goto fake_eof; /* emulate EOF on ^D or ^Z */
case 0:
- if (!PL_rsfp && (!PL_parser->filtered || s+1 < PL_bufend)) {
+ if ((!PL_rsfp || PL_lex_inwhat)
+ && (!PL_parser->filtered || s+1 < PL_bufend)) {
PL_last_uni = 0;
PL_last_lop = 0;
if (PL_lex_brackets &&
d = instr(s,"perl -");
if (!d) {
d = instr(s,"perl");
+ if (d && d[4] == '6')
+ d = NULL;
#if defined(DOSISH)
/* avoid getting into infinite loops when shebang
* line contains "Perl" rather than "perl" */
}
if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
PL_lex_state = LEX_FORMLINE;
- NEXTVAL_NEXTTOKE.ival = 0;
force_next(FORMRBRACK);
TOKEN(';');
}
incline(s);
if (PL_lex_formbrack && PL_lex_brackets <= PL_lex_formbrack) {
PL_lex_state = LEX_FORMLINE;
- NEXTVAL_NEXTTOKE.ival = 0;
force_next(FORMRBRACK);
TOKEN(';');
}
||(*s == '*' && (s[1] == '*' || s[1] == '{'))
))
{
- Perl_ck_warner_d(aTHX_
- packWARN(WARN_EXPERIMENTAL__POSTDEREF),
- "Postfix dereference is experimental"
- );
PL_expect = XPOSTDEREF;
TOKEN(ARROW);
}
sv_catsv(sv, PL_lex_stuff);
attrs = op_append_elem(OP_LIST, attrs,
newSVOP(OP_CONST, 0, sv));
- SvREFCNT_dec(PL_lex_stuff);
+ SvREFCNT_dec_NN(PL_lex_stuff);
PL_lex_stuff = NULL;
}
else {
}
switch (PL_expect) {
case XTERM:
+ case XTERMORDORDOR:
PL_lex_brackstack[PL_lex_brackets++] = XOPERATOR;
PL_lex_allbrackets++;
OPERATOR(HASHBRACK);
PL_tokenbuf[0] = '&';
s = scan_ident(s - 1, PL_tokenbuf + 1,
sizeof PL_tokenbuf - 1, TRUE);
+ pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
if (PL_tokenbuf[1]) {
- PL_expect = XOPERATOR;
force_ident_maybe_lex('&');
}
else
PREREF('&');
- pl_yylval.ival = (OPpENTERSUB_AMPER<<8);
TERM('&');
case '|':
PL_tokenbuf[0] = '@';
s = scan_ident(s + 1, PL_tokenbuf + 1,
sizeof PL_tokenbuf - 1, FALSE);
- if (PL_expect == XOPERATOR)
- no_op("Array length", s);
+ if (PL_expect == XOPERATOR) {
+ d = s;
+ if (PL_bufptr > s) {
+ d = PL_bufptr-1;
+ PL_bufptr = PL_oldbufptr;
+ }
+ no_op("Array length", d);
+ }
if (!PL_tokenbuf[1])
PREREF(DOLSHARP);
PL_expect = XOPERATOR;
PL_tokenbuf[0] = '$';
s = scan_ident(s, PL_tokenbuf + 1,
sizeof PL_tokenbuf - 1, FALSE);
- if (PL_expect == XOPERATOR)
- no_op("Scalar", s);
+ if (PL_expect == XOPERATOR) {
+ d = s;
+ if (PL_bufptr > s) {
+ d = PL_bufptr-1;
+ PL_bufptr = PL_oldbufptr;
+ }
+ no_op("Scalar", d);
+ }
if (!PL_tokenbuf[1]) {
if (s == PL_bufend)
yyerror("Final $ should be \\$ or $name");
char *t = s+1;
while (isSPACE(*t) || isWORDCHAR_lazy_if(t,UTF) || *t == '$')
- t++;
+ t += UTF ? UTF8SKIP(t) : 1;
if (*t++ == ',') {
PL_bufptr = skipspace(PL_bufptr); /* XXX can realloc */
while (t < PL_bufend && *t != ']')
t++;
Perl_warner(aTHX_ packWARN(WARN_SYNTAX),
- "Multidimensional syntax %.*s not supported",
- (int)((t - PL_bufptr) + 1), PL_bufptr);
+ "Multidimensional syntax %"UTF8f" not supported",
+ UTF8fARG(UTF,(int)((t - PL_bufptr) + 1), PL_bufptr));
}
}
}
}
if (!words)
words = newNULLLIST();
- if (PL_lex_stuff) {
- SvREFCNT_dec(PL_lex_stuff);
- PL_lex_stuff = NULL;
- }
+ SvREFCNT_dec_NN(PL_lex_stuff);
+ PL_lex_stuff = NULL;
PL_expect = XOPERATOR;
pl_yylval.opval = sawparens(words);
TOKEN(QWLIST);
PERL_ARGS_ASSERT_SCAN_IDENT;
- if (isSPACE(*s))
+ if (isSPACE(*s) || !*s)
s = skipspace(s);
if (isDIGIT(*s)) {
while (isDIGIT(*s)) {
first_line = CopLINE(PL_curcop);
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s) {
- if (PL_lex_stuff) {
- SvREFCNT_dec(PL_lex_stuff);
- PL_lex_stuff = NULL;
- }
+ SvREFCNT_dec_NN(PL_lex_stuff);
+ PL_lex_stuff = NULL;
Perl_croak(aTHX_ "Substitution replacement not terminated");
}
PL_multi_start = first_start; /* so whole substitution is taken together */
s = scan_str(s,FALSE,FALSE,FALSE,NULL);
if (!s) {
- if (PL_lex_stuff) {
- SvREFCNT_dec(PL_lex_stuff);
- PL_lex_stuff = NULL;
- }
+ SvREFCNT_dec_NN(PL_lex_stuff);
+ PL_lex_stuff = NULL;
Perl_croak(aTHX_ "Transliteration replacement not terminated");
}
term = '"';
if (!isWORDCHAR_lazy_if(s,UTF))
deprecate("bare << to mean <<\"\"");
- for (; isWORDCHAR_lazy_if(s,UTF); s++) {
- if (d < e)
- *d++ = *s;
+ peek = s;
+ while (isWORDCHAR_lazy_if(peek,UTF)) {
+ peek += UTF ? UTF8SKIP(peek) : 1;
}
+ len = (peek - s >= e - d) ? (e - d) : (peek - s);
+ Copy(s, d, len, char);
+ s += len;
+ d += len;
}
if (d >= PL_tokenbuf + sizeof PL_tokenbuf - 1)
Perl_croak(aTHX_ "Delimiter for here document is too long");
lexing scope. In a file, we will have broken out of the
loop in the previous iteration. In an eval, the string buf-
fer ends with "\n;", so the while condition above will have
- evaluated to false. So shared can never be null. */
- assert(shared);
+ evaluated to false. So shared can never be null. Or so you
+ might think. Odd syntax errors like s;@{<<; can gobble up
+ the implicit semicolon at the end of a flie, causing the
+ file handle to be closed even when we are not in a string
+ eval. So shared may be null in that case. */
+ if (UNLIKELY(!shared))
+ goto interminable;
/* A LEXSHARED struct with a null ls_prev pointer is the outer-
most lexing scope. In a file, shared->ls_linestr at that
level is just one line, so there is no body to steal. */
floatit = TRUE;
}
if (floatit) {
- STORE_NUMERIC_LOCAL_SET_STANDARD();
+ STORE_LC_NUMERIC_UNDERLYING_SET_STANDARD();
/* terminate the string */
*d = '\0';
if (UNLIKELY(hexfp)) {
} else {
nv = Atof(PL_tokenbuf);
}
- RESTORE_NUMERIC_LOCAL();
+ RESTORE_LC_NUMERIC_UNDERLYING();
sv = newSVnv(nv);
}
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/