/* LEX_* are values for PL_lex_state, the state of the lexer.
* They are arranged oddly so that the guard on the switch statement
* can get by with a single comparison (if the compiler is smart enough).
+ *
+ * These values refer to the various states within a sublex parse,
+ * i.e. within a double quotish string
*/
/* #define LEX_NOTPARSING 11 is done in perl.h. */
SAVEI32(PL_lex_casemods);
SAVEI32(PL_lex_starts);
SAVEI8(PL_lex_state);
+ SAVEPPTR(PL_sublex_info.re_eval_start);
SAVEVPTR(PL_lex_inpat);
SAVEI16(PL_lex_inwhat);
SAVECOPLINE(PL_curcop);
PL_linestr = PL_lex_stuff;
PL_lex_stuff = NULL;
+ PL_sublex_info.re_eval_start = NULL;
PL_bufend = PL_bufptr = PL_oldbufptr = PL_oldoldbufptr = PL_linestart
= SvPVX(PL_linestr);
/*
scan_const
- Extracts a pattern, double-quoted string, or transliteration. This
- is terrifying code.
+ Extracts the next constant part of a pattern, double-quoted string,
+ or transliteration. This is terrifying code.
+
+ For example, in parsing the double-quoted string "ab\x63$d", it would
+ stop at the '$' and return an OP_CONST containing 'abc'.
It looks at PL_lex_inwhat and PL_lex_inpat to find out whether it's
processing a pattern (PL_lex_inpat is true), a transliteration
Returns a pointer to the character scanned up to. If this is
advanced from the start pointer supplied (i.e. if anything was
- successfully parsed), will leave an OP for the substring scanned
+ successfully parsed), will leave an OP_CONST for the substring scanned
in pl_yylval. Caller must intuit reason for not parsing further
by looking at the next characters herself.
In patterns:
- backslashes:
- constants: \N{NAME} only
- case and quoting: \U \Q \E
- stops on @ and $, but not for $ as tail anchor
+ expand:
+ \N{ABC} => \N{U+41.42.43}
+
+ pass through:
+ all other \-char, including \N and \N{ apart from \N{ABC}
+
+ stops on:
+ @ and $ where it appears to be a var, but not for $ as tail anchor
+ \l \L \u \U \Q \E
+ (?{ or (??{
+
In transliterations:
characters are VERY literal, except for - not at the start or end
it's a tail anchor if $ is the last thing in the string, or if it's
followed by one of "()| \r\n\t"
- \1 (backreferences) are turned into $1
+ \1 (backreferences) are turned into $1 in substitutions
The structure of the code is
while (there's a character to process) {
register char *d = SvPVX(sv); /* destination for copies */
bool dorange = FALSE; /* are we in a translit range? */
bool didrange = FALSE; /* did we just finish a range? */
+ bool in_charclass = FALSE; /* within /[...]/ */
bool has_utf8 = FALSE; /* Output constant is UTF8 */
bool this_utf8 = cBOOL(UTF); /* Is the source string assumed
to be UTF8? But, this can
/* if we get here, we're not doing a transliteration */
- /* skip for regexp comments /(?#comment)/ and code /(?{code})/,
- except for the last char, which will be done separately. */
+ else if (*s == '[' && PL_lex_inpat && !in_charclass) {
+ char *s1 = s-1;
+ int esc = 0;
+ while (s1 >= start && *s1-- == '\\')
+ esc = !esc;
+ if (!esc)
+ in_charclass = TRUE;
+ }
+
+ else if (*s == ']' && PL_lex_inpat && in_charclass) {
+ char *s1 = s-1;
+ int esc = 0;
+ while (s1 >= start && *s1-- == '\\')
+ esc = !esc;
+ if (!esc)
+ in_charclass = FALSE;
+ }
+
+ /* skip for regexp comments /(?#comment)/, except for the last
+ * char, which will be done separately.
+ * Stop on (?{..}) and friends */
+
else if (*s == '(' && PL_lex_inpat && s[1] == '?') {
if (s[2] == '#') {
while (s+1 < send && *s != ')')
*d++ = NATIVE_TO_NEED(has_utf8,*s++);
}
- else if (s[2] == '{' /* This should match regcomp.c */
- || (s[2] == '?' && s[3] == '{'))
+ else if (!PL_lex_casemods && !in_charclass &&
+ ( s[2] == '{' /* This should match regcomp.c */
+ || (s[2] == '?' && s[3] == '{')))
{
- I32 count = 1;
- char *regparse = s + (s[2] == '{' ? 3 : 4);
- char c;
-
- while (count && (c = *regparse)) {
- if (c == '\\' && regparse[1])
- regparse++;
- else if (c == '{')
- count++;
- else if (c == '}')
- count--;
- regparse++;
- }
- if (*regparse != ')')
- regparse--; /* Leave one char for continuation. */
- while (s < regparse)
- *d++ = NATIVE_TO_NEED(has_utf8,*s++);
+ break;
}
}
*d++ = NATIVE_TO_NEED(has_utf8,*s++);
}
+ /* no further processing of single-quoted regex */
+ else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'')
+ goto default_action;
+
/* check for embedded arrays
(@foo, @::foo, @'foo, @{foo}, @$foo, @+, @-)
*/
/* FALL THROUGH */
default:
{
- if ((isALPHA(*s) || isDIGIT(*s)))
+ if ((isALNUMC(*s)))
Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
"Unrecognized escape \\%c passed through",
*s);
/* eg. \x24 indicates the hex constant 0x24 */
case 'x':
- ++s;
- if (*s == '{') {
- char* const e = strchr(s, '}');
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES |
- PERL_SCAN_DISALLOW_PREFIX;
+ {
STRLEN len;
+ const char* error;
- ++s;
- if (!e) {
- yyerror("Missing right brace on \\x{}");
+ bool valid = grok_bslash_x(s, &uv, &len, &error, 1);
+ s += len;
+ if (! valid) {
+ yyerror(error);
continue;
}
- len = e - s;
- uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
- s = e + 1;
- }
- else {
- {
- STRLEN len = 2;
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- uv = NATIVE_TO_UNI(grok_hex(s, &len, &flags, NULL));
- s += len;
- }
}
NUM_ESCAPE_INSERT:
} else if (PL_lex_inwhat == OP_SUBST && !PL_lex_inpat) {
type = "s";
typelen = 1;
+ } else if (PL_lex_inpat && SvIVX(PL_linestr) == '\'') {
+ type = "q";
+ typelen = 1;
} else {
type = "qq";
typelen = 2;
case LEX_INTERPSTART:
if (PL_bufptr == PL_bufend)
return REPORT(sublex_done());
- DEBUG_T({ PerlIO_printf(Perl_debug_log,
+ DEBUG_T({ if(*PL_bufptr != '(') PerlIO_printf(Perl_debug_log,
"### Interpolated variable\n"); });
PL_expect = XTERM;
PL_lex_dojoin = (*PL_bufptr == '@');
NEXTVAL_NEXTTOKE.ival = OP_JOIN; /* emulate join($", ...) */
force_next(FUNC);
}
+ /* Convert (?{...}) and friends to 'do {...}' */
+ if (PL_lex_inpat && *PL_bufptr == '(') {
+ PL_sublex_info.re_eval_start = PL_bufptr;
+ PL_bufptr += 2;
+ if (*PL_bufptr != '{')
+ PL_bufptr++;
+ start_force(PL_curforce);
+ /* XXX probably need a CURMAD(something) here */
+ PL_expect = XTERMBLOCK;
+ force_next(DO);
+ }
+
if (PL_lex_starts++) {
s = PL_bufptr;
#ifdef PERL_MAD
Perl_croak(aTHX_ "Bad evalled substitution pattern");
PL_lex_repl = NULL;
}
+ if (PL_sublex_info.re_eval_start) {
+ if (*PL_bufptr != ')')
+ Perl_croak(aTHX_ "Sequence (?{...}) not terminated with ')'");
+ PL_bufptr++;
+ /* having compiled a (?{..}) expression, return the original
+ * text too, as a const */
+ start_force(PL_curforce);
+ /* XXX probably need a CURMAD(something) here */
+ NEXTVAL_NEXTTOKE.opval =
+ (OP*)newSVOP(OP_CONST, 0,
+ newSVpvn(PL_sublex_info.re_eval_start,
+ PL_bufptr - PL_sublex_info.re_eval_start));
+ force_next(THING);
+ PL_sublex_info.re_eval_start = NULL;
+ PL_expect = XTERM;
+ return REPORT(',');
+ }
+
/* FALLTHROUGH */
case LEX_INTERPCONCAT:
#ifdef DEBUGGING
if (PL_bufptr == PL_bufend)
return REPORT(sublex_done());
- if (SvIVX(PL_linestr) == '\'') {
+ /* m'foo' still needs to be parsed for possible (?{...}) */
+ if (SvIVX(PL_linestr) == '\'' && !PL_lex_inpat) {
SV *sv = newSVsv(PL_linestr);
- if (!PL_lex_inpat)
- sv = tokeq(sv);
- else if ( PL_hints & HINT_NEW_RE )
- sv = new_constant(NULL, 0, "qr", sv, sv, "q", 1);
+ sv = tokeq(sv);
pl_yylval.opval = (OP*)newSVOP(OP_CONST, 0, sv);
s = PL_bufend;
}
}
sv = newSVpvn_flags(s, len, UTF ? SVf_UTF8 : 0);
if (*d == '(') {
- d = scan_str(d,TRUE,TRUE);
+ d = scan_str(d,TRUE,TRUE,FALSE);
if (!d) {
/* MUST advance bufptr here to avoid bogus
"at end of line" context messages from yyerror().
TERM(THING);
case '\'':
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
if (PL_expect == XOPERATOR) {
if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
TERM(sublex_start());
case '"':
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
DEBUG_T( { printbuf("### Saw string before %s\n", s); } );
if (PL_expect == XOPERATOR) {
if (PL_lex_formbrack && PL_lex_brackets == PL_lex_formbrack) {
TERM(sublex_start());
case '`':
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
DEBUG_T( { printbuf("### Saw backtick string before %s\n", s); } );
if (PL_expect == XOPERATOR)
no_op("Backticks",s);
LOP(OP_PIPE_OP,XTERM);
case KEY_q:
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s)
missingterm(NULL);
pl_yylval.ival = OP_CONST;
case KEY_qw: {
OP *words = NULL;
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s)
missingterm(NULL);
PL_expect = XOPERATOR;
}
case KEY_qq:
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s)
missingterm(NULL);
pl_yylval.ival = OP_STRINGIFY;
TERM(sublex_start());
case KEY_qx:
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s)
missingterm(NULL);
readpipe_override();
const bool warnillegalproto = ckWARN(WARN_ILLEGALPROTO);
STRLEN tmplen;
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s)
Perl_croak(aTHX_ "Prototype not terminated");
/* strip spaces and check for bad characters */
/* Adds, subtracts to/from 'pmfl' based on regex modifier flags found in
* the parse starting at 's', based on the subset that are valid in this
* context input to this routine in 'valid_flags'. Advances s. Returns
- * TRUE if the input was a valid flag, so the next char may be as well;
- * otherwise FALSE. 'charset' should point to a NUL upon first call on the
- * current regex. This routine will set it to any charset modifier found.
- * The caller shouldn't change it. This way, another charset modifier
- * encountered in the parse can be detected as an error, as we have decided
- * to allow only one */
+ * TRUE if the input should be treated as a valid flag, so the next char
+ * may be as well; otherwise FALSE. 'charset' should point to a NUL upon
+ * first call on the current regex. This routine will set it to any
+ * charset modifier found. The caller shouldn't change it. This way,
+ * another charset modifier encountered in the parse can be detected as an
+ * error, as we have decided to allow only one */
const char c = **s;
-
- if (! strchr(valid_flags, c)) {
- if (isALNUM(c)) {
- yyerror(Perl_form(aTHX_ "Unknown regexp modifier \"/%c\"", c));
- (*s)++;
+ STRLEN charlen = UTF ? UTF8SKIP(*s) : 1;
+
+ if ( charlen != 1 || ! strchr(valid_flags, c) ) {
+ if (isALNUM_lazy_if(*s, UTF)) {
+ yyerror_pv(Perl_form(aTHX_ "Unknown regexp modifier \"/%.*s\"", charlen, *s),
+ UTF ? SVf_UTF8 : 0);
+ (*s) += charlen;
+ /* Pretend that it worked, so will continue processing before
+ * dieing */
return TRUE;
}
return FALSE;
{
dVAR;
PMOP *pm;
- char *s = scan_str(start,!!PL_madskills,FALSE);
+ char *s = scan_str(start,!!PL_madskills,FALSE, PL_reg_state.re_reparsing);
const char * const valid_flags =
(const char *)((type == OP_QR) ? QR_PAT_MODS : M_PAT_MODS);
char charset = '\0'; /* character set modifier */
PERL_ARGS_ASSERT_SCAN_PAT;
+ /* this was only needed for the initial scan_str; set it to false
+ * so that any (?{}) code blocks etc are parsed normally */
+ PL_reg_state.re_reparsing = FALSE;
if (!s) {
const char * const delimiter = skipspace(start);
Perl_croak(aTHX_
#ifdef PERL_MAD
modstart = s;
#endif
+
+ /* if qr/...(?{..}).../, then need to parse the pattern within a new
+ * anon CV. False positives like qr/[(?{]/ are harmless */
+
+ if (type == OP_QR) {
+ STRLEN len;
+ char *e, *p = SvPV(PL_lex_stuff, len);
+ e = p + len;
+ for (; p < e; p++) {
+ if (p[0] == '(' && p[1] == '?'
+ && (p[2] == '{' || (p[2] == '?' && p[3] == '{')))
+ {
+ pm->op_pmflags |= PMf_HAS_CV;
+ break;
+ }
+ }
+ pm->op_pmflags |= PMf_IS_QR;
+ }
+
while (*s && S_pmflag(aTHX_ valid_flags, &(pm->op_pmflags), &s, &charset)) {};
#ifdef PERL_MAD
if (PL_madskills && modstart != s) {
pl_yylval.ival = OP_NULL;
- s = scan_str(start,!!PL_madskills,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE);
if (!s)
Perl_croak(aTHX_ "Substitution pattern not terminated");
#endif
first_start = PL_multi_start;
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s) {
if (PL_lex_stuff) {
SvREFCNT_dec(PL_lex_stuff);
pl_yylval.ival = OP_NULL;
- s = scan_str(start,!!PL_madskills,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE);
if (!s)
Perl_croak(aTHX_ "Transliteration pattern not terminated");
}
#endif
- s = scan_str(s,!!PL_madskills,FALSE);
+ s = scan_str(s,!!PL_madskills,FALSE,FALSE);
if (!s) {
if (PL_lex_stuff) {
SvREFCNT_dec(PL_lex_stuff);
if (d - PL_tokenbuf != len) {
pl_yylval.ival = OP_GLOB;
- s = scan_str(start,!!PL_madskills,FALSE);
+ s = scan_str(start,!!PL_madskills,FALSE,FALSE);
if (!s)
Perl_croak(aTHX_ "Glob not terminated");
return s;
takes: start position in buffer
keep_quoted preserve \ on the embedded delimiter(s)
keep_delims preserve the delimiters around the string
+ re_reparse compiling a run-time /(?{})/:
+ collapse // to /, and skip encoding src
returns: position to continue reading from buffer
side-effects: multi_start, multi_close, lex_repl or lex_stuff, and
updates the read buffer.
*/
STATIC char *
-S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims)
+S_scan_str(pTHX_ char *start, int keep_quoted, int keep_delims, int re_reparse)
{
dVAR;
SV *sv; /* scalar value: string */
}
#endif
for (;;) {
- if (PL_encoding && !UTF) {
+ if (PL_encoding && !UTF && !re_reparse) {
bool cont = TRUE;
while (cont) {
CopLINE_inc(PL_curcop);
/* handle quoted delimiters */
if (*s == '\\' && s+1 < PL_bufend && term != '\\') {
- if (!keep_quoted && s[1] == term)
+ if (!keep_quoted
+ && (s[1] == term
+ || (re_reparse && s[1] == '\\'))
+ )
s++;
- /* any other quotes are simply copied straight through */
+ /* any other quotes are simply copied straight through */
else
*to++ = *s++;
}
/* at this point, we have successfully read the delimited string */
- if (!PL_encoding || UTF) {
+ if (!PL_encoding || UTF || re_reparse) {
#ifdef PERL_MAD
if (PL_madskills) {
char * const tstart = SvPVX(PL_linestr) + stuffstart;
}
}
#endif
- if (has_utf8 || PL_encoding)
+ if (has_utf8 || (PL_encoding && !re_reparse))
SvUTF8_on(sv);
PL_multi_end = CopLINE(PL_curcop);