ENTER;
SAVETMPS;
- save_re_context();
PUSHSTACKi(PERLSI_REQUIRE);
/* G_RE_REPARSING causes the toker to collapse \\ into \ when
* parsing qr''; normally only q'' does this. It also alters
else if (PL_regkind[OP(first)] == BOL) {
r->intflags |= (OP(first) == MBOL
? PREGf_ANCH_MBOL
- : (OP(first) == SBOL
- ? PREGf_ANCH_SBOL
- : PREGf_ANCH_BOL));
+ : PREGf_ANCH_SBOL);
first = NEXTOPER(first);
goto again;
}
if (PL_regkind[fop] == NOTHING && nop == END)
r->extflags |= RXf_NULL;
- else if (PL_regkind[fop] == BOL && nop == END)
+ else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
+ /* when fop is SBOL first->flags will be true only when it was
+ * produced by parsing /\A/, and not when parsing /^/. This is
+ * very important for the split code as there we want to
+ * treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
+ * See rt #122761 for more details. -- Yves */
r->extflags |= RXf_START_ONLY;
else if (fop == PLUS
&& PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
if (valuep && (substitute_parse || ! has_multiple_chars)) {
STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
+ | PERL_SCAN_DISALLOW_PREFIX
/* No errors in the first pass (See [perl
* #122671].) We let the code below find the
* bypass it by using single quoting, so check. Don't do the check
* here when there are multiple chars; we do it below anyway. */
if (! has_multiple_chars) {
- if (length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) {
- RExC_parse = endchar;
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
+ }
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
- RExC_parse = endbrace + 1;
- return 1;
+ RExC_parse = endbrace + 1;
+ return 1;
}
}
RExC_override_recoding = 1;
if (node_p) {
- if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
- return (STRLEN) -1;
+ if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
+ if (flags & RESTART_UTF8) {
+ *flagp = RESTART_UTF8;
+ return (STRLEN) -1;
+ }
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
+ (UV) flags);
}
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
- (UV) flags);
- }
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
}
RExC_parse = endbrace;
nextchar(pRExC_state);
if (RExC_flags & RXf_PMf_MULTILINE)
ret = reg_node(pRExC_state, MBOL);
- else if (RExC_flags & RXf_PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SBOL);
else
- ret = reg_node(pRExC_state, BOL);
+ ret = reg_node(pRExC_state, SBOL);
Set_Node_Length(ret, 1); /* MJD */
break;
case '$':
RExC_seen_zerolen++;
if (RExC_flags & RXf_PMf_MULTILINE)
ret = reg_node(pRExC_state, MEOL);
- else if (RExC_flags & RXf_PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SEOL);
else
- ret = reg_node(pRExC_state, EOL);
+ ret = reg_node(pRExC_state, SEOL);
Set_Node_Length(ret, 1); /* MJD */
break;
case '.':
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, SBOL);
+ /* SBOL is shared with /^/ so we set the flags so we can tell
+ * /\A/ from /^/ in split. We check ret because first pass we
+ * have no regop struct to set the flags on. */
+ if (PASS2)
+ ret->flags = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
}
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
+ if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
}
}
}
+STATIC AV *
+S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count)
+{
+ /* This adds the string scalar <multi_string> to the array
+ * <multi_char_matches>. <multi_string> is known to have exactly
+ * <cp_count> code points in it. This is used when constructing a
+ * bracketed character class and we find something that needs to match more
+ * than a single character.
+ *
+ * <multi_char_matches> is actually an array of arrays. Each top-level
+ * element is an array that contains all the strings known so far that are
+ * the same length. And that length (in number of code points) is the same
+ * as the index of the top-level array. Hence, the [2] element is an
+ * array, each element thereof is a string containing TWO code points; while element
+ * [3] is for strings of THREE characters, and so on. Since this is for
+ * multi-char strings there can never be a [0] nor [1] element.
+ *
+ * When we rewrite the character class below, we will do so such that the
+ * longest strings are written first, so that it prefers the longest
+ * matching strings first. This is done even if it turns out that any
+ * quantifier is non-greedy, out of this programmer's (khw) laziness. Tom
+ * Christiansen has agreed that this is ok. This makes the test for the
+ * ligature 'ffi' come before the test for 'ff', for example */
+
+ AV* this_array;
+ AV** this_array_ptr;
+
+ PERL_ARGS_ASSERT_ADD_MULTI_MATCH;
+
+ if (! multi_char_matches) {
+ multi_char_matches = newAV();
+ }
+
+ if (av_exists(multi_char_matches, cp_count)) {
+ this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+ this_array = *this_array_ptr;
+ }
+ else {
+ this_array = newAV();
+ av_store(multi_char_matches, cp_count,
+ (SV*) this_array);
+ }
+ av_push(this_array, multi_string);
+
+ return multi_char_matches;
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
}
}
else { /* cp_count > 1 */
- /* We only pay attention to the first char of
- * multichar strings being returned in char
- * classes. I kinda wonder if this makes sense as
- * it does change the behaviour from earlier
- * versions, OTOH that behaviour was broken as
- * well. XXX Solution is to recharacterize as
- * [rest-of-class]|multi1|multi2... */
+ if (! RExC_in_multi_char_class) {
+ if (invert || range || *RExC_parse == '-') {
if (strict) {
RExC_parse--;
- vFAIL("\\N{} in character class restricted to one character");
+ vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
}
else if (PASS2) {
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
}
+ }
+ else {
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ as_text,
+ cp_count);
+ }
break; /* <value> contains the first code
point. Drop out of the switch to
process it */
+ }
} /* End of cp_count != 1 */
/* This element should not be processed further in this
* again. Otherwise add this character to the list of
* multi-char folds. */
if (! RExC_in_multi_char_class) {
- AV** this_array_ptr;
- AV* this_array;
STRLEN cp_count = utf8_length(foldbuf,
foldbuf + foldlen);
SV* multi_fold = sv_2mortal(newSVpvs(""));
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
+ multi_char_matches
+ = add_multi_match(multi_char_matches,
+ multi_fold,
+ cp_count);
- if (! multi_char_matches) {
- multi_char_matches = newAV();
- }
-
- /* <multi_char_matches> is actually an array of arrays.
- * There will be one or two top-level elements: [2],
- * and/or [3]. The [2] element is an array, each
- * element thereof is a character which folds to TWO
- * characters; [3] is for folds to THREE characters.
- * (Unicode guarantees a maximum of 3 characters in any
- * fold.) When we rewrite the character class below,
- * we will do so such that the longest folds are
- * written first, so that it prefers the longest
- * matching strings first. This is done even if it
- * turns out that any quantifier is non-greedy, out of
- * programmer laziness. Tom Christiansen has agreed
- * that this is ok. This makes the test for the
- * ligature 'ffi' come before the test for 'ff' */
- if (av_exists(multi_char_matches, cp_count)) {
- this_array_ptr = (AV**) av_fetch(multi_char_matches,
- cp_count, FALSE);
- this_array = *this_array_ptr;
- }
- else {
- this_array = newAV();
- av_store(multi_char_matches, cp_count,
- (SV*) this_array);
- }
- av_push(this_array, multi_fold);
}
/* This element should not be processed further in this
RExC_parse = SvPV(substitute_parse, len);
RExC_end = RExC_parse + len;
RExC_in_multi_char_class = 1;
+ RExC_override_recoding = 1;
RExC_emit = (regnode *)orig_emit;
ret = reg(pRExC_state, 1, ®_flags, depth+1);
RExC_parse = save_parse;
RExC_end = save_end;
RExC_in_multi_char_class = 0;
+ RExC_override_recoding = 0;
SvREFCNT_dec_NN(multi_char_matches);
return ret;
}
}
if (r->intflags & PREGf_ANCH) {
PerlIO_printf(Perl_debug_log, "anchored");
- if (r->intflags & PREGf_ANCH_BOL)
- PerlIO_printf(Perl_debug_log, "(BOL)");
if (r->intflags & PREGf_ANCH_MBOL)
PerlIO_printf(Perl_debug_log, "(MBOL)");
if (r->intflags & PREGf_ANCH_SBOL)
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+ else if (OP(o) == SBOL)
+ Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
#else
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(sv);
Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
}
-/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
-
-#ifndef PERL_IN_XSUB_RE
-void
-Perl_save_re_context(pTHX)
-{
- /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
- if (PL_curpm) {
- const REGEXP * const rx = PM_GETRE(PL_curpm);
- if (rx) {
- U32 i;
- for (i = 1; i <= RX_NPARENS(rx); i++) {
- char digits[TYPE_CHARS(long)];
- const STRLEN len = my_snprintf(digits, sizeof(digits),
- "%lu", (long)i);
- GV *const *const gvp
- = (GV**)hv_fetch(PL_defstash, digits, len, 0);
-
- if (gvp) {
- GV * const gv = *gvp;
- if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
- save_scalar(gv);
- }
- }
- }
- }
-}
-#endif
-
#ifdef DEBUGGING
/* Certain characters are output as a sequence with the first being a
* backslash. */