DEBUG_STUDYDATA("commit: ",data,0);
}
-/* These macros set, clear and test whether the synthetic start class ('SSC',
- * given by the parameter) matches an empty string (EOS). This uses the
- * 'next_off' field in the node, to save a bit in the flags field. The ssc
- * stands alone, so there is never a next_off, so this field is otherwise
- * unused. The EOS information is used only for compilation, but theoretically
- * it could be passed on to the execution code. This could be used to store
- * more than one bit of information, but only this one is currently used. This
- * flag could be moved back to the bitmap instead, shared with INVERT, as no
- * SSC is ever inverted */
-#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
-#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
-#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
-
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
* list that describes which code points it matches */
ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
_append_range_to_invlist(ssc->invlist, 0, UV_MAX);
- SET_SSC_EOS(ssc); /* Plus match empty string */
+ ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
}
STATIC int
S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
{
- /* Returns TRUE if the SSC 'ssc' can match the empty string or any code
- * point */
+ /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
+ * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
+ * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
+ * in any way, so there's no point in using it */
UV start, end;
bool ret;
assert(OP(ssc) == ANYOF_SYNTHETIC);
- if (! TEST_SSC_EOS(ssc)) {
+ if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
return FALSE;
}
S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
{
/* Take the flags 'and_with' and accumulate them anded into the flags for
- * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored. */
+ * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
+ * The flags 'and_with' should not come from another SSC (otherwise the
+ * EMPTY_STRING flag won't work) */
const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
/* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
* another SSC or a regular ANYOF class. Can create false positives. */
- /* If 'and_with' is an SSC, we already have its inversion list; otherwise
- * have to calculate it */
- SV* anded_cp_list = (OP(and_with) == ANYOF_SYNTHETIC)
- ? and_with->invlist
- : get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) and_with);
+ SV* anded_cp_list;
+ U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
assert(OP(ssc) == ANYOF_SYNTHETIC);
- assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT)); /* SSCs are never inverted */
+
+ /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(and_with) == ANYOF_SYNTHETIC) {
+ anded_cp_list = and_with->invlist;
+ anded_flags = ANYOF_FLAGS(and_with);
+ }
+ else {
+ anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) and_with);
+ anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) &= anded_flags;
/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
* C2 is the list of code points in 'and-with'; P2, its posix classes.
* <= (C1 & ~C2) | (P1 & ~P2)
* */
- if (ANYOF_FLAGS(and_with) & ANYOF_INVERT) {
+ if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+ && OP(and_with) != ANYOF_SYNTHETIC)
+ {
unsigned int i;
- assert(OP(and_with) != ANYOF_SYNTHETIC);
-
ssc_intersection(ssc,
anded_cp_list,
FALSE /* Has already been inverted */
ssc_intersection(ssc, anded_cp_list, FALSE);
}
}
-
- ssc_flags_and(ssc, ANYOF_FLAGS(and_with));
}
STATIC void
* another SSC or a regular ANYOF class. Can create false positives if
* 'or_with' is to be inverted. */
- /* If 'or_with' is an SSC, we already have its inversion list; otherwise
- * have to calculate it */
- SV* ored_cp_list = (OP(or_with) == ANYOF_SYNTHETIC)
- ? or_with->invlist
- : get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl*) or_with);
+ SV* ored_cp_list;
+ U8 ored_flags;
PERL_ARGS_ASSERT_SSC_OR;
assert(OP(ssc) == ANYOF_SYNTHETIC);
- assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT));
+
+ /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
+ * the code point inversion list and just the relevant flags */
+ if (OP(or_with) == ANYOF_SYNTHETIC) {
+ ored_cp_list = or_with->invlist;
+ ored_flags = ANYOF_FLAGS(or_with);
+ }
+ else {
+ ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
+ (regnode_charclass_posixl*) or_with);
+ ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
+ }
+
+ ANYOF_FLAGS(ssc) |= ored_flags;
/* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
* C2 is the list of code points in 'or-with'; P2, its posix classes.
* (which results in actually simpler code than the non-inverted case)
* */
- /* Use just the SSC-related flags from 'or_with' */
- ANYOF_FLAGS(ssc) |= (ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS);
-
- if (ANYOF_FLAGS(or_with) & ANYOF_INVERT) {
- assert(OP(or_with) != ANYOF_SYNTHETIC);
+ if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+ && OP(or_with) != ANYOF_SYNTHETIC)
+ {
/* We ignore P2, leaving P1 going forward */
}
else { /* Not inverted */
assert(OP(ssc) == ANYOF_SYNTHETIC);
+ /* The code in this file assumes that all but these flags aren't relevant
+ * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
+ * time we reach here */
+ assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
+
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
* can't match null string */
if (flags & SCF_DO_STCLASS_AND) {
ssc_cp_and(data->start_class, uc);
- CLEAR_SSC_EOS(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
ssc_clear_locale(data->start_class);
}
else if (flags & SCF_DO_STCLASS_OR) {
}
}
if (flags & SCF_DO_STCLASS_AND) {
- CLEAR_SSC_EOS(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
ANYOF_POSIXL_ZERO(data->start_class);
ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
}
flags &= ~SCF_DO_STCLASS_AND;
StructCopy(&this_class, data->start_class, regnode_ssc);
flags |= SCF_DO_STCLASS_OR;
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
} else { /* Non-zero len */
if (flags & SCF_DO_STCLASS_OR) {
ssc_intersection(data->start_class,
PL_XPosix_ptrs[_CC_VERTSPACE], FALSE);
ssc_clear_locale(data->start_class);
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
}
else if (flags & SCF_DO_STCLASS_OR) {
ssc_union(data->start_class,
U8 namedclass;
if (flags & SCF_DO_STCLASS_AND) {
- CLEAR_SSC_EOS(data->start_class); /* No match on empty */
+ ANYOF_FLAGS(data->start_class) &= ~ANYOF_EMPTY_STRING;
}
/* Some of the logic below assumes that switching
classnum = FLAGS(scan);
namedclass = classnum_to_namedclass(classnum) + invert;
if (flags & SCF_DO_STCLASS_AND) {
- bool was_there = ANYOF_POSIXL_TEST(data->start_class,
- namedclass);
+ bool was_there = cBOOL(
+ ANYOF_POSIXL_TEST(data->start_class,
+ namedclass));
ANYOF_POSIXL_ZERO(data->start_class);
if (was_there) { /* Do an AND */
ANYOF_POSIXL_SET(data->start_class, namedclass);
ssc_init(pRExC_state, data->start_class);
} else {
/* AND before and after: combine and continue */
- const int was = TEST_SSC_EOS(data->start_class);
-
ssc_and(pRExC_state, data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
}
}
}
*minnextp += min;
if (f & SCF_DO_STCLASS_AND) {
- const int was = TEST_SSC_EOS(data->start_class);
-
ssc_and(pRExC_state, data->start_class, &intrnl);
- if (was)
- SET_SSC_EOS(data->start_class);
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
- && ! TEST_SSC_EOS(data.start_class)
+ && ! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
&& !ssc_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
- if (! TEST_SSC_EOS(data.start_class)
- && !ssc_is_anything(data.start_class))
- {
+ if (! (ANYOF_FLAGS(data.start_class) & ANYOF_EMPTY_STRING)
+ && ! ssc_is_anything(data.start_class))
+ {
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
ssc_finalize(pRExC_state, data.start_class);
if (*output == a) {
if (a != NULL) {
- if (! (make_temp = SvTEMP(a))) {
+ if (! (make_temp = cBOOL(SvTEMP(a)))) {
SvREFCNT_dec_NN(a);
}
}
else if ((len_b = _invlist_len(b)) == 0) {
bool make_temp = FALSE;
if (*output == b) {
- if (! (make_temp = SvTEMP(b))) {
+ if (! (make_temp = cBOOL(SvTEMP(b)))) {
SvREFCNT_dec_NN(b);
}
}
* so the union with <a> includes everything too */
if (complement_b) {
if (a == *output) {
- if (! (make_temp = SvTEMP(a))) {
+ if (! (make_temp = cBOOL(SvTEMP(a)))) {
SvREFCNT_dec_NN(a);
}
}
* simply 'a'. */
if (*i != a) {
if (*i == b) {
- if (! (make_temp = SvTEMP(b))) {
+ if (! (make_temp = cBOOL(SvTEMP(b)))) {
SvREFCNT_dec_NN(b);
}
}
/* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
* intersection must be empty */
if (*i == a) {
- if (! (make_temp = SvTEMP(a))) {
+ if (! (make_temp = cBOOL(SvTEMP(a)))) {
SvREFCNT_dec_NN(a);
}
}
else if (*i == b) {
- if (! (make_temp = SvTEMP(b))) {
+ if (! (make_temp = cBOOL(SvTEMP(b)))) {
SvREFCNT_dec_NN(b);
}
}
default:
fail_modifiers:
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
/*NOTREACHED*/
SV *sv_dat = reg_scan_name(pRExC_state,
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
if (RExC_parse == name_start || *RExC_parse != ')')
+ /* diag_listed_as: Sequence ?P=... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.3s... not terminated",parse_start);
if (!SIZE_ONLY) {
return ret;
}
RExC_parse++;
+ /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
case '<': /* (?<...) */
REG_RSN_RETURN_NULL);
if (RExC_parse == name_start) {
RExC_parse++;
+ /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
/*NOTREACHED*/
}
is_logical = 1;
if (*RExC_parse != '{') {
RExC_parse++;
+ /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%"UTF8f"...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
/* The [^\n] meaning of \N ignores spaces and comments under the /x
- * modifier. The other meaning does not */
+ * modifier. The other meaning does not, so use a temporary until we find
+ * out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
? regwhite( pRExC_state, RExC_parse )
: RExC_parse;
if (*p != '{' || regcurly(p, FALSE)) {
RExC_parse = p;
if (! node_p) {
- /* no bare \N in a charclass */
+ /* no bare \N allowed in a charclass */
if (in_char_class) {
vFAIL("\\N in a character class must be a named character: \\N{...}");
}
return FALSE;
}
+ RExC_parse--; /* Need to back off so nextchar() doesn't skip the
+ current char */
nextchar(pRExC_state);
*node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
- RExC_parse--;
Set_Node_Length(*node_p, 1); /* MJD */
return TRUE;
}
}
}
+
+/* Convert the run of decimal digits at the start of 'p' into a backreference
+ * number.  If 'p' does not begin with a digit, or the run is longer than nine
+ * digits (too big to sensibly be a backref), I32_MAX is returned instead of
+ * calling atoi(), so the result can never wrap a 32-bit integer. */
+
+static I32
+S_backref_value(char *p)
+{
+    char *end = p;
+
+    /* Advance 'end' to the first non-digit; (end - p) is the digit count */
+    while (isDIGIT(*end)) {
+        end++;
+    }
+
+    if (end == p || end - p > 9) {
+        return I32_MAX;
+    }
+
+    return atoi(p);
+}
+
+
/*
- regatom - the lowest level
char ch= RExC_parse[1];
if (ch != '<' && ch != '\'' && ch != '{') {
RExC_parse++;
+ /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.2s... not terminated",parse_start);
} else {
/* this pretty much dupes the code for (?P=...) in reg(), if
SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
if (RExC_parse == name_start || *RExC_parse != ch)
+ /* diag_listed_as: Sequence \%s... not terminated in regex; marked by <-- HERE in m/%s/ */
vFAIL2("Sequence %.3s... not terminated",parse_start);
if (!SIZE_ONLY) {
case '5': case '6': case '7': case '8': case '9':
{
I32 num;
- bool isg = *RExC_parse == 'g';
- bool isrel = 0;
bool hasbrace = 0;
- if (isg) {
+
+ if (*RExC_parse == 'g') {
+ bool isrel = 0;
+
RExC_parse++;
if (*RExC_parse == '{') {
RExC_parse++;
if (isrel) RExC_parse--;
RExC_parse -= 2;
goto parse_named_seq;
- } }
- num = atoi(RExC_parse);
- if (isg && num == 0) {
- if (*RExC_parse == '0') {
+ }
+
+ num = S_backref_value(RExC_parse);
+ if (num == 0)
vFAIL("Reference to invalid group 0");
+ else if (num == I32_MAX) {
+ if (isDIGIT(*RExC_parse))
+ vFAIL("Reference to nonexistent group");
+ else
+ vFAIL("Unterminated \\g... pattern");
}
- else {
- vFAIL("Unterminated \\g... pattern");
+
+ if (isrel) {
+ num = RExC_npar - num;
+ if (num < 1)
+ vFAIL("Reference to nonexistent or unclosed group");
}
}
- if (isrel) {
- num = RExC_npar - num;
- if (num < 1)
- vFAIL("Reference to nonexistent or unclosed group");
+ else {
+ num = S_backref_value(RExC_parse);
+ /* bare \NNN might be backref or octal */
+ if (num == I32_MAX || (num > 9 && num >= RExC_npar
+ && *RExC_parse != '8' && *RExC_parse != '9'))
+ /* Probably a character specified in octal, e.g. \35 */
+ goto defchar;
}
- if (!isg && num > 9 && num >= RExC_npar && *RExC_parse != '8' && *RExC_parse != '9')
- /* Probably a character specified in octal, e.g. \35 */
- goto defchar;
- else {
+
+ /* at this point RExC_parse definitely points to a backref
+ * number */
+ {
#ifdef RE_TRACK_PATTERN_OFFSETS
char * const parse_start = RExC_parse - 1; /* MJD */
#endif
* 118 OR as "\11" . "8" depending on whether there
* were 118 capture buffers defined already in the
* pattern. */
- if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
+ if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
{ /* Not to be treated as an octal constant, go
find backref */
--p;
/*
- regprop - printable representation of opcode
*/
-#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
-STMT_START { \
- if (do_sep) { \
- Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
- if (flags & ANYOF_INVERT) \
- /*make sure the invert info is in each */ \
- sv_catpvs(sv, "^"); \
- do_sep = 0; \
- } \
-} STMT_END
void
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
|| _CC_VERTSPACE != 16
#error Need to adjust order of anyofs[]
#endif
- "[\\w]",
- "[\\W]",
- "[\\d]",
- "[\\D]",
+ "\\w",
+ "\\W",
+ "\\d",
+ "\\D",
"[:alpha:]",
"[:^alpha:]",
"[:lower:]",
"[:^graph:]",
"[:cased:]",
"[:^cased:]",
- "[\\s]",
- "[\\S]",
+ "\\s",
+ "\\S",
"[:blank:]",
"[:^blank:]",
"[:xdigit:]",
"[:^cntrl:]",
"[:ascii:]",
"[:^ascii:]",
- "[\\v]",
- "[\\V]"
+ "\\v",
+ "\\V"
};
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
/* output what the standard cp 0-255 bitmap matches */
do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
- EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
- /* output any special charclass tests (used entirely under use locale) */
+ /* output any special charclass tests (used entirely under use
+ * locale) */
if (ANYOF_POSIXL_TEST_ANY_SET(o)) {
int i;
- for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++) {
+ for (i = 0; i < ANYOF_POSIXL_MAX; i++) {
if (ANYOF_POSIXL_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);
do_sep = 1;
}
}
- EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
+ /* Only emit the separator and the extra descriptions when there is
+ * something beyond the bitmap/posix classes to show.  The guarded body
+ * tests ANYOF_NON_UTF8_LATIN1_ALL, so that flag has to be able to open
+ * this block; the mask previously listed ANYOF_ABOVE_LATIN1_ALL twice,
+ * which left the non-UTF-8 Latin-1 case unreachable on its own. */
+ if (flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_NON_UTF8_LATIN1_ALL)
+ || ANYOF_NONBITMAP(o))
+ {
+ if (do_sep) {
+ Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
+ if (flags & ANYOF_INVERT)
+ /*make sure the invert info is in each */
+ sv_catpvs(sv, "^");
+ }
if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
sv_catpvs(sv, "{non-utf8-latin1-all}");
SvREFCNT_dec_NN(lv);
}
}
+ }
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
}
else {
+ if (*anyofs[index] != '[') {
+ sv_catpv(sv, "[");
+ }
sv_catpv(sv, anyofs[index]);
+ if (*anyofs[index] != '[') {
+ sv_catpv(sv, "]");
+ }
}
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))