# include "regcomp.h"
#endif
-#include "dquote_static.c"
-#include "inline_invlist.c"
+#include "dquote_inline.h"
+#include "invlist_inline.h"
#include "unicode_constants.h"
#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
I32 contains_locale;
I32 contains_i;
I32 override_recoding;
+#ifdef EBCDIC
+ I32 recode_x_to_native;
+#endif
I32 in_multi_char_class;
struct reg_code_block *code_blocks; /* positions of literal (?{})
within pattern */
#define RExC_mysv2 (pRExC_state->mysv2)
#endif
+ bool seen_unfolded_sharp_s;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+
+/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
+ * EXACTF node, hence was parsed under /di rules. If later in the parse,
+ * something forces the pattern into using /ui rules, the sharp s should be
+ * folded into the sequence 'ss', which takes up more space than previously
+ * calculated. This means that the sizing pass needs to be restarted. (The
+ * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
+ * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
+ * so there is no need to resize [perl #125990]. */
+#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
+
#ifdef RE_TRACK_PATTERN_OFFSETS
#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
others */
#define RExC_contains_locale (pRExC_state->contains_locale)
#define RExC_contains_i (pRExC_state->contains_i)
#define RExC_override_recoding (pRExC_state->override_recoding)
+#ifdef EBCDIC
+# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
+#endif
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define RExC_frame_head (pRExC_state->frame_head)
#define RExC_frame_last (pRExC_state->frame_last)
#define SPSTART 0x04 /* Starts with * or + */
#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
#define TRYAGAIN 0x10 /* Weeded out a declaration. */
-#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
+#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
+#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
+ calcuate sizes as UTF-8 */
#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
-#define REQUIRE_UTF8 STMT_START { \
+#define REQUIRE_UTF8(flagp) STMT_START { \
if (!UTF) { \
- *flagp = RESTART_UTF8; \
+ assert(PASS1); \
+ *flagp = RESTART_PASS1|NEED_UTF8; \
return NULL; \
} \
- } STMT_END
+ } STMT_END
+
+/* Change from /d into /u rules, and restart the parse if we've already seen
+ * something whose size would increase as a result, by setting *flagp and
+ * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
+ * we've change to /u during the parse. */
+#define REQUIRE_UNI_RULES(flagp, restart_retval) \
+ STMT_START { \
+ if (DEPENDS_SEMANTICS) { \
+ assert(PASS1); \
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
+ RExC_uni_semantics = 1; \
+ if (RExC_seen_unfolded_sharp_s) { \
+ *flagp |= RESTART_PASS1; \
+ return restart_retval; \
+ } \
+ } \
+ } STMT_END
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
-#define vFAIL2utf8f(m, a1) STMT_START { \
+#define vFAIL2utf8f(m, a1) STMT_START { \
const IV offset = RExC_parse - RExC_precomp; \
if (!SIZE_ONLY) \
SAVEFREESV(RExC_rx_sv); \
REPORT_LOCATION_ARGS(offset)); \
} STMT_END
+#define vFAIL3utf8f(m, a1, a2) STMT_START { \
+ const IV offset = RExC_parse - RExC_precomp; \
+ if (!SIZE_ONLY) \
+ SAVEFREESV(RExC_rx_sv); \
+ S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
/* These have asserts in them because of [perl #122671] Many warnings in
* regcomp.c can occur twice. If they get output in pass1 and later in that
* pass, the pattern has to be converted to UTF-8 and the pass restarted, they
if (RExC_seen & REG_GPOS_SEEN) \
PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
\
- if (RExC_seen & REG_CANY_SEEN) \
- PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
- \
if (RExC_seen & REG_RECURSE_SEEN) \
PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
\
/* If this can match all upper Latin1 code points, have to add them
* as well */
- if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+ if (OP(node) == ANYOFD
+ && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ {
_invlist_union(invlist, PL_UpperLatin1, &invlist);
}
* that should be; while the consequences for having /l bugs is
* incorrect matches */
if (ssc_is_anything((regnode_ssc *)and_with)) {
- anded_flags |= ANYOF_WARN_SUPER;
+ anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
}
}
else {
anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
- anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+ if (OP(and_with) == ANYOFD) {
+ anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+ }
+ else {
+ anded_flags = ANYOF_FLAGS(and_with)
+ &( ANYOF_COMMON_FLAGS
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
+ }
}
ANYOF_FLAGS(ssc) &= anded_flags;
else {
ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+ if (OP(or_with) != ANYOFD) {
+ ored_flags
+ |= ANYOF_FLAGS(or_with)
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ }
}
ANYOF_FLAGS(ssc) |= ored_flags;
/* The code in this file assumes that all but these flags aren't relevant
* to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
* by the time we reach here */
- assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
+ assert(! (ANYOF_FLAGS(ssc)
+ & ~( ANYOF_COMMON_FLAGS
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
}
+ if (RExC_contains_locale) {
+ OP(ssc) = ANYOFL;
+ }
+
assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
}
#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if (UTF) { \
- SV *zlopp = newSV(7); /* XXX: optimize me */ \
+ SV *zlopp = newSV(UTF8_MAXBYTES); \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
* this function, we need to flag any occurrences of the sharp s.
* This character forbids trie formation (because of added
* complexity) */
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
while (s < s_end) {
if (*s == LATIN_SMALL_LETTER_SHARP_S) {
OP(scan) = EXACTFA_NO_TRIE;
break;
}
s++;
- continue;
}
}
else {
*min_subtract += len - 1;
s += len;
}
+#endif
}
}
Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
OP(scan));
#endif
- case CANY:
case SANY:
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
ssc_match_all_cp(data->start_class);
}
break;
+ case ANYOFD:
case ANYOFL:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ENTER;
SAVETMPS;
+ save_re_context();
PUSHSTACKi(PERLSI_REQUIRE);
/* G_RE_REPARSING causes the toker to collapse \\ into \ when
* parsing qr''; normally only q'' does this. It also alters
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+
RExC_uni_semantics = 0;
+ RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_contains_i = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
});
redo_first_pass:
- /* we jump here if we upgrade the pattern to utf8 and have to
- * recompile */
+ /* we jump here if we have to recompile, e.g., from upgrading the pattern
+ * to utf8 */
if ((pm_flags & PMf_USE_RE_EVAL)
/* this second condition covers the non-regex literal case,
if (rx_flags & PMf_FOLD) {
RExC_contains_i = 1;
}
- if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ if ( initial_charset == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 ||RExC_uni_semantics))
+ {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
RExC_pm_flags = pm_flags;
if (runtime_code) {
- if (TAINTING_get && TAINT_get)
+ assert(TAINTING_get || !TAINT_get);
+ if (TAINT_get)
Perl_croak(aTHX_ "Eval-group in insecure regular expression");
if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_extralen = 0;
RExC_override_recoding = 0;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 0;
+#endif
RExC_in_multi_char_class = 0;
/* First pass: determine size, legality. */
at least some part of the pattern, and therefore must convert the whole
thing.
-- dmq */
- if (flags & RESTART_UTF8) {
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ if (flags & RESTART_PASS1) {
+ if (flags & NEED_UTF8) {
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
pRExC_state->num_code_blocks);
+ }
+ else {
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ "Need to redo pass 1\n"));
+ }
+
goto redo_first_pass;
}
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags);
for (n = 0; n < pRExC_state->num_code_blocks; n++)
if (pRExC_state->code_blocks[n].src_regex)
SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
- SAVEFREEPV(pRExC_state->code_blocks);
+ if(pRExC_state->code_blocks)
+ SAVEFREEPV(pRExC_state->code_blocks); /* often null */
}
{
|| ! has_charset);
bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
== REG_RUN_ON_COMMENT_SEEN);
- U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
+ U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
>> RXf_PMf_STD_PMMOD_SHIFT);
const char *fptr = STD_PAT_MODS; /*"msixn"*/
char *p;
- /* Allocate for the worst case, which is all the std flags are turned
- * on. If more precision is desired, we could do a population count of
- * the flags set. This could be done with a small lookup table, or by
- * shifting, masking and adding, or even, when available, assembly
- * language for a machine-language population count.
- * We never output a minus, as all those are defaults, so are
+
+ /* We output all the necessary flags; we never output a minus, as all
+ * those are defaults, so are
* covered by the caret */
const STRLEN wraplen = plen + has_p + has_runon
+ has_default /* If needs a caret */
+ + PL_bitcount[reganch] /* 1 char for each set standard flag */
/* If needs a character set specifier */
+ ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
- + (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
+ /* make sure PL_bitcount bounds not exceeded */
+ assert(sizeof(STD_PAT_MODS) <= 8);
+
Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
r->xpv_len_u.xpvlenu_pv = p;
if (RExC_utf8)
lookbehind */
if (pRExC_state->num_code_blocks)
r->extflags |= RXf_EVAL_SEEN;
- if (RExC_seen & REG_CANY_SEEN)
- r->intflags |= PREGf_CANY_SEEN;
if (RExC_seen & REG_VERBARG_SEEN)
{
r->intflags |= PREGf_VERBARG_SEEN;
* flags appropriately - Yves */
regnode *first = ri->program + 1;
U8 fop = OP(first);
- regnode *next = NEXTOPER(first);
+ regnode *next = regnext(first);
U8 nop = OP(next);
if (PL_regkind[fop] == NOTHING && nop == END)
r->extflags |= RXf_START_ONLY;
else if (fop == PLUS
&& PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
- && OP(regnext(first)) == END)
+ && nop == END)
r->extflags |= RXf_WHITE;
else if ( r->extflags & RXf_SPLIT
&& (fop == EXACT || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
- && OP(regnext(first)) == END )
+ && nop == END )
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
}
sv_setpvn(sv, s, i);
TAINT_set(oldtainted);
#endif
- if ( (rx->intflags & PREGf_CANY_SEEN)
- ? (RXp_MATCH_UTF8(rx)
- && (!i || is_utf8_string((U8*)s, i)))
- : (RXp_MATCH_UTF8(rx)) )
- {
+ if (RXp_MATCH_UTF8(rx))
SvUTF8_on(sv);
- }
else
SvUTF8_off(sv);
if (TAINTING_get) {
* Some of the methods should always be private to the implementation, and some
* should eventually be made public */
-/* The header definitions are in F<inline_invlist.c> */
+/* The header definitions are in F<invlist_inline.h> */
PERL_STATIC_INLINE UV*
S__invlist_array_init(SV* const invlist, const bool will_have_0)
}
/* Here, can't just append things, create and return a new inversion list
- * which is the union of this range and the existing inversion list */
+ * which is the union of this range and the existing inversion list. (If
+ * the new range is well-behaved wrt to the old one, we could just insert
+ * it, doing a Move() down on the tail of the old one (potentially growing
+ * it first). But to determine that means we would have the extra
+ * (possibly throw-away) work of first finding where the new one goes and
+ * whether it disrupts (splits) an existing range, so it doesn't appear to
+ * me (khw) that it's worth it) */
range_invlist = _new_invlist(2);
_append_range_to_invlist(range_invlist, start, end);
++RExC_parse;
}
- if (PASS2) {
- STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
- }
+ vFAIL("Sequence (?... not terminated");
}
/*
#endif
/* Returns NULL, setting *flagp to TRYAGAIN at the end of (?) that only sets
- flags. Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan
- needs to be restarted.
- Otherwise would only return NULL if regbranch() returns NULL, which
- cannot happen. */
+ flags. Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan
+ needs to be restarted, or'd with NEED_UTF8 if the pattern needs to be
+ upgraded to UTF-8. Otherwise would only return NULL if regbranch() returns
+ NULL, which cannot happen. */
STATIC regnode *
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
/* paren: Parenthesized? 0=top; 1,2=inside '(': changed to letter.
STRLEN verb_len = 0;
char *start_arg = NULL;
unsigned char op = 0;
- int argok = 1;
- int internal_argval = 0; /* internal_argval is only useful if
- !argok */
+ int arg_required = 0;
+ int internal_argval = -1; /* if >-1 we are not allowed an argument*/
if (has_intervening_patws) {
RExC_parse++;
case 'F': /* (*FAIL) */
if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
op = OPFAIL;
- argok = 0;
}
break;
case ':': /* (*:NAME) */
case 'M': /* (*MARK:NAME) */
if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
op = MARKPOINT;
- argok = -1;
+ arg_required = 1;
}
break;
case 'P': /* (*PRUNE) */
"Unknown verb pattern '%"UTF8f"'",
UTF8fARG(UTF, verb_len, start_verb));
}
- if ( argok ) {
- if ( start_arg && internal_argval ) {
- vFAIL3("Verb pattern '%.*s' may not have an argument",
- verb_len, start_verb);
- } else if ( argok < 0 && !start_arg ) {
- vFAIL3("Verb pattern '%.*s' has a mandatory argument",
- verb_len, start_verb);
- } else {
- ret = reganode(pRExC_state, op, internal_argval);
- if ( ! internal_argval && ! SIZE_ONLY ) {
- if (start_arg) {
- SV *sv = newSVpvn( start_arg,
- RExC_parse - start_arg);
- ARG(ret) = add_data( pRExC_state,
- STR_WITH_LEN("S"));
- RExC_rxi->data->data[ARG(ret)]=(void*)sv;
- ret->flags = 0;
- } else {
- ret->flags = 1;
- }
- }
- }
- if (!internal_argval)
- RExC_seen |= REG_VERBARG_SEEN;
- } else if ( start_arg ) {
- vFAIL3("Verb pattern '%.*s' may not have an argument",
- verb_len, start_verb);
- } else {
- ret = reg_node(pRExC_state, op);
- }
+ if ( arg_required && !start_arg ) {
+ vFAIL3("Verb pattern '%.*s' has a mandatory argument",
+ verb_len, start_verb);
+ }
+ if (internal_argval == -1) {
+ ret = reganode(pRExC_state, op, 0);
+ } else {
+ ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
+ }
+ RExC_seen |= REG_VERBARG_SEEN;
+ if ( ! SIZE_ONLY ) {
+ if (start_arg) {
+ SV *sv = newSVpvn( start_arg,
+ RExC_parse - start_arg);
+ ARG(ret) = add_data( pRExC_state,
+ STR_WITH_LEN("S"));
+ RExC_rxi->data->data[ARG(ret)]=(void*)sv;
+ ret->flags = 1;
+ } else {
+ ret->flags = 0;
+ }
+ if ( internal_argval != -1 )
+ ARG2L_SET(ret, internal_argval);
+ }
nextchar(pRExC_state);
return ret;
}
case '!': /* (?!...) */
RExC_seen_zerolen++;
/* check if we're really just a "FAIL" assertion */
- --RExC_parse;
- nextchar(pRExC_state);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
if (*RExC_parse == ')') {
- ret=reg_node(pRExC_state, OPFAIL);
+ ret=reganode(pRExC_state, OPFAIL, 0);
nextchar(pRExC_state);
return ret;
}
int is_define= 0;
const int DEFINE_len = sizeof("DEFINE") - 1;
if (RExC_parse[0] == '?') { /* (?(?...)) */
- if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
- || RExC_parse[1] == '<'
- || RExC_parse[1] == '{') { /* Lookahead or eval. */
+ if (
+ RExC_parse[1] == '=' ||
+ RExC_parse[1] == '!' ||
+ RExC_parse[1] == '<' ||
+ RExC_parse[1] == '{'
+ ) { /* Lookahead or eval. */
I32 flag;
regnode *tail;
ret->flags = 1;
tail = reg(pRExC_state, 1, &flag, depth+1);
- if (flag & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flag & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flag & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
REGTAIL(pRExC_state, ret, tail);
parno = (I32)uv;
RExC_parse = (char*)endptr;
}
- /* XXX else what? */
+ /* else "Switch condition not recognized" below */
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
/* (?(1)...) */
char c;
- char *tmp;
UV uv;
if (grok_atoUV(RExC_parse, &uv, &endptr)
&& uv <= I32_MAX
parno = (I32)uv;
RExC_parse = (char*)endptr;
}
- /* XXX else what? */
+ else {
+ vFAIL("panic: grok_atoUV returned FALSE");
+ }
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
- if (*(tmp = nextchar(pRExC_state)) != ')') {
- /* nextchar also skips comments, so undo its work
- * and skip over the the next character.
- */
- RExC_parse = tmp;
+ if (UCHARAT(RExC_parse) != ')') {
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
vFAIL("Switch condition not recognized");
}
+ nextchar(pRExC_state);
insert_if:
REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
br = regbranch(pRExC_state, &flags, 1,depth+1);
if (br == NULL) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
} else
REGTAIL(pRExC_state, br, reganode(pRExC_state,
LONGJMP, 0));
- c = *nextchar(pRExC_state);
+ c = UCHARAT(RExC_parse);
+ nextchar(pRExC_state);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
if (c == '|') {
lastbr = reganode(pRExC_state, IFTHEN, 0);
if (!regbranch(pRExC_state, &flags, 1,depth+1)) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"",
REGTAIL(pRExC_state, ret, lastbr);
if (flags&HASWIDTH)
*flagp |= HASWIDTH;
- c = *nextchar(pRExC_state);
+ c = UCHARAT(RExC_parse);
+ nextchar(pRExC_state);
}
else
lastbr = NULL;
/* branch_len = (paren != 0); */
if (br == NULL) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
br = regbranch(pRExC_state, &flags, 0, depth+1);
if (br == NULL) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags);
/* Check for proper termination. */
if (paren) {
- /* restore original flags, but keep (?p) */
+ /* restore original flags, but keep (?p) and, if we've changed from /d
+ * rules to /u, keep the /u */
RExC_flags = oregflags | (RExC_flags & RXf_PMf_KEEPCOPY);
- if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
+ if (DEPENDS_SEMANTICS && RExC_uni_semantics) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
+ }
+ if (RExC_parse >= RExC_end || UCHARAT(RExC_parse) != ')') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched (");
}
+ nextchar(pRExC_state);
}
else if (!paren && RExC_parse < RExC_end) {
if (*RExC_parse == ')') {
*
* Implements the concatenation operator.
*
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
*/
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
*flagp = WORST; /* Tentatively. */
- RExC_parse--;
- nextchar(pRExC_state);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
flags &= ~TRYAGAIN;
latest = regpiece(pRExC_state, &flags,depth+1);
if (latest == NULL) {
if (flags & TRYAGAIN)
continue;
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags);
*
* Returns NULL, setting *flagp to TRYAGAIN if regatom() returns NULL with
* TRYAGAIN.
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- * restarted.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ * restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
*/
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
ret = regatom(pRExC_state, &flags,depth+1);
if (ret == NULL) {
- if (flags & (TRYAGAIN|RESTART_UTF8))
- *flagp |= flags & (TRYAGAIN|RESTART_UTF8);
+ if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8))
+ *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8);
else
FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags);
return(NULL);
* enough space for all the things we are about to throw
* away, but we can shrink it by the ammount we are about
* to re-use here */
- RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+ RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
}
else {
ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
RExC_emit = orig_emit;
}
- ret = reg_node(pRExC_state, OPFAIL);
+ ret = reganode(pRExC_state, OPFAIL, 0);
return ret;
}
- else if (min == max
- && RExC_parse < RExC_end
- && (*RExC_parse == '?' || *RExC_parse == '+'))
+ else if (min == max && RExC_parse < RExC_end && *RExC_parse == '?')
{
if (PASS2) {
ckWARN2reg(RExC_parse + 1,
"Useless use of greediness modifier '%c'",
*RExC_parse);
}
- /* Absorb the modifier, so later code doesn't see nor use
- * it */
- nextchar(pRExC_state);
}
do_curly:
if ((flags&SIMPLE)) {
+ if (min == 0 && max == REG_INFTY) {
+ reginsert(pRExC_state, STAR, ret, depth+1);
+ ret->flags = 0;
+ MARK_NAUGHTY(4);
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+ goto nest_check;
+ }
+ if (min == 1 && max == REG_INFTY) {
+ reginsert(pRExC_state, PLUS, ret, depth+1);
+ ret->flags = 0;
+ MARK_NAUGHTY(3);
+ RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
+ goto nest_check;
+ }
MARK_NAUGHTY_EXP(2, 2);
reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
- if (op == '*' && (flags&SIMPLE)) {
- reginsert(pRExC_state, STAR, ret, depth+1);
- ret->flags = 0;
- MARK_NAUGHTY(4);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- }
- else if (op == '*') {
+ if (op == '*') {
min = 0;
goto do_curly;
}
- else if (op == '+' && (flags&SIMPLE)) {
- reginsert(pRExC_state, PLUS, ret, depth+1);
- ret->flags = 0;
- MARK_NAUGHTY(3);
- RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
- }
else if (op == '+') {
min = 1;
goto do_curly;
return(ret);
}
-STATIC STRLEN
-S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p,
- UV *valuep, I32 *flagp, U32 depth, SV** substitute_parse
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state,
+ regnode ** node_p,
+ UV * code_point_p,
+ int * cp_count,
+ I32 * flagp,
+ const U32 depth
)
{
-
- /* This is expected to be called by a parser routine that has recognized '\N'
- and needs to handle the rest. RExC_parse is expected to point at the first
- char following the N at the time of the call. On successful return,
- RExC_parse has been updated to point to just after the sequence identified
- by this routine, <*flagp> has been updated, and the non-NULL input pointers
- have been set appropriately.
-
- The typical case for this is \N{some character name}. This is usually
- called while parsing the input, filling in or ready to fill in an EXACTish
- node, and the code point for the character should be returned, so that it
- can be added to the node, and parsing continued with the next input
- character. But it may be that instead of a single character the \N{}
- expands to more than one, a named sequence. In this case any following
- quantifier applies to the whole sequence, and it is easier, given the code
- structure that calls this, to handle it from a different area of the code.
- For this reason, the input parameters can be set so that it returns valid
- only on one or the other of these cases.
-
- Another possibility is for the input to be an empty \N{}, which for
- backwards compatibility we accept, but generate a NOTHING node which should
- later get optimized out. This is handled from the area of code which can
- handle a named sequence, so if called with the parameters for the other, it
- fails.
-
- Still another possibility is for the \N to mean [^\n], and not a single
- character or explicit sequence at all. This is determined by context.
- Again, this is handled from the area of code which can handle a named
- sequence, so if called with the parameters for the other, it also fails.
-
- And the final possibility is for the \N to be called from within a bracketed
- character class. In this case the [^\n] meaning makes no sense, and so is
- an error. Other anomalous situations are left to the calling code to handle.
-
- For non-single-quoted regexes, the tokenizer has attempted to decide which
- of the above applies, and in the case of a named sequence, has converted it
- into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
- where c1... are the characters in the sequence. For single-quoted regexes,
- the tokenizer passes the \N sequence through unchanged; this code will not
- attempt to determine this nor expand those, instead raising a syntax error.
- The net effect is that if the beginning of the passed-in pattern isn't '{U+'
- or there is no '}', it signals that this \N occurrence means to match a
- non-newline. (This mostly was done because of [perl #56444].)
-
- The API is somewhat convoluted due to historical and the above reasons.
-
- The function raises an error (via vFAIL), and doesn't return for various
- syntax errors. For other failures, it returns (STRLEN) -1. For successes,
- it returns a count of how many characters were accounted for by it. (This
- can be 0 for \N{}; 1 for it meaning [^\n]; and otherwise the number of code
- points in the sequence. It sets <node_p>, <valuep>, and/or
- <substitute_parse> on success.
-
- If <valuep> is non-null, it means the caller can accept an input sequence
- consisting of just a single code point; <*valuep> is set to the value of the
- only or first code point in the input.
-
- If <substitute_parse> is non-null, it means the caller can accept an input
- sequence consisting of one or more code points; <*substitute_parse> is a
- newly created mortal SV* in this case, containing \x{} escapes representing
- those code points.
-
- Both <valuep> and <substitute_parse> can be non-NULL.
-
- If <node_p> is non-null, <substitute_parse> must be NULL. This signifies
- that the caller can accept any legal sequence other than a single code
- point. To wit, <*node_p> is set as follows:
- 1) \N means not-a-NL: points to a newly created REG_ANY node; return is 1
- 2) \N{}: points to a new NOTHING node; return is 0
- 3) otherwise: points to a new EXACT node containing the resolved
- string; return is the number of code points in the
- string. This will never be 1.
- Note that failure is returned for single code point sequences if <valuep> is
- null and <node_p> is not.
- */
-
- char * endbrace; /* '}' following the name */
- char* p;
+ /* This routine teases apart the various meanings of \N and returns
+ * accordingly. The input parameters constrain which meaning(s) is/are valid
+ * in the current context.
+ *
+ * Exactly one of <node_p> and <code_point_p> must be non-NULL.
+ *
+ * If <code_point_p> is not NULL, the context is expecting the result to be a
+ * single code point. If this \N instance turns out to a single code point,
+ * the function returns TRUE and sets *code_point_p to that code point.
+ *
+ * If <node_p> is not NULL, the context is expecting the result to be one of
+ * the things representable by a regnode. If this \N instance turns out to be
+ * one such, the function generates the regnode, returns TRUE and sets *node_p
+ * to point to that regnode.
+ *
+ * If this instance of \N isn't legal in any context, this function will
+ * generate a fatal error and not return.
+ *
+ * On input, RExC_parse should point to the first char following the \N at the
+ * time of the call. On successful return, RExC_parse will have been updated
+ * to point to just after the sequence identified by this routine. Also
+ * *flagp has been updated as needed.
+ *
+ * When there is some problem with the current context and this \N instance,
+ * the function returns FALSE, without advancing RExC_parse, nor setting
+ * *node_p, nor *code_point_p, nor *flagp.
+ *
+ * If <cp_count> is not NULL, the caller wants to know the length (in code
+ * points) that this \N sequence matches. This is set even if the function
+ * returns FALSE, as detailed below.
+ *
+ * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+ *
+ * Probably the most common case is for the \N to specify a single code point.
+ * *cp_count will be set to 1, and *code_point_p will be set to that code
+ * point.
+ *
+ * Another possibility is for the input to be an empty \N{}, which for
+ * backwards compatibility we accept. *cp_count will be set to 0. *node_p
+ * will be set to a generated NOTHING node.
+ *
+ * Still another possibility is for the \N to mean [^\n]. *cp_count will be
+ * set to 0. *node_p will be set to a generated REG_ANY node.
+ *
+ * The fourth possibility is that \N resolves to a sequence of more than one
+ * code points. *cp_count will be set to the number of code points in the
+ * sequence. *node_p * will be set to a generated node returned by this
+ * function calling S_reg().
+ *
+ * The final possibility is that it is premature to be calling this function;
+ * that pass1 needs to be restarted. This can happen when this changes from
+ * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
+ * latter occurs only when the fourth possibility would otherwise be in
+ * effect, and is because one of those code points requires the pattern to be
+ * recompiled as UTF-8. The function returns FALSE, and sets the
+ * RESTART_PASS1 and NEED_UTF8 flags in *flagp, as appropriate. When this
+ * happens, the caller needs to desist from continuing parsing, and return
+ * this information to its caller. This is not set for when there is only one
+ * code point, as this can be called as part of an ANYOF node, and they can
+ * store above-Latin1 code points without the pattern having to be in UTF-8.
+ *
+ * For non-single-quoted regexes, the tokenizer has resolved character and
+ * sequence names inside \N{...} into their Unicode values, normalizing the
+ * result into what we should see here: '\N{U+c1.c2...}', where c1... are the
+ * hex-represented code points in the sequence. This is done there because
+ * the names can vary based on what charnames pragma is in scope at the time,
+ * so we need a way to take a snapshot of what they resolve to at the time of
+ * the original parse. [perl #56444].
+ *
+ * That parsing is skipped for single-quoted regexes, so we may here get
+ * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
+ * parser. But if the single-quoted regex is something like '\N{U+41}', that
+ * is legal and handled here. The code point is Unicode, and has to be
+ * translated into the native character set for non-ASCII platforms.
+ */
+
+ char * endbrace; /* points to '}' following the name */
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- bool has_multiple_chars; /* true if the input stream contains a sequence of
- more than one character */
- bool in_char_class = substitute_parse != NULL;
- STRLEN count = 0; /* Number of characters in this sequence */
+ char* p = RExC_parse; /* Temporary */
GET_RE_DEBUG_FLAGS_DECL;
GET_RE_DEBUG_FLAGS;
- assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
- assert(! (node_p && substitute_parse)); /* At most 1 should be set */
+ assert(cBOOL(node_p) ^ cBOOL(code_point_p)); /* Exactly one should be set */
+ assert(! (node_p && cp_count)); /* At most 1 should be set */
+
+ if (cp_count) { /* Initialize return for the most common case */
+ *cp_count = 1;
+ }
/* The [^\n] meaning of \N ignores spaces and comments under the /x
- * modifier. The other meaning does not, so use a temporary until we find
+ * modifier. The other meanings do not, so use a temporary until we find
* out which we are being called with */
- p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regpatws(pRExC_state, RExC_parse,
- TRUE) /* means recognize comments */
- : RExC_parse;
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* Disambiguate between \N meaning a named character versus \N meaning
- * [^\n]. The former is assumed when it can't be the latter. */
+ * [^\n]. The latter is assumed when the {...} following the \N is a legal
+ * quantifier, or there is no '{' at all */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
+ if (cp_count) {
+ *cp_count = -1;
+ }
+
if (! node_p) {
- /* no bare \N allowed in a charclass */
- if (in_char_class) {
- vFAIL("\\N in a character class must be a named character: \\N{...}");
- }
- return (STRLEN) -1;
+ return FALSE;
}
- RExC_parse--; /* Need to back off so nextchar() doesn't skip the
- current char */
- nextchar(pRExC_state);
+
*node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
MARK_NAUGHTY(1);
Set_Node_Length(*node_p, 1); /* MJD */
- return 1;
+ return TRUE;
}
/* Here, we have decided it should be a named character or sequence */
vFAIL("\\N{NAME} must be resolved by the lexer");
}
- RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
+ semantics */
if (endbrace == RExC_parse) { /* empty: \N{} */
- if (node_p) {
- *node_p = reg_node(pRExC_state,NOTHING);
- }
- else if (! in_char_class) {
- return (STRLEN) -1;
+ if (cp_count) {
+ *cp_count = 0;
}
nextchar(pRExC_state);
- return 0;
+ if (! node_p) {
+ return FALSE;
+ }
+
+ *node_p = reg_node(pRExC_state,NOTHING);
+ return TRUE;
}
RExC_parse += 2; /* Skip past the 'U+' */
/* Code points are separated by dots. If none, there is only one code
* point, and is terminated by the brace */
- has_multiple_chars = (endchar < endbrace);
- /* We get the first code point if we want it, and either there is only one,
- * or we can accept both cases of one and there is more than one */
- if (valuep && (substitute_parse || ! has_multiple_chars)) {
- STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
- I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
+ if (endchar >= endbrace) {
+ STRLEN length_of_hex;
+ I32 grok_hex_flags;
+
+ /* Here, exactly one code point. If that isn't what is wanted, fail */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* Convert code point from hex */
+ length_of_hex = (STRLEN)(endchar - RExC_parse);
+ grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
/* No errors in the first pass (See [perl
* #122671].) We let the code below find the
* errors when there are multiple chars. */
- | ((SIZE_ONLY || has_multiple_chars)
+ | ((SIZE_ONLY)
? PERL_SCAN_SILENT_ILLDIGIT
: 0);
- *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
+ /* This routine is the one place where both single- and double-quotish
+ * \N{U+xxxx} are evaluated. The value is a Unicode code point which
+ * must be converted to native. */
+ *code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
+ &length_of_hex,
+ &grok_hex_flags,
+ NULL));
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check. Don't do the check
* here when there are multiple chars; we do it below anyway. */
- if (! has_multiple_chars) {
- if (length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) {
- RExC_parse = endchar;
- }
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
+ if (length_of_hex == 0
+ || length_of_hex != (STRLEN)(endchar - RExC_parse) )
+ {
+ RExC_parse += length_of_hex; /* Includes all the valid */
+ RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
+ ? UTF8SKIP(RExC_parse)
+ : 1;
+ /* Guard against malformed utf8 */
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
}
-
- RExC_parse = endbrace + 1;
- return 1;
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
- }
- /* Here, we should have already handled the case where a single character
- * is expected and found. So it is a failure if we aren't expecting
- * multiple chars and got them; or didn't get them but wanted them. We
- * fail without advancing the parse, so that the caller can try again with
- * different acceptance criteria */
- if ((! node_p && ! substitute_parse) || ! has_multiple_chars) {
- RExC_parse = p;
- return (STRLEN) -1;
+ RExC_parse = endbrace + 1;
+ return TRUE;
}
-
- {
- /* What is done here is to convert this to a sub-pattern of the form
- * \x{char1}\x{char2}...
- * and then either return it in <*substitute_parse> if non-null; or
- * call reg recursively to parse it (enclosing in "(?: ... )" ). That
- * way, it retains its atomicness, while not having to worry about
- * special handling that some code points may have. toke.c has
- * converted the original Unicode values to native, so that we can just
- * pass on the hex values unchanged. We do have to set a flag to keep
- * recoding from happening in the recursion */
-
- SV * dummy = NULL;
+ else { /* Is a multiple character sequence */
+ SV * substitute_parse;
STRLEN len;
char *orig_end = RExC_end;
I32 flags;
- if (substitute_parse) {
- *substitute_parse = newSVpvs("");
+ /* Count the code points, if desired, in the sequence */
+ if (cp_count) {
+ *cp_count = 0;
+ while (RExC_parse < endbrace) {
+ /* Point to the beginning of the next character in the sequence. */
+ RExC_parse = endchar + 1;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+ (*cp_count)++;
+ }
}
- else {
- substitute_parse = &dummy;
- *substitute_parse = newSVpvs("?:");
+
+ /* Fail if caller doesn't want to handle a multi-code-point sequence.
+ * But don't backup up the pointer if the caller want to know how many
+ * code points there are (they can then handle things) */
+ if (! node_p) {
+ if (! cp_count) {
+ RExC_parse = p;
+ }
+ return FALSE;
}
- *substitute_parse = sv_2mortal(*substitute_parse);
+
+ /* What is done here is to convert this to a sub-pattern of the form
+ * \x{char1}\x{char2}... and then call reg recursively to parse it
+ * (enclosing in "(?: ... )" ). That way, it retains its atomicness,
+ * while not having to worry about special handling that some code
+ * points may have. */
+
+ substitute_parse = newSVpvs("?:");
while (RExC_parse < endbrace) {
/* Convert to notation the rest of the code understands */
- sv_catpv(*substitute_parse, "\\x{");
- sv_catpvn(*substitute_parse, RExC_parse, endchar - RExC_parse);
- sv_catpv(*substitute_parse, "}");
+ sv_catpv(substitute_parse, "\\x{");
+ sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+ sv_catpv(substitute_parse, "}");
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
endchar = RExC_parse + strcspn(RExC_parse, ".}");
- count++;
}
- if (! in_char_class) {
- sv_catpv(*substitute_parse, ")");
- }
+ sv_catpv(substitute_parse, ")");
- RExC_parse = SvPV(*substitute_parse, len);
+ RExC_parse = SvPV(substitute_parse, len);
/* Don't allow empty number */
- if (len < (STRLEN) ((substitute_parse) ? 6 : 8)) {
+ if (len < (STRLEN) 8) {
RExC_parse = endbrace;
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
RExC_end = RExC_parse + len;
- /* The values are Unicode, and therefore not subject to recoding */
+ /* The values are Unicode, and therefore not subject to recoding, but
+ * have to be converted to native on a non-Unicode (meaning non-ASCII)
+ * platform. */
RExC_override_recoding = 1;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 1;
+#endif
if (node_p) {
if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
- return (STRLEN) -1;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return FALSE;
}
FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"",
(UV) flags);
*flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
}
+ /* Restore the saved values */
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
+#ifdef EBCDIC
+ RExC_recode_x_to_native = 0;
+#endif
+ SvREFCNT_dec_NN(substitute_parse);
nextchar(pRExC_state);
- }
- return count;
+ return TRUE;
+ }
}
* it returns U+FFFD (Replacement character) and sets *encp to NULL.
*/
STATIC UV
-S_reg_recode(pTHX_ const char value, SV **encp)
+S_reg_recode(pTHX_ const U8 value, SV **encp)
{
STRLEN numlen = 1;
- SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
+ SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP);
const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
const STRLEN newlen = SvCUR(sv);
UV uv = UNICODE_REPLACEMENT;
*character = (U8) code_point;
len = 1;
} /* Else is folded non-UTF8 */
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
-
+#else
+ else if (1) {
+#endif
/* We don't fold any non-UTF8 except possibly the Sharp s (see
* comments at join_exact()); */
*character = (U8) code_point;
/* A single character node is SIMPLE, except for the special-cased SHARP S
* under /di. */
- if ((len == 1 || (UTF && len == UNISKIP(code_point)))
- && (code_point != LATIN_SMALL_LETTER_SHARP_S
- || ! FOLD || ! DEPENDS_SEMANTICS))
- {
+ if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point)))
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
+ && ( code_point != LATIN_SMALL_LETTER_SHARP_S
+ || ! FOLD || ! DEPENDS_SEMANTICS)
+#endif
+ ) {
*flagp |= SIMPLE;
}
Returns NULL, setting *flagp to TRYAGAIN if reg() returns NULL with
TRYAGAIN.
- Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs to be
- restarted.
+ Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs to be
+ restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
Otherwise does not return NULL.
*/
{
regnode *ret = NULL;
I32 flags = 0;
- char *parse_start = RExC_parse;
+ char *parse_start;
U8 op;
int invert = 0;
U8 arg;
PERL_ARGS_ASSERT_REGATOM;
tryagain:
+ parse_start = RExC_parse;
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
TRUE, /* allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
(bool) RExC_strict,
+ TRUE, /* Allow an optimized regnode result */
NULL);
- if (*RExC_parse != ']') {
- RExC_parse = oregcomp_parse;
- vFAIL("Unmatched [");
- }
if (ret == NULL) {
- if (*flagp & RESTART_UTF8)
+ if (*flagp & (RESTART_PASS1|NEED_UTF8))
return NULL;
FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
(UV) *flagp);
}
+ if (*RExC_parse != ']') {
+ RExC_parse = oregcomp_parse;
+ vFAIL("Unmatched [");
+ }
nextchar(pRExC_state);
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
goto tryagain;
}
- if (flags & RESTART_UTF8) {
- *flagp = RESTART_UTF8;
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
return NULL;
}
FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"",
RExC_seen_zerolen++; /* Do not optimize RE away */
goto finish_meta_pat;
case 'C':
- ret = reg_node(pRExC_state, CANY);
- RExC_seen |= REG_CANY_SEEN;
- *flagp |= HASWIDTH|SIMPLE;
- if (PASS2) {
- ckWARNdep(RExC_parse+1, "\\C is deprecated");
- }
- goto finish_meta_pat;
+ vFAIL("\\C no longer supported");
case 'X':
ret = reg_node(pRExC_state, CLUMP);
*flagp |= HASWIDTH;
NOT_REACHED; /*NOTREACHED*/
}
RExC_parse = endbrace;
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
OP(ret) = BOUNDU;
break;
case 'p':
case 'P':
- {
-#ifdef DEBUGGING
- char* parse_start = RExC_parse - 2;
-#endif
+ RExC_parse--;
+
+ ret = regclass(pRExC_state, flagp,depth+1,
+ TRUE, /* means just parse this element */
+ FALSE, /* don't allow multi-char folds */
+ FALSE, /* don't silence non-portable warnings. It
+ would be a bug if these returned
+ non-portables */
+ (bool) RExC_strict,
+ TRUE, /* Allow an optimized regnode result */
+ NULL);
+ if (*flagp & RESTART_PASS1)
+ return NULL;
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+ * multi-char folds are allowed. */
+ if (!ret)
+ FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
+ (UV) *flagp);
- RExC_parse--;
-
- ret = regclass(pRExC_state, flagp,depth+1,
- TRUE, /* means just parse this element */
- FALSE, /* don't allow multi-char folds */
- FALSE, /* don't silence non-portable warnings.
- It would be a bug if these returned
- non-portables */
- (bool) RExC_strict,
- NULL);
- /* regclass() can only return RESTART_UTF8 if multi-char folds
- are allowed. */
- if (!ret)
- FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"",
- (UV) *flagp);
-
- RExC_parse--;
-
- Set_Node_Offset(ret, parse_start + 2);
- Set_Node_Cur_Length(ret, parse_start);
- nextchar(pRExC_state);
- }
+ RExC_parse--;
+
+ Set_Node_Offset(ret, parse_start);
+ Set_Node_Cur_Length(ret, parse_start - 2);
+ nextchar(pRExC_state);
break;
case 'N':
- /* Handle \N and \N{NAME} with multiple code points here and not
- * below because it can be multicharacter. join_exact() will join
- * them up later on. Also this makes sure that things like
- * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
- * The options to the grok function call causes it to fail if the
- * sequence is just a single code point. We then go treat it as
- * just another character in the current EXACT node, and hence it
- * gets uniform treatment with all the other characters. The
- * special treatment for quantifiers is not needed for such single
- * character sequences */
+ /* Handle \N, \N{} and \N{NAMED SEQUENCE} (the latter meaning the
+ * \N{...} evaluates to a sequence of more than one code points).
+ * The function call below returns a regnode, which is our result.
+ * The parameters cause it to fail if the \N{} evaluates to a
+ * single code point; we handle those like any other literal. The
+ * reason that the multicharacter case is handled here and not as
+ * part of the EXACtish code is because of quantifiers. In
+ * /\N{BLAH}+/, the '+' applies to the whole thing, and doing it
+ * this way makes that Just Happen. dmq.
+ * join_exact() will join this up with adjacent EXACTish nodes
+ * later on, if appropriate. */
++RExC_parse;
- if ((STRLEN) -1 == grok_bslash_N(pRExC_state, &ret, NULL, flagp,
- depth, FALSE))
- {
- if (*flagp & RESTART_UTF8)
- return NULL;
- RExC_parse--;
- goto defchar;
+ if (grok_bslash_N(pRExC_state,
+ &ret, /* Want a regnode returned */
+ NULL, /* Fail if evaluates to a single code
+ point */
+ NULL, /* Don't need a count of how many code
+ points */
+ flagp,
+ depth)
+ ) {
+ break;
}
- break;
+
+ if (*flagp & RESTART_PASS1)
+ return NULL;
+
+ /* Here, evaluates to a single code point. Go get that */
+ RExC_parse = parse_start;
+ goto defchar;
+
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
{
* octal character escape, e.g. \35 or \777.
* The above logic should make it obvious why using
* octal escapes in patterns is problematic. - Yves */
+ RExC_parse = parse_start;
goto defchar;
}
}
* as an octal escape. It may or may not be a valid backref
* escape. For instance \88888888 is unlikely to be a valid
* backref. */
- {
-#ifdef RE_TRACK_PATTERN_OFFSETS
- char * const parse_start = RExC_parse - 1; /* MJD */
-#endif
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
- if (hasbrace) {
- if (*RExC_parse != '}')
- vFAIL("Unterminated \\g{...} pattern");
- RExC_parse++;
- }
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
- }
- RExC_sawback = 1;
- ret = reganode(pRExC_state,
- ((! FOLD)
- ? REF
- : (ASCII_FOLD_RESTRICTED)
- ? REFFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? REFFU
- : (LOC)
- ? REFFL
- : REFF),
- num);
- *flagp |= HASWIDTH;
+ while (isDIGIT(*RExC_parse))
+ RExC_parse++;
+ if (hasbrace) {
+ if (*RExC_parse != '}')
+ vFAIL("Unterminated \\g{...} pattern");
+ RExC_parse++;
+ }
+ if (!SIZE_ONLY) {
+ if (num > (I32)RExC_rx->nparens)
+ vFAIL("Reference to nonexistent group");
+ }
+ RExC_sawback = 1;
+ ret = reganode(pRExC_state,
+ ((! FOLD)
+ ? REF
+ : (ASCII_FOLD_RESTRICTED)
+ ? REFFA
+ : (AT_LEAST_UNI_SEMANTICS)
+ ? REFFU
+ : (LOC)
+ ? REFFL
+ : REFF),
+ num);
+ *flagp |= HASWIDTH;
- /* override incorrect value set in reganode MJD */
- Set_Node_Offset(ret, parse_start+1);
- Set_Node_Cur_Length(ret, parse_start);
- RExC_parse--;
- nextchar(pRExC_state);
- }
+ /* override incorrect value set in reganode MJD */
+ Set_Node_Offset(ret, parse_start);
+ Set_Node_Cur_Length(ret, parse_start-1);
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
}
break;
case '\0':
default:
/* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
- parse_start--;
+ RExC_parse = parse_start;
goto defchar;
- }
+ } /* end of switch on a \foo sequence */
break;
case '#':
- if (RExC_flags & RXf_PMf_EXTENDED) {
+
+ /* '#' comments should have been spaced over before this function was
+ * called */
+ assert((RExC_flags & RXf_PMf_EXTENDED) == 0);
+ /*
+ if (RExC_flags & RXf_PMf_EXTENDED) {
RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
if (RExC_parse < RExC_end)
goto tryagain;
}
+ */
+
/* FALLTHROUGH */
default:
+ defchar: {
- parse_start = RExC_parse - 1;
-
- RExC_parse++;
+ /* Here, we have determined that the next thing is probably a
+ * literal character. RExC_parse points to the first byte of its
+ * definition. (It still may be an escape sequence that evaluates
+ * to a single character) */
- defchar: {
STRLEN len = 0;
UV ender = 0;
char *p;
* string's UTF8ness. The reason to do this is that EXACTF is not
* trie-able, EXACTFU is.
*
- * Similarly, we can convert EXACTFL nodes to EXACTFU if they
+ * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
* contain only above-Latin1 characters (hence must be in UTF8),
* which don't participate in folds with Latin1-range characters,
* as the latter's folds aren't known until runtime. (We don't
reparse:
- /* We do the EXACTFish to EXACT node only if folding. (And we
- * don't need to figure this out until pass 2) */
+ /* We look for the EXACTFish to EXACT node optimizaton only if
+ * folding. (And we don't need to figure this out until pass 2) */
maybe_exact = FOLD && PASS2;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (p = RExC_parse - 1;
+
+ assert( ! UTF /* Is at the beginning of a character */
+ || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
+ || UTF8_IS_START(UCHARAT(RExC_parse)));
+
+ for (p = RExC_parse;
len < upper_parse && p < RExC_end;
len++)
{
oldp = p;
- if (RExC_flags & RXf_PMf_EXTENDED)
- p = regpatws(pRExC_state, p,
- TRUE); /* means recognize comments */
+ /* White space has already been ignored */
+ assert( (RExC_flags & RXf_PMf_EXTENDED) == 0
+ || ! is_PATWS_safe((p), RExC_end, UTF));
+
switch ((U8)*p) {
case '^':
case '$':
p++;
break;
case 'N': /* Handle a single-code point named character. */
- /* The options cause it to fail if a multiple code
- * point sequence. Handle those in the switch() above
- * */
RExC_parse = p + 1;
- if ((STRLEN) -1 == grok_bslash_N(pRExC_state, NULL,
- &ender,
- flagp,
- depth,
- FALSE
- )) {
- if (*flagp & RESTART_UTF8)
- FAIL("panic: grok_bslash_N set RESTART_UTF8");
+ if (! grok_bslash_N(pRExC_state,
+ NULL, /* Fail if evaluates to
+ anything other than a
+ single code point */
+ &ender, /* The returned single code
+ point */
+ NULL, /* Don't need a count of
+ how many code points */
+ flagp,
+ depth)
+ ) {
+ if (*flagp & NEED_UTF8)
+ FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
+
+ /* Here, it wasn't a single code point. Go close
+ * up this EXACTish node. The switch() prior to
+ * this switch handles the other cases */
RExC_parse = p = oldp;
goto loopdone;
}
p = RExC_parse;
if (ender > 0xff) {
- REQUIRE_UTF8;
+ REQUIRE_UTF8(flagp);
}
break;
case 'r':
goto recode_encoding;
}
if (ender > 0xff) {
- REQUIRE_UTF8;
+ REQUIRE_UTF8(flagp);
}
break;
}
}
ender = result;
- if (IN_ENCODING && ender < 0x100) {
- goto recode_encoding;
+ if (ender < 0x100) {
+#ifdef EBCDIC
+ if (RExC_recode_x_to_native) {
+ ender = LATIN1_TO_NATIVE(ender);
+ }
+ else
+#endif
+ if (IN_ENCODING) {
+ goto recode_encoding;
+ }
}
- if (ender > 0xff) {
- REQUIRE_UTF8;
+ else {
+ REQUIRE_UTF8(flagp);
}
break;
}
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
if (ender > 0xff) {
- REQUIRE_UTF8;
+ REQUIRE_UTF8(flagp);
}
p += numlen;
if (PASS2 /* like \08, \178 */
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- ender = reg_recode((const char)(U8)ender, &enc);
+ ender = reg_recode((U8)ender, &enc);
if (!enc && PASS2)
ckWARNreg(p, "Invalid escape in the specified encoding");
- REQUIRE_UTF8;
+ REQUIRE_UTF8(flagp);
}
break;
case '\0':
/* FALLTHROUGH */
default:
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
- /* Include any { following the alpha to emphasize
+ /* Include any left brace following the alpha to emphasize
* that it could be part of an escape at some point
* in the future */
int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
case '{':
/* Currently we don't warn when the lbrace is at the start
* of a construct. This catches it in the middle of a
- * literal string, or when its the first thing after
+ * literal string, or when it's the first thing after
* something like "\b" */
if (! SIZE_ONLY
&& (len || (p > RExC_start && isALPHA_A(*(p -1)))))
/*FALLTHROUGH*/
default: /* A literal character */
normal_default:
- if (UTF8_IS_START(*p) && UTF) {
+ if (! UTF8_IS_INVARIANT(*p) && UTF) {
STRLEN numlen;
ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
&numlen, UTF8_ALLOW_DEFAULT);
} /* End of switch on the literal */
/* Here, have looked at the literal character and <ender>
- * contains its ordinal, <p> points to the character after it
- */
-
- if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regpatws(pRExC_state, p,
- TRUE); /* means recognize comments */
+ * contains its ordinal, <p> points to the character after it.
+ * We need to check if the next non-ignored thing is a
+ * quantifier. Move <p> to after anything that should be
+ * ignored, which, as a side effect, positions <p> for the next
+ * loop iteration */
+ skip_to_be_ignored_text(pRExC_state, &p,
+ FALSE /* Don't force to /x */ );
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
* the node, close the node with just them, and set up to do
* this character again next time through, when it will be the
* only thing in its new node */
- if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+ if ((next_is_quantifier = ( LIKELY(p < RExC_end)
+ && UNLIKELY(ISMULT2(p))))
+ && LIKELY(len))
{
p = oldp;
goto loopdone;
}
+ /* Ready to add 'ender' to the node */
+
if (! FOLD) { /* The simple case, just append the literal */
/* In the sizing pass, we need only the size of the
* its representation until PASS2. */
if (SIZE_ONLY) {
if (UTF) {
- const STRLEN unilen = UNISKIP(ender);
+ const STRLEN unilen = UVCHR_SKIP(ender);
s += unilen;
/* We have to subtract 1 just below (and again in
}
else /* A regular FOLD code point */
if (! ( UTF
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S)))
- {
+ && ender == LATIN_SMALL_LETTER_SHARP_S)
+#endif
+ )) {
/* Here, are folding and are not UTF-8 encoded; therefore
* the character must be in the range 0-255, and is not /l
* (Not /l because we already handled these under /l in
/* See if the character's fold differs between /d and
* /u. This includes the multi-char fold SHARP S to
* 'ss' */
- if (maybe_exactfu
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ RExC_seen_unfolded_sharp_s = 1;
+ maybe_exactfu = FALSE;
+ }
+ else if (maybe_exactfu
&& (PL_fold[ender] != PL_fold_latin1[ender]
- || ender == LATIN_SMALL_LETTER_SHARP_S
- || (len > 0
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))))
- {
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
+ || ( len > 0
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))
+#endif
+ )) {
maybe_exactfu = FALSE;
}
}
* we have an array that finds its fold quickly */
*(s++) = (char) ender;
}
- else { /* FOLD and UTF */
+ else { /* FOLD, and UTF (or sharp s) */
/* Unlike the non-fold case, we do actually have to
* calculate the results here in pass 1. This is for two
* reasons, the folded length may be longer than the
}
}
else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- if (! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+ if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
*s, *(s+1))))
{
break;
RExC_parse = p - 1;
Set_Node_Cur_Length(ret, parse_start);
- nextchar(pRExC_state);
+ RExC_parse = p;
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't force to /x */ );
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
return(ret);
}
-STATIC char *
-S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
-{
- /* Returns the next non-pattern-white space, non-comment character (the
- * latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. See also reg_skipcomment */
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGPATWS;
-
- while (p < e) {
- STRLEN len;
- if ((len = is_PATWS_safe(p, e, UTF))) {
- p += len;
- }
- else if (recognize_comment && *p == '#') {
- p = reg_skipcomment(pRExC_state, p);
- }
- else
- break;
- }
- return p;
-}
STATIC void
S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr)
&& first_char == *(p - 1));
}
+STATIC unsigned int
+S_regex_set_precedence(const U8 my_operator) {
+
+ /* Returns the precedence in the (?[...]) construct of the input operator,
+ * specified by its character representation. The precedence follows
+ * general Perl rules, but it extends this so that ')' and ']' have (low)
+ * precedence even though they aren't really operators */
+
+ switch (my_operator) {
+ case '!':
+ return 5;
+ case '&':
+ return 4;
+ case '^':
+ case '|':
+ case '+':
+ case '-':
+ return 3;
+ case ')':
+ return 2;
+ case ']':
+ return 1;
+ }
+
+ NOT_REACHED; /* NOTREACHED */
+ return 0; /* Silence compiler warning */
+}
+
STATIC regnode *
S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist,
I32 *flagp, U32 depth,
{
/* Handle the (?[...]) construct to do set operations */
- U8 curchar;
- UV start, end; /* End points of code point ranges */
- SV* result_string;
- char *save_end, *save_parse;
- SV* final;
- STRLEN len;
- regnode* node;
- AV* stack;
- const bool save_fold = FOLD;
+ U8 curchar; /* Current character being parsed */
+ UV start, end; /* End points of code point ranges */
+ SV* final = NULL; /* The end result inversion list */
+ SV* result_string; /* 'final' stringified */
+ AV* stack; /* stack of operators and operands not yet
+ resolved */
+ AV* fence_stack = NULL; /* A stack containing the positions in
+ 'stack' of where the undealt-with left
+ parens would be if they were actually
+ put there */
+ IV fence = 0; /* Position of where most recent undealt-
+ with left paren in stack is; -1 if none.
+ */
+ STRLEN len; /* Temporary */
+ regnode* node; /* Temporary, and final regnode returned by
+ this function */
+ const bool save_fold = FOLD; /* Temporary */
+ char *save_end, *save_parse; /* Temporaries */
+ const bool in_locale = LOC; /* we turn off /l during processing */
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
- if (LOC) {
- vFAIL("(?[...]) not valid in locale");
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
- RExC_uni_semantics = 1;
+
+ REQUIRE_UNI_RULES(flagp, NULL); /* The use of this operator implies /u.
+ This is required so that the compile
+ time values are valid in all runtime
+ cases */
/* This will return only an ANYOF regnode, or (unlikely) something smaller
* (such as EXACT). Thus we can skip most everything if just sizing. We
* upon an unescaped ']' that isn't one ending a regclass. To do both
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
- if (PASS2) {
- Perl_ck_warner_d(aTHX_
- packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
- "The regex_sets feature is experimental" REPORT_LOCATION,
- UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
- UTF8fARG(UTF,
- RExC_end - RExC_start - (RExC_parse - RExC_precomp),
- RExC_precomp + (RExC_parse - RExC_precomp)));
- }
- else {
+ if (SIZE_ONLY) {
UV depth = 0; /* how many nested (?[...]) constructs */
while (RExC_parse < RExC_end) {
SV* current = NULL;
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
+
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
* default: case next time and keep on incrementing until
* we find one of the invariants we do handle. */
RExC_parse++;
+ if (*RExC_parse == 'c') {
+ /* Skip the \cX notation for control characters */
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ }
break;
case '[':
{
RExC_parse++;
}
- /* regclass() can only return RESTART_UTF8 if multi-char
- folds are allowed. */
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8
+ * if multi-char folds are allowed. */
if (!regclass(pRExC_state, flagp,depth+1,
is_posix_class, /* parse the whole char
class only if not a
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
TRUE, /* strict */
+ FALSE, /* Require return to be an ANYOF */
¤t
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
/* function call leaves parse pointing to the ']', except
* if we faked it */
nextchar(pRExC_state);
Set_Node_Length(node,
RExC_parse - oregcomp_parse + 1); /* MJD */
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+ }
+
return node;
}
goto no_close;
}
- RExC_parse++;
+
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
no_close:
FAIL("Syntax error in (?[...])");
}
- /* Pass 2 only after this. Everything in this construct is a
- * metacharacter. Operands begin with either a '\' (for an escape
- * sequence), or a '[' for a bracketed character class. Any other
- * character should be an operator, or parenthesis for grouping. Both
- * types of operands are handled by calling regclass() to parse them. It
- * is called with a parameter to indicate to return the computed inversion
- * list. The parsing here is implemented via a stack. Each entry on the
- * stack is a single character representing one of the operators, or the
- * '('; or else a pointer to an operand inversion list. */
-
-#define IS_OPERAND(a) (! SvIOK(a))
-
- /* The stack starts empty. It is a syntax error if the first thing parsed
- * is a binary operator; everything else is pushed on the stack. When an
- * operand is parsed, the top of the stack is examined. If it is a binary
- * operator, the item before it should be an operand, and both are replaced
- * by the result of doing that operation on the new operand and the one on
- * the stack. Thus a sequence of binary operands is reduced to a single
- * one before the next one is parsed.
+ /* Pass 2 only after this. */
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__REGEX_SETS),
+ "The regex_sets feature is experimental" REPORT_LOCATION,
+ UTF8fARG(UTF, (RExC_parse - RExC_precomp), RExC_precomp),
+ UTF8fARG(UTF,
+ RExC_end - RExC_start - (RExC_parse - RExC_precomp),
+ RExC_precomp + (RExC_parse - RExC_precomp)));
+
+ /* Everything in this construct is a metacharacter. Operands begin with
+ * either a '\' (for an escape sequence), or a '[' for a bracketed
+ * character class. Any other character should be an operator, or
+ * parenthesis for grouping. Both types of operands are handled by calling
+ * regclass() to parse them. It is called with a parameter to indicate to
+ * return the computed inversion list. The parsing here is implemented via
+ * a stack. Each entry on the stack is a single character representing one
+ * of the operators; or else a pointer to an operand inversion list. */
+
+#define IS_OPERATOR(a) SvIOK(a)
+#define IS_OPERAND(a) (! IS_OPERATOR(a))
+
+ /* The stack is kept in Łukasiewicz order. (That's pronounced similar
+ * to luke-a-shave-itch (or -itz), but people who didn't want to bother
+ * with pronouncing it called it Reverse Polish instead, but now that YOU
+ * know how to pronounce it you can use the correct term, thus giving due
+ * credit to the person who invented it, and impressing your geek friends.
+ * Wikipedia says that the pronounciation of "Ł" has been changing so that
+ * it is now more like an English initial W (as in wonk) than an L.)
+ *
+ * This means that, for example, 'a | b & c' is stored on the stack as
+ *
+ * c [4]
+ * b [3]
+ * & [2]
+ * a [1]
+ * | [0]
*
- * A unary operator may immediately follow a binary in the input, for
- * example
+ * where the numbers in brackets give the stack [array] element number.
+ * In this implementation, parentheses are not stored on the stack.
+ * Instead a '(' creates a "fence" so that the part of the stack below the
+ * fence is invisible except to the corresponding ')' (this allows us to
+ * replace testing for parens, by using instead subtraction of the fence
+ * position). As new operands are processed they are pushed onto the stack
+ * (except as noted in the next paragraph). New operators of higher
+ * precedence than the current final one are inserted on the stack before
+ * the lhs operand (so that when the rhs is pushed next, everything will be
+ * in the correct positions shown above. When an operator of equal or
+ * lower precedence is encountered in parsing, all the stacked operations
+ * of equal or higher precedence are evaluated, leaving the result as the
+ * top entry on the stack. This makes higher precedence operations
+ * evaluate before lower precedence ones, and causes operations of equal
+ * precedence to left associate.
+ *
+ * The only unary operator '!' is immediately pushed onto the stack when
+ * encountered. When an operand is encountered, if the top of the stack is
+ * a '!", the complement is immediately performed, and the '!' popped. The
+ * resulting value is treated as a new operand, and the logic in the
+ * previous paragraph is executed. Thus in the expression
* [a] + ! [b]
- * When an operand is parsed and the top of the stack is a unary operator,
- * the operation is performed, and then the stack is rechecked to see if
- * this new operand is part of a binary operation; if so, it is handled as
- * above.
+ * the stack looks like
+ *
+ * !
+ * a
+ * +
+ *
+ * as 'b' gets parsed, the latter gets evaluated to '!b', and the stack
+ * becomes
*
- * A '(' is simply pushed on the stack; it is valid only if the stack is
- * empty, or the top element of the stack is an operator or another '('
- * (for which the parenthesized expression will become an operand). By the
- * time the corresponding ')' is parsed everything in between should have
- * been parsed and evaluated to a single operand (or else is a syntax
- * error), and is handled as a regular operand */
+ * !b
+ * a
+ * +
+ *
+ * A ')' is treated as an operator with lower precedence than all the
+ * aforementioned ones, which causes all operations on the stack above the
+ * corresponding '(' to be evaluated down to a single resultant operand.
+ * Then the fence for the '(' is removed, and the operand goes through the
+ * algorithm above, without the fence.
+ *
+ * A separate stack is kept of the fence positions, so that the position of
+ * the latest so-far unbalanced '(' is at the top of it.
+ *
+ * The ']' ending the construct is treated as the lowest operator of all,
+ * so that everything gets evaluated down to a single operand, which is the
+ * result */
sv_2mortal((SV *)(stack = newAV()));
+ sv_2mortal((SV *)(fence_stack = newAV()));
while (RExC_parse < RExC_end) {
- I32 top_index = av_tindex(stack);
- SV** top_ptr;
- SV* current = NULL;
+ I32 top_index; /* Index of top-most element in 'stack' */
+ SV** top_ptr; /* Pointer to top 'stack' element */
+ SV* current = NULL; /* To contain the current inversion list
+ operand */
+ SV* only_to_avoid_leaks;
- /* Skip white space */
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE /* means recognize comments */ );
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ TRUE /* Force /x */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
- if ((curchar = UCHARAT(RExC_parse)) == ']') {
- break;
- }
+
+ curchar = UCHARAT(RExC_parse);
+
+redo_curchar:
+
+ top_index = av_tindex(stack);
switch (curchar) {
+ SV** stacked_ptr; /* Ptr to something already on 'stack' */
+ char stacked_operator; /* The topmost operator on the 'stack'. */
+ SV* lhs; /* Operand to the left of the operator */
+ SV* rhs; /* Operand to the right of the operator */
+ SV* fence_ptr; /* Pointer to top element of the fence
+ stack */
+
+ case '(':
- case '?':
- if (av_tindex(stack) >= 0 /* This makes sure that we can
- safely subtract 1 from
- RExC_parse in the next clause.
- If we have something on the
- stack, we have parsed something
- */
- && UCHARAT(RExC_parse - 1) == '('
- && RExC_parse < RExC_end)
+ if (RExC_parse < RExC_end && (UCHARAT(RExC_parse + 1) == '?'))
{
/* If is a '(?', could be an embedded '(?flags:(?[...])'.
* This happens when we have some thing like
* interpolated expression evaluates to. We use the flags
* from the interpolated pattern. */
U32 save_flags = RExC_flags;
- const char * const save_parse = ++RExC_parse;
+ const char * save_parse;
+
+ RExC_parse += 2; /* Skip past the '(?' */
+ save_parse = RExC_parse;
+ /* Parse any flags for the '(?' */
parse_lparen_question_flags(pRExC_state);
if (RExC_parse == save_parse /* Makes sure there was at
- least one flag (or this
- embedding wasn't compiled)
- */
+ least one flag (or else
+ this embedding wasn't
+ compiled) */
|| RExC_parse >= RExC_end - 4
|| UCHARAT(RExC_parse) != ':'
|| UCHARAT(++RExC_parse) != '('
}
vFAIL("Expecting '(?flags:(?[...'");
}
+
+ /* Recurse, with the meat of the embedded expression */
RExC_parse++;
(void) handle_regex_sets(pRExC_state, ¤t, flagp,
depth+1, oregcomp_parse);
/* Here, 'current' contains the embedded expression's
* inversion list, and RExC_parse points to the trailing
- * ']'; the next character should be the ')' which will be
- * paired with the '(' that has been put on the stack, so
- * the whole embedded expression reduces to '(operand)' */
+ * ']'; the next character should be the ')' */
+ RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+
+ /* Then the ')' matching the original '(' handled by this
+ * case: statement */
RExC_parse++;
+ assert(RExC_parse < RExC_end && UCHARAT(RExC_parse) == ')');
+ RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
- /* FALLTHROUGH */
- default:
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unexpected character");
+ /* A regular '('. Look behind for illegal syntax */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had
+ * better be a '!', otherwise the entry below the top
+ * operand should be an operator */
+ if ( ! (top_ptr = av_fetch(stack, top_index, FALSE))
+ || (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) != '!')
+ || ( IS_OPERAND(*top_ptr)
+ && ( top_index - fence < 1
+ || ! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || ! IS_OPERATOR(*stacked_ptr))))
+ {
+ RExC_parse++;
+ vFAIL("Unexpected '(' with no preceding operator");
+ }
+ }
+
+ /* Stack the position of this undealt-with left paren */
+ fence = top_index + 1;
+ av_push(fence_stack, newSViv(fence));
+ break;
case '\\':
- /* regclass() can only return RESTART_UTF8 if multi-char
- folds are allowed. */
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+ * multi-char folds are allowed. */
if (!regclass(pRExC_state, flagp,depth+1,
TRUE, /* means parse just the next thing */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
- ¤t
- ))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ FALSE, /* Require return to be an ANYOF */
+ ¤t))
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* regclass() will return with parsing just the \ sequence,
* leaving the parse pointer at the next thing to parse */
RExC_parse--;
RExC_parse++;
}
- /* regclass() can only return RESTART_UTF8 if multi-char
- folds are allowed. */
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if
+ * multi-char folds are allowed. */
if(!regclass(pRExC_state, flagp,depth+1,
is_posix_class, /* parse the whole char class
only if not a posix class */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
+ FALSE, /* Require return to be an ANYOF */
¤t
))
- FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
- (UV) *flagp);
+ {
+ FAIL2("panic: regclass returned NULL to handle_sets, "
+ "flags=%#"UVxf"", (UV) *flagp);
+ }
+
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
goto handle_operand;
}
+ case ']':
+ if (top_index >= 1) {
+ goto join_operators;
+ }
+
+ /* Only a single operand on the stack: are done */
+ goto done;
+
+ case ')':
+ if (av_tindex(fence_stack) < 0) {
+ RExC_parse++;
+ vFAIL("Unexpected ')'");
+ }
+
+ /* If at least two thing on the stack, treat this as an
+ * operator */
+ if (top_index - fence >= 1) {
+ goto join_operators;
+ }
+
+ /* Here only a single thing on the fenced stack, and there is a
+ * fence. Get rid of it */
+ fence_ptr = av_pop(fence_stack);
+ assert(fence_ptr);
+ fence = SvIV(fence_ptr) - 1;
+ SvREFCNT_dec_NN(fence_ptr);
+ fence_ptr = NULL;
+
+ if (fence < 0) {
+ fence = 0;
+ }
+
+ /* Having gotten rid of the fence, we pop the operand at the
+ * stack top and process it as a newly encountered operand */
+ current = av_pop(stack);
+ if (IS_OPERAND(current)) {
+ goto handle_operand;
+ }
+
+ RExC_parse++;
+ goto bad_syntax;
+
case '&':
case '|':
case '+':
case '-':
case '^':
- if (top_index < 0
+
+ /* These binary operators should have a left operand already
+ * parsed */
+ if ( top_index - fence < 0
+ || top_index - fence == 1
|| ( ! (top_ptr = av_fetch(stack, top_index, FALSE)))
|| ! IS_OPERAND(*top_ptr))
{
- RExC_parse++;
- vFAIL2("Unexpected binary operator '%c' with no preceding operand", curchar);
+ goto unexpected_binary;
}
- av_push(stack, newSVuv(curchar));
- break;
- case '!':
- av_push(stack, newSVuv(curchar));
- break;
+ /* If only the one operand is on the part of the stack visible
+ * to us, we just place this operator in the proper position */
+ if (top_index - fence < 2) {
- case '(':
- if (top_index >= 0) {
- top_ptr = av_fetch(stack, top_index, FALSE);
- assert(top_ptr);
- if (IS_OPERAND(*top_ptr)) {
- RExC_parse++;
- vFAIL("Unexpected '(' with no preceding operator");
- }
+ /* Place the operator before the operand */
+
+ SV* lhs = av_pop(stack);
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
}
- av_push(stack, newSVuv(curchar));
- break;
- case ')':
- {
- SV* lparen;
- if (top_index < 1
- || ! (current = av_pop(stack))
- || ! IS_OPERAND(current)
- || ! (lparen = av_pop(stack))
- || IS_OPERAND(lparen)
- || SvUV(lparen) != '(')
+ /* But if there is something else on the stack, we need to
+ * process it before this new operator if and only if the
+ * stacked operation has equal or higher precedence than the
+ * new one */
+
+ join_operators:
+
+ /* The operator on the stack is supposed to be below both its
+ * operands */
+ if ( ! (stacked_ptr = av_fetch(stack, top_index - 2, FALSE))
+ || IS_OPERAND(*stacked_ptr))
{
- SvREFCNT_dec(current);
+ /* But if not, it's legal and indicates we are completely
+ * done if and only if we're currently processing a ']',
+ * which should be the final thing in the expression */
+ if (curchar == ']') {
+ goto done;
+ }
+
+ unexpected_binary:
RExC_parse++;
- vFAIL("Unexpected ')'");
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
}
- top_index -= 2;
- SvREFCNT_dec_NN(lparen);
+ stacked_operator = (char) SvUV(*stacked_ptr);
- /* FALLTHROUGH */
- }
+ if (regex_set_precedence(curchar)
+ > regex_set_precedence(stacked_operator))
+ {
+ /* Here, the new operator has higher precedence than the
+ * stacked one. This means we need to add the new one to
+ * the stack to await its rhs operand (and maybe more
+ * stuff). We put it before the lhs operand, leaving
+ * untouched the stacked operator and everything below it
+ * */
+ lhs = av_pop(stack);
+ assert(IS_OPERAND(lhs));
+
+ av_push(stack, newSVuv(curchar));
+ av_push(stack, lhs);
+ break;
+ }
- handle_operand:
+ /* Here, the new operator has equal or lower precedence than
+ * what's already there. This means the operation already
+ * there should be performed now, before the new one. */
- /* Here, we have an operand to process, in 'current' */
+ rhs = av_pop(stack);
+ if (! IS_OPERAND(rhs)) {
- if (top_index < 0) { /* Just push if stack is empty */
- av_push(stack, current);
+ /* This can happen when a ! is not followed by an operand,
+ * like in /(?[\t &!])/ */
+ goto bad_syntax;
}
- else {
- SV* top = av_pop(stack);
- SV *prev = NULL;
- char current_operator;
-
- if (IS_OPERAND(top)) {
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec_NN(current);
- vFAIL("Operand with no preceding operator");
+
+ lhs = av_pop(stack);
+
+ if (! IS_OPERAND(lhs)) {
+
+ /* This can happen when there is an empty (), like in
+ * /(?[[0]+()+])/ */
+ goto bad_syntax;
+ }
+
+ switch (stacked_operator) {
+ case '&':
+ _invlist_intersection(lhs, rhs, &rhs);
+ break;
+
+ case '|':
+ case '+':
+ _invlist_union(lhs, rhs, &rhs);
+ break;
+
+ case '-':
+ _invlist_subtract(lhs, rhs, &rhs);
+ break;
+
+ case '^': /* The union minus the intersection */
+ {
+ SV* i = NULL;
+ SV* u = NULL;
+ SV* element;
+
+ _invlist_union(lhs, rhs, &u);
+ _invlist_intersection(lhs, rhs, &i);
+ /* _invlist_subtract will overwrite rhs
+ without freeing what it already contains */
+ element = rhs;
+ _invlist_subtract(u, i, &rhs);
+ SvREFCNT_dec_NN(i);
+ SvREFCNT_dec_NN(u);
+ SvREFCNT_dec_NN(element);
+ break;
}
- current_operator = (char) SvUV(top);
- switch (current_operator) {
- case '(': /* Push the '(' back on followed by the new
- operand */
- av_push(stack, top);
- av_push(stack, current);
- SvREFCNT_inc(top); /* Counters the '_dec' done
- just after the 'break', so
- it doesn't get wrongly freed
- */
- break;
+ }
+ SvREFCNT_dec(lhs);
+
+ /* Here, the higher precedence operation has been done, and the
+ * result is in 'rhs'. We overwrite the stacked operator with
+ * the result. Then we redo this code to either push the new
+ * operator onto the stack or perform any higher precedence
+ * stacked operation */
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ av_push(stack, rhs);
+ goto redo_curchar;
+
+ case '!': /* Highest priority, right associative */
+
+ /* If what's already at the top of the stack is another '!",
+ * they just cancel each other out */
+ if ( (top_ptr = av_fetch(stack, top_index, FALSE))
+ && (IS_OPERATOR(*top_ptr) && SvUV(*top_ptr) == '!'))
+ {
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ }
+ else { /* Otherwise, since it's right associative, just push
+ onto the stack */
+ av_push(stack, newSVuv(curchar));
+ }
+ break;
- case '!':
- _invlist_invert(current);
-
- /* Unlike binary operators, the top of the stack,
- * now that this unary one has been popped off, may
- * legally be an operator, and we now have operand
- * for it. */
- top_index--;
- SvREFCNT_dec_NN(top);
- goto handle_operand;
-
- case '&':
- prev = av_pop(stack);
- _invlist_intersection(prev,
- current,
- ¤t);
- av_push(stack, current);
- break;
+ default:
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unexpected character");
- case '|':
- case '+':
- prev = av_pop(stack);
- _invlist_union(prev, current, ¤t);
- av_push(stack, current);
- break;
+ handle_operand:
+
+ /* Here 'current' is the operand. If something is already on the
+ * stack, we have to check if it is a !. */
+ top_index = av_tindex(stack); /* Code above may have altered the
+ * stack in the time since we
+ * earlier set 'top_index'. */
+ if (top_index - fence >= 0) {
+ /* If the top entry on the stack is an operator, it had better
+ * be a '!', otherwise the entry below the top operand should
+ * be an operator */
+ top_ptr = av_fetch(stack, top_index, FALSE);
+ assert(top_ptr);
+ if (IS_OPERATOR(*top_ptr)) {
+
+ /* The only permissible operator at the top of the stack is
+ * '!', which is applied immediately to this operand. */
+ curchar = (char) SvUV(*top_ptr);
+ if (curchar != '!') {
+ SvREFCNT_dec(current);
+ vFAIL2("Unexpected binary operator '%c' with no "
+ "preceding operand", curchar);
+ }
- case '-':
- prev = av_pop(stack);;
- _invlist_subtract(prev, current, ¤t);
- av_push(stack, current);
- break;
+ _invlist_invert(current);
- case '^': /* The union minus the intersection */
- {
- SV* i = NULL;
- SV* u = NULL;
- SV* element;
-
- prev = av_pop(stack);
- _invlist_union(prev, current, &u);
- _invlist_intersection(prev, current, &i);
- /* _invlist_subtract will overwrite current
- without freeing what it already contains */
- element = current;
- _invlist_subtract(u, i, ¤t);
- av_push(stack, current);
- SvREFCNT_dec_NN(i);
- SvREFCNT_dec_NN(u);
- SvREFCNT_dec_NN(element);
- break;
- }
+ only_to_avoid_leaks = av_pop(stack);
+ SvREFCNT_dec(only_to_avoid_leaks);
+ top_index = av_tindex(stack);
- default:
- Perl_croak(aTHX_ "panic: Unexpected item on '(?[ ])' stack");
+ /* And we redo with the inverted operand. This allows
+ * handling multiple ! in a row */
+ goto handle_operand;
+ }
+ /* Single operand is ok only for the non-binary ')'
+ * operator */
+ else if ((top_index - fence == 0 && curchar != ')')
+ || (top_index - fence > 0
+ && (! (stacked_ptr = av_fetch(stack,
+ top_index - 1,
+ FALSE))
+ || IS_OPERAND(*stacked_ptr))))
+ {
+ SvREFCNT_dec(current);
+ vFAIL("Operand with no preceding operator");
}
- SvREFCNT_dec_NN(top);
- SvREFCNT_dec(prev);
}
- }
+
+ /* Here there was nothing on the stack or the top element was
+ * another operand. Just add this new one */
+ av_push(stack, current);
+
+ } /* End of switch on next parse token */
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ } /* End of loop parsing through the construct */
+
+ done:
+ if (av_tindex(fence_stack) >= 0) {
+ vFAIL("Unmatched (");
}
if (av_tindex(stack) < 0 /* Was empty */
|| ((final = av_pop(stack)) == NULL)
|| ! IS_OPERAND(final)
+ || SvTYPE(final) != SVt_INVLIST
|| av_tindex(stack) >= 0) /* More left on stack */
{
+ bad_syntax:
+ SvREFCNT_dec(final);
vFAIL("Incomplete expression within '(?[ ])'");
}
}
}
+ /* About to generate an ANYOF (or similar) node from the inversion list we
+ * have calculated */
save_parse = RExC_parse;
RExC_parse = SvPV(result_string, len);
save_end = RExC_end;
* already has all folding taken into consideration, and we don't want
* regclass() to add to that */
RExC_flags &= ~RXf_PMf_FOLD;
- /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed.
- */
+ /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if multi-char
+ * folds are allowed. */
node = regclass(pRExC_state, flagp,depth+1,
FALSE, /* means parse the whole char class */
FALSE, /* don't allow multi-char folds */
well have generated non-portable code points, but
they're valid on this machine */
FALSE, /* similarly, no need for strict */
+ FALSE, /* Require return to be an ANYOF */
NULL
);
if (!node)
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
PTR2UV(flagp));
+
+ /* Fix up the node type if we are in locale. (We have pretended we are
+ * under /u for the purposes of regclass(), as this construct will only
+ * work under UTF-8 locales. But now we change the opcode to be ANYOFL (so
+ * as to cause any warnings about bad locales to be output in regexec.c),
+ * and add the flag that indicates to check if not in a UTF-8 locale. The
+ * reason we above forbid optimization into something other than an ANYOF
+ * node is simply to minimize the number of code changes in regexec.c.
+ * Otherwise we would have to create new EXACTish node types and deal with
+ * them. This decision could be revisited should this construct become
+ * popular.
+ *
+ * (One might think we could look at the resulting ANYOF node and suppress
+ * the flag if everything is above 255, as those would be UTF-8 only,
+ * but this isn't true, as the components that led to that result could
+ * have been locale-affected, and just happen to cancel each other out
+ * under UTF-8 locales.) */
+ if (in_locale) {
+ set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
+
+ assert(OP(node) == ANYOF);
+
+ OP(node) = ANYOFL;
+ ANYOF_FLAGS(node) |= ANYOF_LOC_REQ_UTF8;
+ }
+
if (save_fold) {
RExC_flags |= RXf_PMf_FOLD;
}
+
RExC_parse = save_parse + 1;
RExC_end = save_end;
SvREFCNT_dec_NN(final);
Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
}
+#undef IS_OPERATOR
#undef IS_OPERAND
STATIC void
*invlist = add_cp_to_invlist(*invlist,
LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
break;
+
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+
case LATIN_SMALL_LETTER_SHARP_S:
*invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
break;
+
+#endif
+
+#if UNICODE_MAJOR_VERSION < 3 \
+ || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0)
+
+ /* In 3.0 and earlier, U+0130 folded simply to 'i'; and in 3.0.1 so did
+ * U+0131. */
+ case 'i':
+ case 'I':
+ *invlist =
+ add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+# if UNICODE_DOT_DOT_VERSION == 1
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_DOTLESS_I);
+# endif
+ break;
+#endif
+
default:
/* Use deprecated warning to increase the chances of this being
* output */
#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION \
(SvCUR(listsv) != initial_listsv_len)
+/* There is a restricted set of white space characters that are legal when
+ * ignoring white space in a bracketed character class. This generates the
+ * code to skip them.
+ *
+ * There is a line below that uses the same white space criteria but is outside
+ * this macro. Both here and there must use the same definition */
+#define SKIP_BRACKETED_WHITE_SPACE(do_skip, p) \
+ STMT_START { \
+ if (do_skip) { \
+ while ( p < RExC_end \
+ && isBLANK_A(UCHARAT(p))) \
+ { \
+ p++; \
+ } \
+ } \
+ } STMT_END
+
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
about too large
characters */
const bool strict,
+ bool optimizable, /* ? Allow a non-ANYOF return
+ node */
SV** ret_invlist /* Return an inversion list, not a node */
)
{
* are extra bits for \w, etc. in locale ANYOFs, as what these match is not
* determinable at compile time
*
- * Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
- * to be restarted. This can only happen if ret_invlist is non-NULL.
+ * Returns NULL, setting *flagp to RESTART_PASS1 if the sizing scan needs
+ * to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded
+ * to UTF-8. This can only happen if ret_invlist is non-NULL.
*/
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
DEBUG_PARSE("clas");
+#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 0)
+ allow_multi_folds = FALSE;
+#endif
+
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state,
(LOC)
? ANYOFL
- : ANYOF,
+ : (DEPENDS_SEMANTICS)
+ ? ANYOFD
+ : ANYOF,
0);
if (SIZE_ONLY) {
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
MARK_NAUGHTY(1);
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
/* Check that they didn't say [:posix:] instead of [[:posix:]] */
break;
}
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
if (UCHARAT(RExC_parse) == ']') {
break;
* skipped, it means that that white space is wanted literally, and
* is already in 'value'. Otherwise, need to translate the escape
* into what it signifies. */
- if (! skip_white || ! is_PATWS_cp(value)) switch ((I32)value) {
+ if (! skip_white || ! isBLANK_A(value)) switch ((I32)value) {
case 'w': namedclass = ANYOF_WORDCHAR; break;
case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 'H': namedclass = ANYOF_NHORIZWS; break;
case 'N': /* Handle \N{NAME} in class */
{
- SV *as_text;
- STRLEN cp_count = grok_bslash_N(pRExC_state, NULL, &value,
- flagp, depth, &as_text);
- if (*flagp & RESTART_UTF8)
- FAIL("panic: grok_bslash_N set RESTART_UTF8");
- if (cp_count != 1) { /* The typical case drops through */
- assert(cp_count != (STRLEN) -1);
- if (cp_count == 0) {
+ const char * const backslash_N_beg = RExC_parse - 2;
+ int cp_count;
+
+ if (! grok_bslash_N(pRExC_state,
+ NULL, /* No regnode */
+ &value, /* Yes single value */
+ &cp_count, /* Multiple code pt count */
+ flagp,
+ depth)
+ ) {
+
+ if (*flagp & NEED_UTF8)
+ FAIL("panic: grok_bslash_N set NEED_UTF8");
+ if (*flagp & RESTART_PASS1)
+ return NULL;
+
+ if (cp_count < 0) {
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ else if (cp_count == 0) {
if (strict) {
RExC_parse++; /* Position after the "}" */
vFAIL("Zero length \\N{}");
else if (PASS2) {
ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
}
+ break; /* <value> contains the first code
+ point. Drop out of the switch to
+ process it */
}
else {
+ SV * multi_char_N = newSVpvn(backslash_N_beg,
+ RExC_parse - backslash_N_beg);
multi_char_matches
= add_multi_match(multi_char_matches,
- as_text,
+ multi_char_N,
cp_count);
}
- break; /* <value> contains the first code
- point. Drop out of the switch to
- process it */
}
} /* End of cp_count != 1 */
vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
- e = strchr(RExC_parse++, '}');
- if (!e)
+ e = strchr(RExC_parse, '}');
+ if (!e) {
+ RExC_parse++;
vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(*RExC_parse))
- RExC_parse++;
+ }
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+
+ if (UCHARAT(RExC_parse) == '^') {
+
+ /* toggle. (The rhs xor gets the single bit that
+ * differs between P and p; the other xor inverts just
+ * that bit) */
+ value ^= 'P' ^ 'p';
+
+ RExC_parse++;
+ while (isSPACE(*RExC_parse)) {
+ RExC_parse++;
+ }
+ }
+
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
+
n = e - RExC_parse;
while (isSPACE(*(RExC_parse + n - 1)))
n--;
- }
- else {
+ } /* The \p isn't immediately followed by a '{' */
+ else if (! isALPHA(*RExC_parse)) {
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL2("Character following \\%c must be '{' or a "
+ "single-character Unicode property name",
+ (U8) value);
+ }
+ else {
e = RExC_parse;
n = 1;
}
if (!SIZE_ONLY) {
SV* invlist;
char* name;
+ char* base_name; /* name after any packages are stripped */
+ const char * const colon_colon = "::";
- if (UCHARAT(RExC_parse) == '^') {
- RExC_parse++;
- n--;
- /* toggle. (The rhs xor gets the single bit that
- * differs between P and p; the other xor inverts just
- * that bit) */
- value ^= 'P' ^ 'p';
-
- while (isSPACE(*RExC_parse)) {
- RExC_parse++;
- n--;
- }
- }
/* Try to get the definition of the property into
* <invlist>. If /i is in effect, the effective property
* will have its name be <__NAME_i>. The design is
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
- if (swash) {
+ if (swash) { /* Return any left-overs */
SvREFCNT_dec_NN(swash);
}
swash = _core_swash_init("utf8", name, &PL_sv_undef,
HV* curpkg = (IN_PERL_COMPILETIME)
? PL_curstash
: CopSTASH(PL_curcop);
- if (swash) {
+ UV final_n = n;
+ bool has_pkg;
+
+ if (swash) { /* Got a swash but no inversion list.
+ Something is likely wrong that will
+ be sorted-out later */
SvREFCNT_dec_NN(swash);
swash = NULL;
}
- /* Here didn't find it. It could be a user-defined
- * property that will be available at run-time. If we
- * accept only compile-time properties, is an error;
- * otherwise add it to the list for run-time look up */
- if (ret_invlist) {
+ /* Here didn't find it. It could be a an error (like a
+ * typo) in specifying a Unicode property, or it could
+ * be a user-defined property that will be available at
+ * run-time. The names of these must begin with 'In'
+ * or 'Is' (after any packages are stripped off). So
+ * if not one of those, or if we accept only
+ * compile-time properties, is an error; otherwise add
+ * it to the list for run-time look up. */
+ if ((base_name = rninstr(name, name + n,
+ colon_colon, colon_colon + 2)))
+ { /* Has ::. We know this must be a user-defined
+ property */
+ base_name += 2;
+ final_n -= base_name - name;
+ has_pkg = TRUE;
+ }
+ else {
+ base_name = name;
+ has_pkg = FALSE;
+ }
+
+ if ( final_n < 3
+ || base_name[0] != 'I'
+ || (base_name[1] != 's' && base_name[1] != 'n')
+ || ret_invlist)
+ {
+ const char * const msg
+ = (has_pkg)
+ ? "Illegal user-defined property name"
+ : "Can't find Unicode property definition";
RExC_parse = e + 1;
- vFAIL2utf8f(
- "Property '%"UTF8f"' is unknown",
- UTF8fARG(UTF, n, name));
+
+ /* diag_listed_as: Can't find Unicode property definition "%s" */
+ vFAIL3utf8f("%s \"%"UTF8f"\"",
+ msg, UTF8fARG(UTF, n, name));
}
/* If the property name doesn't already have a package
* name, add the current one to it so that it can be
* referred to outside it. [perl #121777] */
- if (curpkg && ! instr(name, "::")) {
+ if (! has_pkg && curpkg) {
char* pkgname = HvNAME(curpkg);
if (strNE(pkgname, "main")) {
char* full_name = Perl_form(aTHX_
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
has_user_defined_property = TRUE;
+ optimizable = FALSE; /* Will have to leave this an
+ ANYOF node */
/* We don't know yet, so have to assume that the
- * property could match something in the Latin1 range,
- * hence something that isn't utf8. Note that this
- * would cause things in <depends_list> to match
+ * property could match something in the upper Latin1
+ * range, hence something that isn't utf8. Note that
+ * this would cause things in <depends_list> to match
* inappropriately, except that any \p{}, including
* this one forces Unicode semantics, which means there
* is no <depends_list> */
named */
/* \p means they want Unicode semantics */
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
break;
case 'n': value = '\n'; break;
recode_encoding:
if (! RExC_override_recoding) {
SV* enc = _get_encoding();
- value = reg_recode((const char)(U8)value, &enc);
+ value = reg_recode((U8)value, &enc);
if (!enc) {
if (strict) {
vFAIL("Invalid escape in the specified encoding");
}
ANYOF_FLAGS(ret) |= ANYOF_MATCHES_POSIXL;
ANYOF_POSIXL_ZERO(ret);
+
+ /* We can't change this into some other type of node
+ * (unless this is the only element, in which case there
+ * are nodes that mean exactly this) as has runtime
+ * dependencies */
+ optimizable = FALSE;
}
/* Coverity thinks it is possible for this to be negative; both
}
} /* end of namedclass \blah */
- if (skip_white) {
- RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */ );
- }
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
/* If 'range' is set, 'value' is the ending of a range--check its
* validity. (If value isn't a single code point in the case of a
&& *RExC_parse == '-')
{
char* next_char_ptr = RExC_parse + 1;
- if (skip_white) { /* Get the next real char after the '-' */
- next_char_ptr = regpatws(pRExC_state,
- RExC_parse + 1,
- FALSE); /* means don't recognize
- comments */
- }
+
+ /* Get the next real char after the '-' */
+ SKIP_BRACKETED_WHITE_SPACE(skip_white, next_char_ptr);
/* If the '-' is at the end of the class (just before the ']',
* it is a literal minus; otherwise it is a range */
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
if (value > 255) {
- RExC_uni_semantics = 1;
+ REQUIRE_UNI_RULES(flagp, NULL);
}
/* Ready to process either the single value, or the completed range.
* same element, neither should be a digit. */
if (index_start == index_final) {
assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
- || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
- - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
- == 10);
+ || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 10)
+ /* But actually Unicode did have one group of 11
+ * 'digits' in 5.2, so in case we are operating
+ * on that version, let that pass */
+ || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 11
+ && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 0x19D0)
+ );
}
else if ((index_start >= 0
&& ELEMENT_RANGE_MATCHES_INVLIST(index_start))
ret = reg(pRExC_state, 1, ®_flags, depth+1);
- *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_UTF8);
+ *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED|RESTART_PASS1|NEED_UTF8);
RExC_parse = save_parse;
RExC_end = save_end;
* 2) if the character class contains only a single element (including a
* single range), we see if there is an equivalent node for it.
* Other checks are possible */
- if (! ret_invlist /* Can't optimize if returning the constructed
- inversion list */
+ if ( optimizable
+ && ! ret_invlist /* Can't optimize if returning the constructed
+ inversion list */
&& (UNLIKELY(posixl_matches_all) || element_count == 1))
{
U8 op = END;
op = POSIXA;
}
}
- else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+ else if (! FOLD || ASCII_FOLD_RESTRICTED) {
/* We can optimize A-Z or a-z, but not if they could match
- * something like the KELVIN SIGN under /i (/a means they
- * can't) */
+ * something like the KELVIN SIGN under /i. */
if (prevvalue == 'A') {
if (value == 'Z'
#ifdef EBCDIC
/* Our calculated list will be for Unicode rules. For locale
* matching, we have to keep a separate list that is consulted at
* runtime only when the locale indicates Unicode rules. For
- * non-locale, we just use to the general list */
+ * non-locale, we just use the general list */
if (LOC) {
use_list = &only_utf8_locale_list;
}
if (DEPENDS_SEMANTICS) {
/* Under /d, everything in the upper half of the Latin1 range
* matches these complements */
- ANYOF_FLAGS(ret) |= ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII;
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
}
else if (AT_LEAST_ASCII_RESTRICTED) {
/* Under /a and /aa, everything above ASCII matches these
}
if (warn_super) {
- ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+ ANYOF_FLAGS(ret)
+ |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+
+ /* Because an ANYOF node is the only one that warns, this node
+ * can't be optimized into something else */
+ optimizable = FALSE;
}
}
if (only_utf8_locale_list) {
ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
- else if (cp_list) { /* Look to see if there a 0-255 code point is in
- the list */
+ else if (cp_list) { /* Look to see if a 0-255 code point is in list */
UV start, end;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
* adjacent such nodes. And if the class is equivalent to things like /./,
* expensive run-time swashes can be avoided. Now that we have more
* complete information, we can find things necessarily missed by the
- * earlier code. I (khw) am not sure how much to look for here. It would
- * be easy, but perhaps too slow, to check any candidates against all the
- * node types they could possibly match using _invlistEQ(). */
-
- if (cp_list
- && ! invert
- && ! depends_list
- && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
-
- /* We don't optimize if we are supposed to make sure all non-Unicode
- * code points raise a warning, as only ANYOF nodes have this check.
- * */
- && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
- {
+ * earlier code. I (khw) did some benchmarks and found essentially no
+ * speed difference between using a POSIXA node versus an ANYOF node, so
+ * there is no reason to optimize, for example [A-Za-z0-9_] into
+ * [[:word:]]/a (although if we did it in the sizing pass it would save
+ * space). _invlistEQ() could be used if one ever wanted to do something
+ * like this at this point in the code */
+
+ if (optimizable && cp_list && ! invert && ! depends_list) {
UV start, end;
U8 op = END; /* The optimzation node-type */
const char * cur_parse= RExC_parse;
if (! invlist_iternext(cp_list, &start, &end)) {
/* Here, the list is empty. This happens, for example, when a
- * Unicode property is the only thing in the character class, and
- * it doesn't match anything. (perluniprops.pod notes such
- * properties) */
+ * Unicode property that doesn't match anything is the only element
+ * in the character class (perluniprops.pod notes such properties).
+ * */
op = OPFAIL;
*flagp |= HASWIDTH|SIMPLE;
}
}
}
}
- }
+ } /* End of first range contains just a single code point */
else if (start == 0) {
if (end == UV_MAX) {
op = SANY;
RExC_parse = (char *)orig_parse;
RExC_emit = (regnode *)orig_emit;
- ret = reg_node(pRExC_state, op);
+ if (regarglen[op]) {
+ ret = reganode(pRExC_state, op, 0);
+ } else {
+ ret = reg_node(pRExC_state, op);
+ }
RExC_parse = (char *)cur_parse;
return p;
}
-/* nextchar()
-
- Advances the parse position, and optionally absorbs
- "whitespace" from the inputstream.
-
- Without /x "whitespace" means (?#...) style comments only,
- with /x this means (?#...) and # comments and whitespace proper.
-
- Returns the RExC_parse point from BEFORE the scan occurs.
+STATIC void
+S_skip_to_be_ignored_text(pTHX_ RExC_state_t *pRExC_state,
+ char ** p,
+ const bool force_to_xmod
+ )
+{
+ /* If the text at the current parse position '*p' is a '(?#...)' comment,
+ * or if we are under /x or 'force_to_xmod' is TRUE, and the text at '*p'
+ * is /x whitespace, advance '*p' so that on exit it points to the first
+ * byte past all such white space and comments */
- This is the /x friendly way of saying RExC_parse++.
-*/
+ const bool use_xmod = force_to_xmod || (RExC_flags & RXf_PMf_EXTENDED);
-STATIC char*
-S_nextchar(pTHX_ RExC_state_t *pRExC_state)
-{
- char* const retval = RExC_parse++;
+ PERL_ARGS_ASSERT_SKIP_TO_BE_IGNORED_TEXT;
- PERL_ARGS_ASSERT_NEXTCHAR;
+ assert( ! UTF || UTF8_IS_INVARIANT(**p) || UTF8_IS_START(**p));
for (;;) {
- if (RExC_end - RExC_parse >= 3
- && *RExC_parse == '('
- && RExC_parse[1] == '?'
- && RExC_parse[2] == '#')
+ if (RExC_end - (*p) >= 3
+ && *(*p) == '('
+ && *(*p + 1) == '?'
+ && *(*p + 2) == '#')
{
- while (*RExC_parse != ')') {
- if (RExC_parse == RExC_end)
+ while (*(*p) != ')') {
+ if ((*p) == RExC_end)
FAIL("Sequence (?#... not terminated");
- RExC_parse++;
+ (*p)++;
}
- RExC_parse++;
+ (*p)++;
continue;
}
- if (RExC_flags & RXf_PMf_EXTENDED) {
- char * p = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
- if (p != RExC_parse) {
- RExC_parse = p;
+
+ if (use_xmod) {
+ const char * save_p = *p;
+ while ((*p) < RExC_end) {
+ STRLEN len;
+ if ((len = is_PATWS_safe((*p), RExC_end, UTF))) {
+ (*p) += len;
+ }
+ else if (*(*p) == '#') {
+ (*p) = reg_skipcomment(pRExC_state, (*p));
+ }
+ else {
+ break;
+ }
+ }
+ if (*p != save_p) {
continue;
}
}
- return retval;
+
+ break;
}
+
+ return;
+}
+
+/* nextchar()
+
+ Advances the parse position by one byte, unless that byte is the beginning
+ of a '(?#...)' style comment, or is /x whitespace and /x is in effect. In
+ those two cases, the parse position is advanced beyond all such comments and
+ white space.
+
+ This is the UTF, (?#...), and /x friendly way of saying RExC_parse++.
+*/
+
+STATIC void
+S_nextchar(pTHX_ RExC_state_t *pRExC_state)
+{
+ PERL_ARGS_ASSERT_NEXTCHAR;
+
+ assert( ! UTF
+ || UTF8_IS_INVARIANT(*RExC_parse)
+ || UTF8_IS_START(*RExC_parse));
+
+ RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+
+ skip_to_be_ignored_text(pRExC_state, &RExC_parse,
+ FALSE /* Don't assume /x */ );
}
STATIC regnode *
PerlIO_printf(Perl_debug_log, "(SBOL)");
if (r->intflags & PREGf_ANCH_GPOS)
PerlIO_printf(Perl_debug_log, "(GPOS)");
- PerlIO_putc(Perl_debug_log, ' ');
+ (void)PerlIO_putc(Perl_debug_log, ' ');
}
if (r->intflags & PREGf_GPOS_SEEN)
PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
}
} else if (k == CURLY) {
+ U32 lo = ARG1(o), hi = ARG2(o);
if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
- Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
+ Perl_sv_catpvf(aTHX_ sv, "{%u,", (unsigned) lo);
+ if (hi == REG_INFTY)
+ sv_catpvs(sv, "INFTY");
+ else
+ Perl_sv_catpvf(aTHX_ sv, "%u", (unsigned) hi);
+ sv_catpvs(sv, "}");
}
else if (k == WHILEM && o->flags) /* Ordinal/of */
Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
|| k == GROUPP || OP(o)==ACCEPT)
{
AV *name_list= NULL;
- Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
+ U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o);
+ Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
} else if ( pRExC_state ) {
}
if (name_list) {
if ( k != REF || (OP(o) < NREF)) {
- SV **name= av_fetch(name_list, ARG(o), 0 );
+ SV **name= av_fetch(name_list, parno, 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
else {
- SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
+ SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]);
I32 *nums=(I32*)SvPVX(sv_dat);
SV **name= av_fetch(name_list, nums[0], 0 );
I32 n;
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
}
- else if (k == VERB) {
- if (!o->flags)
- Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
- SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
- } else if (k == LOGICAL)
+ else if (k == LOGICAL)
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
SV* bitmap_invlist; /* Will hold what the bit map contains */
- if (OP(o) == ANYOFL)
- sv_catpvs(sv, "{loc}");
+ if (OP(o) == ANYOFL) {
+ if (flags & ANYOF_LOC_REQ_UTF8) {
+ sv_catpvs(sv, "{utf8-loc}");
+ }
+ else {
+ sv_catpvs(sv, "{loc}");
+ }
+ }
if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
sv_catpvs(sv, "^");
}
- if (flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
+ if (OP(o) == ANYOFD
+ && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ {
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
else if (OP(o) == SBOL)
Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
+
+ /* add on the verb argument if there is one */
+ if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
+ Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
+ SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ }
#else
PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(sv);
Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf));
}
+/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
+
+#ifndef PERL_IN_XSUB_RE
+void
+Perl_save_re_context(pTHX)
+{
+ I32 nparens = -1;
+ I32 i;
+
+ /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
+
+ if (PL_curpm) {
+ const REGEXP * const rx = PM_GETRE(PL_curpm);
+ if (rx)
+ nparens = RX_NPARENS(rx);
+ }
+
+ /* RT #124109. This is a complete hack; in the SWASHNEW case we know
+ * that PL_curpm will be null, but that utf8.pm and the modules it
+ * loads will only use $1..$3.
+ * The t/porting/re_context.t test file checks this assumption.
+ */
+ if (nparens == -1)
+ nparens = 3;
+
+ for (i = 1; i <= nparens; i++) {
+ char digits[TYPE_CHARS(long)];
+ const STRLEN len = my_snprintf(digits, sizeof(digits),
+ "%lu", (long)i);
+ GV *const *const gvp
+ = (GV**)hv_fetch(PL_defstash, digits, len, 0);
+
+ if (gvp) {
+ GV * const gv = *gvp;
+ if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
+ save_scalar(gv);
+ }
+ }
+}
+#endif
+
#ifdef DEBUGGING
STATIC void
this_end = (end < NUM_ANYOF_CODE_POINTS)
? end
: NUM_ANYOF_CODE_POINTS - 1;
+#if NUM_ANYOF_CODE_POINTS > 256
format = (this_end < 256)
? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
: "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+#else
+ format = "\\x{%02"UVXf"}-\\x{%02"UVXf"}";
+#endif
GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
GCC_DIAG_RESTORE;
#endif /* DEBUGGING */
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/