I32 seen_zerolen;
regnode_offset *open_parens; /* offsets to open parens */
regnode_offset *close_parens; /* offsets to close parens */
+ I32 parens_buf_size; /* #slots malloced open/close_parens */
regnode *end_op; /* END node in program */
I32 utf8; /* whether the pattern is utf8 or not */
I32 orig_utf8; /* whether the pattern was originally in utf8 */
scan_frame *frame_last;
U32 frame_count;
AV *warn_text;
+ HV *unlexed_names;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_maxlen (pRExC_state->maxlen)
#define RExC_npar (pRExC_state->npar)
#define RExC_total_parens (pRExC_state->total_par)
+#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
#define RExC_nestroot (pRExC_state->nestroot)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run (pRExC_state->in_script_run)
#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
+#define RExC_unlexed_names (pRExC_state->unlexed_names)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
if (DEPENDS_SEMANTICS) { \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
- if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
+ if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
/* No need to restart the parse if we haven't seen \
* anything that differs between /u and /d, and no need \
* to restart immediately if we're going to reparse \
} \
} STMT_END
-#define BRANCH_MAX_OFFSET U16_MAX
#define REQUIRE_BRANCHJ(flagp, restart_retval) \
STMT_START { \
RExC_use_BRANCHJ = 1; \
- if (LIKELY(RExC_total_parens >= 0)) { \
+ if (LIKELY(! IN_PARENS_PASS)) { \
/* No need to restart the parse immediately if we're \
* going to reparse anyway to count parens */ \
*flagp |= RESTART_PARSE; \
} \
} STMT_END
+/* Until we have completed the parse, we leave RExC_total_parens at 0 or
+ * less. After that, it must always be positive, because the whole re is
+ * considered to be surrounded by virtual parens. Setting it to negative
+ * indicates there is some construct that needs to know the actual number of
+ * parens to be properly handled. And that means an extra pass will be
+ * required after we've counted them all */
+#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
#define REQUIRE_PARENS_PASS \
- STMT_START { \
- if (RExC_total_parens == 0) RExC_total_parens = -1; \
+ STMT_START { /* No-op if have completed a pass */ \
+ if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
} STMT_END
+#define IN_PARENS_PASS (RExC_total_parens < 0)
+
/* This is used to return failure (zero) early from the calling function if
* various flags in 'flags' are set. Two flags always cause a return:
/* Used to point after bad bytes for an error message, but avoid skipping
* past a nul byte. */
-#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
/* Set up to clean up after our imminent demise */
#define PREPARE_TO_DIE \
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
+ const U8 flags = OP(node) == ANYOFH ? 0 : ANYOF_FLAGS(node);
PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
}
/* Get the code points valid only under UTF-8 locales */
- if ( (ANYOF_FLAGS(node) & ANYOFL_FOLD)
+ if ( (flags & ANYOFL_FOLD)
&& av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
{
only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
* actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
* have to do this here before we add the unconditionally matched code
* points */
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
&invlist);
* as well. But don't add them if inverting, as when that gets done below,
* it would exclude all these characters, including the ones it shouldn't
* that were added just above */
- if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
- && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+ if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
+ && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
{
_invlist_union(invlist, PL_UpperLatin1, &invlist);
}
/* Similarly for these */
- if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
+ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
_invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
}
- if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
+ if (flags & ANYOF_INVERT) {
_invlist_invert(invlist);
}
- else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
+ else if (flags & ANYOFL_FOLD) {
if (new_node_has_latin1) {
/* Under /li, any 0-255 could fold to any other 0-255, depending on
if (only_utf8_locale_invlist) {
_invlist_union_maybe_complement_2nd(invlist,
only_utf8_locale_invlist,
- ANYOF_FLAGS(node) & ANYOF_INVERT,
+ flags & ANYOF_INVERT,
&invlist);
}
* another SSC or a regular ANYOF class. Can create false positives. */
SV* anded_cp_list;
+ U8 and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with);
U8 anded_flags;
PERL_ARGS_ASSERT_SSC_AND;
* the code point inversion list and just the relevant flags */
if (is_ANYOF_SYNTHETIC(and_with)) {
anded_cp_list = ((regnode_ssc *)and_with)->invlist;
- anded_flags = ANYOF_FLAGS(and_with);
+ anded_flags = and_with_flags;
/* XXX This is a kludge around what appears to be deficiencies in the
* optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
else {
anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
if (OP(and_with) == ANYOFD) {
- anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
+ anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
}
else {
- anded_flags = ANYOF_FLAGS(and_with)
+ anded_flags = and_with_flags
&( ANYOF_COMMON_FLAGS
|ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
|ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
- if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
+ if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
anded_flags &=
ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
* <= (C1 & ~C2) | (P1 & ~P2)
* */
- if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
+ if ((and_with_flags & ANYOF_INVERT)
&& ! is_ANYOF_SYNTHETIC(and_with))
{
unsigned int i;
/* If either P1 or P2 is empty, the intersection will be also; can skip
* the loop */
- if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
+ if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
ANYOF_POSIXL_ZERO(ssc);
}
else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
else {
ssc->invlist = anded_cp_list;
ANYOF_POSIXL_ZERO(ssc);
- if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+ if (and_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
}
}
}
else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
- || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
+ || (and_with_flags & ANYOF_MATCHES_POSIXL))
{
/* One or the other of P1, P2 is non-empty. */
- if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
+ if (and_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
}
ssc_union(ssc, anded_cp_list, FALSE);
SV* ored_cp_list;
U8 ored_flags;
+ U8 or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with);
PERL_ARGS_ASSERT_SSC_OR;
* the code point inversion list and just the relevant flags */
if (is_ANYOF_SYNTHETIC(or_with)) {
ored_cp_list = ((regnode_ssc*) or_with)->invlist;
- ored_flags = ANYOF_FLAGS(or_with);
+ ored_flags = or_with_flags;
}
else {
ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
- ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
+ ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
if (OP(or_with) != ANYOFD) {
ored_flags
- |= ANYOF_FLAGS(or_with)
+ |= or_with_flags
& ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
|ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
- if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
+ if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
ored_flags |=
ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
* (which results in actually simpler code than the non-inverted case)
* */
- if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
+ if ((or_with_flags & ANYOF_INVERT)
&& ! is_ANYOF_SYNTHETIC(or_with))
{
/* We ignore P2, leaving P1 going forward */
} /* else Not inverted */
- else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
+ else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
unsigned int i;
trie_words = newAV();
});
- re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+ re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
assert(re_trie_maxbuff);
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
STRLEN l;
const char * const s = SvPV_const(data->last_found, l);
SSize_t old = b - data->last_start_min;
+ assert(old >= 0);
if (UTF)
- old = utf8_hop((U8*)s, old) - (U8*)s;
+ old = utf8_hop_forward((U8*)s, old,
+ (U8 *) SvEND(data->last_found))
+ - (U8*)s;
l -= old;
/* Get the added string: */
last_str = newSVpvn_utf8(s + old, l, UTF);
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
if (scan->flags) {
- if (deltanext) {
- FAIL("Variable length lookbehind not implemented");
- }
- else if (minnext > (I32)U8_MAX) {
+ if ( deltanext < 0
+ || deltanext > (I32) U8_MAX
+ || minnext > (I32)U8_MAX
+ || minnext + deltanext > (I32)U8_MAX)
+ {
FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
- scan->flags = (U8)minnext;
+
+ /* The 'next_off' field has been repurposed to count the
+ * additional starting positions to try beyond the initial
+ * one. (This leaves it at 0 for non-variable length
+ * matches to avoid breakage for those not using this
+ * extension) */
+ if (deltanext) {
+ scan->next_off = deltanext;
+ ckWARNexperimental(RExC_parse,
+ WARN_EXPERIMENTAL__VLB,
+ "Variable length lookbehind is experimental");
+ }
+ scan->flags = (U8)minnext + deltanext;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
stopparen, recursed_depth, NULL,
f, depth+1);
if (scan->flags) {
- if (deltanext) {
- FAIL("Variable length lookbehind not implemented");
- }
- else if (*minnextp > (I32)U8_MAX) {
+ assert(0); /* This code has never been tested since this
+ is normally not compiled */
+ if ( deltanext < 0
+ || deltanext > (I32) U8_MAX
+ || *minnextp > (I32)U8_MAX
+ || *minnextp + deltanext > (I32)U8_MAX)
+ {
FAIL2("Lookbehind longer than %" UVuf " not implemented",
(UV)U8_MAX);
}
- scan->flags = (U8)*minnextp;
+
+ if (deltanext) {
+ scan->next_off = deltanext;
+ }
+ scan->flags = (U8)*minnextp + deltanext;
}
*minnextp += min;
}
pRExC_state->warn_text = NULL;
+ pRExC_state->unlexed_names = NULL;
pRExC_state->code_blocks = NULL;
if (is_bare_re)
RExC_naughty = 0;
RExC_npar = 1;
+ RExC_parens_buf_size = 0;
RExC_emit_start = RExC_rxi->program;
pRExC_state->code_index = 0;
/* Do the parse */
if (reg(pRExC_state, 0, &flags, 1)) {
- /* Success!, But if RExC_total_parens < 0, we need to redo the parse
- * knowing how many parens there actually are */
- if (RExC_total_parens < 0) {
+ /* Success!, But we may need to redo the parse knowing how many parens
+ * there actually are */
+ if (IN_PARENS_PASS) {
flags |= RESTART_PARSE;
}
DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
}
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
/* Make enough room for all the known parens, and zero it */
Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
vFAIL("Reference to nonexistent named group");
}
else {
return;
default:
fail_modifiers:
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
I32 freeze_paren = 0;
I32 after_freeze = 0;
I32 num; /* numeric backreferences */
+ SV * max_open; /* Max number of unclosed parens */
char * parse_start = RExC_parse; /* MJD */
char * const oregcomp_parse = RExC_parse;
PERL_ARGS_ASSERT_REG;
DEBUG_PARSE("reg ");
+
+ max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD);
+ assert(max_open);
+ if (!SvIOK(max_open)) {
+ sv_setiv(max_open, RE_COMPILE_RECURSION_INIT);
+ }
+ if (depth > 4 * (UV) SvIV(max_open)) { /* We increase depth by 4 for each
+ open paren */
+ vFAIL("Too many nested open parens");
+ }
+
*flagp = 0; /* Tentatively. */
/* Having this true makes it feasible to have a lot fewer tests for the
} /* End of switch */
if ( ! op ) {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
if (has_upper || verb_len == 0) {
vFAIL2utf8f(
"Unknown verb pattern '%" UTF8f "'",
return handle_named_backref(pRExC_state, flagp,
parse_start, ')');
}
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
RExC_parse-seqstart, seqstart);
goto gen_recurse_regop;
/* NOTREACHED */
case '+':
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse++;
vFAIL("Illegal pattern");
}
goto parse_recursion;
/* NOTREACHED*/
case '-': /* (?-1) */
- if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
+ if (! inRANGE(RExC_parse[0], '1', '9')) {
RExC_parse--; /* rewind to let it be handled later */
goto parse_flags;
}
/* It might be a forward reference; we can't fail until
* we know, by completing the parse to get all the
* groups, and then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
RExC_parse++;
vFAIL("Reference to nonexistent group");
}
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
RExC_parse++;
vFAIL("Reference to nonexistent group");
case '?': /* (??...) */
is_logical = 1;
if (*RExC_parse != '{') {
- RExC_parse += SKIP_IF_CHAR(RExC_parse);
+ RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end);
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f(
"Sequence (%" UTF8f "...) not recognized",
parno = 1;
RExC_parse++;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
UV uv;
endptr = RExC_end;
if (grok_atoUV(RExC_parse, &uv, &endptr)
ret = reganode(pRExC_state, INSUBP, parno);
goto insert_if_check_paren;
}
- else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
+ else if (inRANGE(RExC_parse[0], '1', '9')) {
/* (?(1)...) */
char c;
UV uv;
insert_if_check_paren:
if (UCHARAT(RExC_parse) != ')') {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Switch condition not recognized");
}
nextchar(pRExC_state);
#endif
return ret;
}
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += UTF
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
RExC_parse--; /* for vFAIL to print correctly */
vFAIL("Sequence (? incomplete");
break;
+
+ case ')':
+ if (RExC_strict) { /* [perl #132851] */
+ ckWARNreg(RExC_parse, "Empty (?) without any modifiers");
+ }
+ /* FALLTHROUGH */
default: /* e.g., (?i) */
RExC_parse = (char *) seqstart + 1;
parse_flags:
capturing_parens:
parno = RExC_npar;
RExC_npar++;
- if (RExC_total_parens <= 0) {
+ if (! ALL_PARENS_COUNTED) {
/* If we are in our first pass through (and maybe only pass),
* we need to allocate memory for the capturing parentheses
- * data structures. Since we start at npar=1, when it reaches
- * 2, for the first time it has something to put in it. Above
- * 2 means we extend what we already have */
- if (RExC_npar == 2) {
+ * data structures.
+ */
+
+ if (!RExC_parens_buf_size) {
+ /* first guess at number of parens we might encounter */
+ RExC_parens_buf_size = 10;
+
/* setup RExC_open_parens, which holds the address of each
* OPEN tag, and to make things simpler for the 0 index the
* start of the program - this is used later for offsets */
- Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+ Newxz(RExC_open_parens, RExC_parens_buf_size,
+ regnode_offset);
RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
/* setup RExC_close_parens, which holds the address of each
* CLOSE tag, and to make things simpler for the 0 index
* the end of the program - this is used later for offsets
* */
- Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+ Newxz(RExC_close_parens, RExC_parens_buf_size,
+ regnode_offset);
/* we dont know where end op starts yet, so we dont need to
* set RExC_close_parens[0] like we do RExC_open_parens[0]
* above */
}
- else {
- Renew(RExC_open_parens, RExC_npar, regnode_offset);
- Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+ else if (RExC_npar > RExC_parens_buf_size) {
+ I32 old_size = RExC_parens_buf_size;
+
+ RExC_parens_buf_size *= 2;
- Renew(RExC_close_parens, RExC_npar, regnode_offset);
- Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+ Renew(RExC_open_parens, RExC_parens_buf_size,
+ regnode_offset);
+ Zero(RExC_open_parens + old_size,
+ RExC_parens_buf_size - old_size, regnode_offset);
+
+ Renew(RExC_close_parens, RExC_parens_buf_size,
+ regnode_offset);
+ Zero(RExC_close_parens + old_size,
+ RExC_parens_buf_size - old_size, regnode_offset);
}
}
RETURN_FAIL_ON_RESTART(flags, flagp);
FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
}
- REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
+ if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
lastbr = br;
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
}
(IV)(ender - lastbr)
);
);
- REGTAIL(pRExC_state, lastbr, ender);
+ if (! REGTAIL(pRExC_state, lastbr, ender)) {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if (have_branch) {
char is_nothing= 1;
for (br = REGNODE_p(ret); br; br = regnext(br)) {
const U8 op = PL_regkind[OP(br)];
if (op == BRANCH) {
- REGTAIL_STUDY(pRExC_state,
- REGNODE_OFFSET(NEXTOPER(br)),
- ender);
+ if (! REGTAIL_STUDY(pRExC_state,
+ REGNODE_OFFSET(NEXTOPER(br)),
+ ender))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
if ( OP(NEXTOPER(br)) != NOTHING
|| regnext(NEXTOPER(br)) != REGNODE_p(ender))
is_nothing= 0;
Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
Set_Node_Offset(REGNODE_p(ret), parse_start + 1);
FLAGS(REGNODE_p(ret)) = flag;
- REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
+ if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)))
+ {
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
}
}
/* FIXME adding one for every branch after the first is probably
* excessive now we have TRIE support. (hv) */
MARK_NAUGHTY(1);
- if ( chain > (SSize_t) BRANCH_MAX_OFFSET
- && ! RExC_use_BRANCHJ)
- {
+ if (! REGTAIL(pRExC_state, chain, latest)) {
/* XXX We could just redo this branch, but figuring out what
- * bookkeeping needs to be reset is a pain */
+ * bookkeeping needs to be reset is a pain, and it's likely
+ * that other branches that goto END will also be too large */
REQUIRE_BRANCHJ(flagp, 0);
}
- REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
c++;
* points) that this \N sequence matches. This is set, and the input is
* parsed for errors, even if the function returns FALSE, as detailed below.
*
- * There are 5 possibilities here, as detailed in the next 5 paragraphs.
+ * There are 6 possibilities here, as detailed in the next 6 paragraphs.
*
* Probably the most common case is for the \N to specify a single code point.
* *cp_count will be set to 1, and *code_point_p will be set to that code
* point.
*
- * Another possibility is for the input to be an empty \N{}, which for
- * backwards compatibility we accept. *cp_count will be set to 0. *node_p
- * will be set to a generated NOTHING node.
+ * Another possibility is for the input to be an empty \N{}. This is no
+ * longer accepted, and will generate a fatal error.
+ *
+ * Another possibility is for a custom charnames handler to be in effect which
+ * translates the input name to an empty string. *cp_count will be set to 0.
+ * *node_p will be set to a generated NOTHING node.
*
* Still another possibility is for the \N to mean [^\n]. *cp_count will be
* set to 0. *node_p will be set to a generated REG_ANY node.
*
- * The fourth possibility is that \N resolves to a sequence of more than one
+ * The fifth possibility is that \N resolves to a sequence of more than one
* code points. *cp_count will be set to the number of code points in the
* sequence. *node_p will be set to a generated node returned by this
* function calling S_reg().
* The final possibility is that it is premature to be calling this function;
* the parse needs to be restarted. This can happen when this changes from
* /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
- * latter occurs only when the fourth possibility would otherwise be in
+ * latter occurs only when the fifth possibility would otherwise be in
* effect, and is because one of those code points requires the pattern to be
* recompiled as UTF-8. The function returns FALSE, and sets the
* RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this
* so we need a way to take a snapshot of what they resolve to at the time of
* the original parse. [perl #56444].
*
- * That parsing is skipped for single-quoted regexes, so we may here get
- * '\N{NAME}'. This is a fatal error. These names have to be resolved by the
- * parser. But if the single-quoted regex is something like '\N{U+41}', that
- * is legal and handled here. The code point is Unicode, and has to be
- * translated into the native character set for non-ASCII platforms.
- */
+ * That parsing is skipped for single-quoted regexes, so here we may get
+ * '\N{NAME}', which is parsed now. If the single-quoted regex is something
+ * like '\N{U+41}', that code point is Unicode, and has to be translated into
+ * the native character set for non-ASCII platforms. The other possibilities
+ * are already native, so no translation is done. */
char * endbrace; /* points to '}' following the name */
char* p = RExC_parse; /* Temporary */
char *orig_end;
char *save_start;
I32 flags;
- Size_t count = 0; /* code point count kept internally by this function */
GET_RE_DEBUG_FLAGS_DECL;
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The latter is assumed when the {...} following the \N is a legal
- * quantifier, or there is no '{' at all */
+ * quantifier, or if there is no '{' at all */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
if (cp_count) {
vFAIL2("Missing right brace on \\%c{}", 'N');
}
- /* Here, we have decided it should be a named character or sequence */
- REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode
- semantics */
+ /* Here, we have decided it should be a named character or sequence. These
+ * imply Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, FALSE);
- if (endbrace == RExC_parse) { /* empty: \N{} */
+ /* \N{_} is what toke.c returns to us to indicate a name that evaluates to
+ * nothing at all (not allowed under strict) */
+ if (endbrace - RExC_parse == 1 && *RExC_parse == '_') {
+ RExC_parse = endbrace;
if (strict) {
RExC_parse++; /* Position after the "}" */
vFAIL("Zero length \\N{}");
}
+
if (cp_count) {
*cp_count = 0;
}
return TRUE;
}
- /* If we haven't got something that begins with 'U+', then it didn't get lexed. */
- if ( endbrace - RExC_parse < 2
- || strnNE(RExC_parse, "U+", 2))
- {
- RExC_parse = endbrace; /* position msg's '<--HERE' */
- vFAIL("\\N{NAME} must be resolved by the lexer");
- }
+ if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) {
+
+ /* Here, the name isn't of the form U+.... This can happen if the
+ * pattern is single-quoted, so didn't get evaluated in toke.c. Now
+ * is the time to find out what the name means */
+
+ const STRLEN name_len = endbrace - RExC_parse;
+ SV * value_sv; /* What does this name evaluate to */
+ SV ** value_svp;
+ const U8 * value; /* string of name's value */
+ STRLEN value_len; /* and its length */
+
+ /* RExC_unlexed_names is a hash of names that weren't evaluated by
+ * toke.c, and their values. Make sure is initialized */
+ if (! RExC_unlexed_names) {
+ RExC_unlexed_names = newHV();
+ }
+
+ /* If we have already seen this name in this pattern, use that. This
+ * allows us to only call the charnames handler once per name per
+ * pattern. A broken or malicious handler could return something
+ * different each time, which could cause the results to vary depending
+ * on if something gets added or subtracted from the pattern that
+ * causes the number of passes to change, for example */
+ if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse,
+ name_len, 0)))
+ {
+ value_sv = *value_svp;
+ }
+ else { /* Otherwise we have to go out and get the name */
+ const char * error_msg = NULL;
+ value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace,
+ UTF,
+ &error_msg);
+ if (error_msg) {
+ RExC_parse = endbrace;
+ vFAIL(error_msg);
+ }
+
+ /* If no error message, should have gotten a valid return */
+ assert (value_sv);
+
+ /* Save the name's meaning for later use */
+ if (! hv_store(RExC_unlexed_names, RExC_parse, name_len,
+ value_sv, 0))
+ {
+ Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+ }
+ }
+
+ /* Here, we have the value the name evaluates to in 'value_sv' */
+ value = (U8 *) SvPV(value_sv, value_len);
+
+ /* See if the result is one code point vs 0 or multiple */
+ if (value_len > 0 && value_len <= (UV) ((SvUTF8(value_sv))
+ ? UTF8SKIP(value)
+ : 1))
+ {
+ /* Here, exactly one code point. If that isn't what is wanted,
+ * fail */
+ if (! code_point_p) {
+ RExC_parse = p;
+ return FALSE;
+ }
+
+ /* Convert from string to numeric code point */
+ *code_point_p = (SvUTF8(value_sv))
+ ? valid_utf8_to_uvchr(value, NULL)
+ : *value;
+
+ /* Have parsed this entire single code point \N{...}. *cp_count
+ * has already been set to 1, so don't do it again. */
+ RExC_parse = endbrace;
+ nextchar(pRExC_state);
+ return TRUE;
+ } /* End of is a single code point */
+
+ /* Count the code points, if caller desires. The API says to do this
+ * even if we will later return FALSE */
+ if (cp_count) {
+ *cp_count = 0;
+
+ *cp_count = (SvUTF8(value_sv))
+ ? utf8_length(value, value + value_len)
+ : value_len;
+ }
+
+ /* Fail if caller doesn't want to handle a multi-code-point sequence.
+ * But don't back the pointer up if the caller wants to know how many
+ * code points there are (they need to handle it themselves in this
+ * case). */
+ if (! node_p) {
+ if (! cp_count) {
+ RExC_parse = p;
+ }
+ return FALSE;
+ }
+
+ /* Convert this to a sub-pattern of the form "(?: ... )", and then call
+ * reg recursively to parse it. That way, it retains its atomicness,
+ * while not having to worry about any special handling that some code
+ * points may have. */
+
+ substitute_parse = newSVpvs("?:");
+ sv_catsv(substitute_parse, value_sv);
+ sv_catpv(substitute_parse, ")");
+
+#ifdef EBCDIC
+ /* The value should already be native, so no need to convert on EBCDIC
+ * platforms.*/
+ assert(! RExC_recode_x_to_native);
+#endif
- /* This code purposely indented below because of future changes coming */
+ }
+ else { /* \N{U+...} */
+ Size_t count = 0; /* code point count kept internally */
/* We can get to here when the input is \N{U+...} or when toke.c has
* converted a name to the \N{U+...} form. This include changing a
RExC_recode_x_to_native = 1;
#endif
+ }
+
/* Here, we have the string the name evaluates to, ready to be parsed,
* stored in 'substitute_parse' as a series of valid "\x{...}\x{...}"
* constructs. This can be called from within a substitute parse already.
char name = *RExC_parse;
char * endbrace = NULL;
RExC_parse += 2;
- if (RExC_parse < RExC_end) {
- endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
- }
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
/* It might be a forward reference; we can't fail until we
* know, by completing the parse to get all the groups, and
* then reparsing */
- if (RExC_total_parens > 0) {
+ if (ALL_PARENS_COUNTED) {
if (num >= RExC_total_parens) {
vFAIL("Reference to nonexistent group");
}
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
-
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full, or
has_micro_sign = TRUE;
}
- *(s++) = (char) (DEPENDS_SEMANTICS)
- ? toFOLD(ender)
-
- /* Under /u, the fold of any
- * character in the 0-255 range
- * happens to be its lowercase
- * equivalent, except for LATIN SMALL
- * LETTER SHARP S, which was handled
- * above, and the MICRO SIGN, whose
- * fold requires UTF-8 to represent.
- * */
- : toLOWER_L1(ender);
+ *(s++) = (DEPENDS_SEMANTICS)
+ ? (char) toFOLD(ender)
+
+ /* Under /u, the fold of any character in
+ * the 0-255 range happens to be its
+ * lowercase equivalent, except for LATIN
+ * SMALL LETTER SHARP S, which was handled
+ * above, and the MICRO SIGN, whose fold
+ * requires UTF-8 to represent. */
+ : (char) toLOWER_L1(ender);
}
} /* End of adding current character to the node */
else {
/* Point to the first byte of the final character */
- s = (char *) utf8_hop((U8 *) s, -1);
+ s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
while (s >= s0) { /* Search backwards until find
a non-problematic char */
RExC_emit += STR_SZ(len);
/* If the node isn't a single character, it can't be SIMPLE */
- if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
maybe_SIMPLE = 0;
}
RExC_parse = RExC_end;
}
else if (RExC_parse != save_parse) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
}
vFAIL("Expecting '(?flags:(?[...'");
}
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
/* regclass() will return with parsing just the \ sequence,
FALSE, /* Require return to be an ANYOF */
¤t))
{
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
+ goto regclass_failed;
}
if (! current) {
}
if (!node)
- FAIL2("panic: regclass returned failure to handle_sets, flags=%#" UVxf,
- PTR2UV(flagp));
+ goto regclass_failed;
/* Fix up the node type if we are in locale. (We have pretended we are
* under /u for the purposes of regclass(), as this construct will only
nextchar(pRExC_state);
Set_Node_Length(REGNODE_p(node), RExC_parse - oregcomp_parse + 1); /* MJD */
return node;
+
+ regclass_failed:
+ FAIL2("panic: regclass returned failure to handle_sets, " "flags=%#" UVxf,
+ (UV) *flagp);
}
#ifdef ENABLE_REGEX_SETS_DEBUGGING
S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
const bool stop_at_1, /* Just parse the next thing, don't
look for a full character class */
- bool allow_multi_folds,
+ bool allow_mutiple_chars,
const bool silence_non_portable, /* Don't output warnings
about too large
characters */
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && UNICODE_DOT_VERSION == 0 \
&& UNICODE_DOT_DOT_VERSION == 0)
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
#endif
/* We include the /i status at the beginning of this so that we can
if (UCHARAT(RExC_parse) == '^') { /* Complement the class */
RExC_parse++;
invert = TRUE;
- allow_multi_folds = FALSE;
+ allow_mutiple_chars = FALSE;
MARK_NAUGHTY(1);
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
}
"Ignoring zero length \\N{} in character class");
}
else { /* cp_count > 1 */
+ assert(cp_count > 1);
if (! RExC_in_multi_char_class) {
if (invert || range || *RExC_parse == '-') {
if (strict) {
} /* The \p isn't immediately followed by a '{' */
else if (! isALPHA(*RExC_parse)) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL2("Character following \\%c must be '{' or a "
"single-character Unicode property name",
(U8) value);
SV * prop_definition = parse_uniprop_string(
name, n, UTF, FOLD,
FALSE, /* This is compile-time */
+
+ /* We can't defer this defn when
+ * the full result is required in
+ * this call */
+ ! cBOOL(ret_invlist),
+
&user_defined,
msg,
0 /* Base level */
RExC_parse += numlen;
if (numlen != 3) {
if (strict) {
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
vFAIL("Need exactly 3 octal digits");
}
else if ( numlen < 3 /* like \08, \178 */
* "ss" =~ /^[^\xDF]+$/i => N
*
* See [perl #89750] */
- if (FOLD && allow_multi_folds && value == prevvalue) {
+ if (FOLD && allow_mutiple_chars && value == prevvalue) {
if ( value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
value)))
RExC_emit += 1 + STR_SZ(len);
STR_LEN(REGNODE_p(ret)) = len;
if (len == 1) {
- *STRING(REGNODE_p(ret)) = value;
+ *STRING(REGNODE_p(ret)) = (U8) value;
}
else {
uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
* bitmap, optimize to indicate that */
if ( start[0] >= NUM_ANYOF_CODE_POINTS
&& ! LOC
- && ! upper_latin1_only_utf8_matches)
+ && ! upper_latin1_only_utf8_matches
+ && anyof_flags == 0)
{
+ UV highest_cp = invlist_highest(cp_list);
+
+ /* If the lowest and highest code point in the class have the same
+ * UTF-8 first byte, then all do, and we can store that byte for
+ * regexec.c to use so that it can more quickly scan the target
+ * string for potential matches for this class. We co-opt the the
+ * flags field for this. Zero means, they don't have the same
+ * first byte. We do accept here very large code points (for
+ * future use), but don't bother with this optimization for them,
+ * as it would cause other complications */
+ if (highest_cp > IV_MAX) {
+ anyof_flags = 0;
+ }
+ else {
+ U8 low_utf8[UTF8_MAXBYTES+1];
+ U8 high_utf8[UTF8_MAXBYTES+1];
+
+ (void) uvchr_to_utf8(low_utf8, start[0]);
+ (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list));
+
+ anyof_flags = (low_utf8[0] == high_utf8[0])
+ ? low_utf8[0]
+ : 0;
+ }
+
op = ANYOFH;
}
} /* End of seeing if can optimize it into a different node */
stored here for just
this occasion */
TRUE, /* run time */
+ FALSE, /* This call must find the defn */
si, /* The property definition */
&user_defined,
msg,
|| UTF8_IS_INVARIANT(*RExC_parse)
|| UTF8_IS_START(*RExC_parse));
- RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ RExC_parse += (UTF)
+ ? UTF8_SAFE_SKIP(RExC_parse, RExC_end)
+ : 1;
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force /x */ );
src = REGNODE_p(RExC_emit);
RExC_emit += size;
dst = REGNODE_p(RExC_emit);
- if (RExC_open_parens) {
+
+ /* If we are in a "count the parentheses" pass, the numbers are unreliable,
+ * and [perl #133871] shows this can lead to problems, so skip this
+ * realignment of parens until a later pass when they are reliable */
+ if (! IN_PARENS_PASS && RExC_open_parens) {
int paren;
/*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/
/* remember that RExC_npar is rex->nparens + 1,
}
/*
-- regtail - set the next-pointer at the end of a node chain of p to val.
+- regtail - set the next-pointer at the end of a node chain of p to val. If
+ that value won't fit in the space available, instead returns FALSE.
+ (Except asserts if we can't fit in the largest space the regex
+ engine is designed for.)
- SEE ALSO: regtail_study
*/
-STATIC void
+STATIC bool
S_regtail(pTHX_ RExC_state_t * pRExC_state,
const regnode_offset p,
const regnode_offset val,
}
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
+ if (val - scan > U16_MAX) {
+ /* Since not all callers check the return value, populate this with
+ * something that won't loop and will likely lead to a crash if
+ * execution continues */
+ NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+ return FALSE;
+ }
NEXT_OFF(REGNODE_p(scan)) = val - scan;
}
+
+ return TRUE;
}
#ifdef DEBUGGING
Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
to control which is which.
+This used to return a value that was ignored. It was a problem that it is
+#ifdef'd to be another function that didn't return a value. khw has changed it
+so both currently return a pass/fail return.
+
*/
/* TODO: All four parms should be const */
-STATIC U8
+STATIC bool
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p,
const regnode_offset val, U32 depth)
{
bool unfolded_multi_char; /* Unexamined in this routine */
if (join_exact(pRExC_state, scan, &min,
&unfolded_multi_char, 1, REGNODE_p(val), depth+1))
- return EXACT;
+ return TRUE; /* Was return EXACT */
}
#endif
if ( exact ) {
);
});
if (reg_off_by_arg[OP(REGNODE_p(scan))]) {
+ assert((UV) (val - scan) <= U32_MAX);
ARG_SET(REGNODE_p(scan), val - scan);
}
else {
+ if (val - scan > U16_MAX) {
+ NEXT_OFF(REGNODE_p(scan)) = U16_MAX;
+ return FALSE;
+ }
NEXT_OFF(REGNODE_p(scan)) = val - scan;
}
- return exact;
+ return TRUE; /* Was 'return exact' */
}
#endif
/* 2: embedded, otherwise 1 */
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags);
else if (k == ANYOF) {
- const U8 flags = ANYOF_FLAGS(o);
+ const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o);
bool do_sep = FALSE; /* Do we need to separate various components of
the output? */
/* Set if there is still an unresolved user-defined property */
/* And finally the matching, closing ']' */
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+ if (OP(o) == ANYOFH && FLAGS(o) != 0) {
+ Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o));
+ }
+
+
SvREFCNT_dec(unresolved);
}
else if (k == ANYOFM) {
assert(FLAGS(o) < C_ARRAY_LENGTH(bounds));
sv_catpv(sv, bounds[FLAGS(o)]);
}
- else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
- Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
+ else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) {
+ Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags));
+ if (o->next_off) {
+ Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off);
+ }
+ Perl_sv_catpvf(aTHX_ sv, "]");
+ }
else if (OP(o) == SBOL)
Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^");
if (!dsv)
dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
+ assert(SvTYPE(dsv) == SVt_REGEXP || (SvTYPE(dsv) == SVt_PVLV));
+
+ /* our only valid caller, sv_setsv_flags(), should have done
+ * a SV_CHECK_THINKFIRST_COW_DROP() by now */
+ assert(!SvOOK(dsv));
+ assert(!SvIsCOW(dsv));
+ assert(!SvROK(dsv));
+
+ if (SvPVX_const(dsv)) {
+ if (SvLEN(dsv))
+ Safefree(SvPVX(dsv));
+ SvPVX(dsv) = NULL;
+ }
+ SvLEN_set(dsv, 0);
+ SvCUR_set(dsv, 0);
SvOK_off((SV *)dsv);
+
if (islv) {
/* For PVLVs, the head (sv_any) points to an XPVLV, while
* the LV's xpvlenu_rx will point to a regexp body, which
2: something we no longer hold a reference on
so we need to copy it locally. */
RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
+ /* set malloced length to a non-zero value so it will be freed
+ * (otherwise in combination with SVf_FAKE it looks like an alien
+ * buffer). It doesn't have to be the actual malloced size, since it
+ * should never be grown */
+ SvLEN_set(dstr, SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist);
PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]);
PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist);
+ PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]);
#ifdef UNI_XIDC
/* The below are used only by deprecated functions. They could be removed */
const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
const bool to_fold, /* ? Is this under /i */
const bool runtime, /* ? Are we in compile- or run-time */
+ const bool deferrable, /* Is it ok for this property's full definition
+ to be deferred until later? */
SV* contents, /* The property's definition */
bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be
getting called unless this is thought to be
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
/* Accumulate this digit into the value */
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
max = (max << 4) + READ_XDIGIT(s);
Perl_sv_catpvf(aTHX_ msg, "%" UTF8f,
UTF8fARG(is_contents_utf8, s - s0, s0));
sv_catpvs(msg, "\"");
- goto return_msg;
+ goto return_failure;
}
#if 0 /* See explanation at definition above of get_extended_utf8_msg() */
this_definition = parse_uniprop_string(s0, s - s0,
is_utf8, to_fold, runtime,
+ deferrable,
user_defined_ptr, msg,
(name_len == 0)
? level /* Don't increase level
: level + 1
);
if (this_definition == NULL) {
- goto return_msg; /* 'msg' should have had the reason appended to
- it by the above call */
+ goto return_failure; /* 'msg' should have had the reason
+ appended to it by the above call */
}
if (! is_invlist(this_definition)) { /* Unknown at this time */
}
/* Otherwise, add some explanatory text, but we will return success */
+ goto return_msg;
+
+ return_failure:
+ running_definition = NULL;
return_msg:
const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */
const bool to_fold, /* ? Is this under /i */
const bool runtime, /* TRUE if this is being called at run time */
+ const bool deferrable, /* TRUE if it's ok for the definition to not be
+ known at this call */
bool *user_defined_ptr, /* Upon return from this function it will be
set to TRUE if any component is a
user-defined property */
}
}
+ /* Most punctuation after the equals indicates a subpattern, like
+ * \p{foo=/bar/} */
+ if ( isPUNCT_A(name[i])
+ && name[i] != '-'
+ && name[i] != '+'
+ && name[i] != '_'
+ && name[i] != '{')
+ {
+ /* Find the property. The table includes the equals sign, so we
+ * use 'j' as-is */
+ table_index = match_uniprop((U8 *) lookup_name, j);
+ if (table_index) {
+ const char * const * prop_values
+ = UNI_prop_value_ptrs[table_index];
+ SV * subpattern;
+ Size_t subpattern_len;
+ REGEXP * subpattern_re;
+ char open = name[i++];
+ char close;
+ const char * pos_in_brackets;
+ bool escaped = 0;
+
+ /* A backslash means the real delimitter is the next character.
+ * */
+ if (open == '\\') {
+ open = name[i++];
+ escaped = 1;
+ }
+
+ /* This data structure is constructed so that the matching
+ * closing bracket is 3 past its matching opening. The second
+ * set of closing is so that if the opening is something like
+ * ']', the closing will be that as well. Something similar is
+ * done in toke.c */
+ pos_in_brackets = strchr("([<)]>)]>", open);
+ close = (pos_in_brackets) ? pos_in_brackets[3] : open;
+
+ if ( name[name_len-1] != close
+ || (escaped && name[name_len-2] != '\\'))
+ {
+ sv_catpvs(msg, "Unicode property wildcard not terminated");
+ goto append_name_to_msg;
+ }
+
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS),
+ "The Unicode property wildcards feature is experimental");
+
+ /* Now create and compile the wildcard subpattern. Use /iaa
+ * because nothing outside of ASCII will match, and it the
+ * property values should all match /i. Note that when the
+ * pattern fails to compile, our added text to the user's
+ * pattern will be displayed to the user, which is not so
+ * desirable. */
+ subpattern_len = name_len - i - 1 - escaped;
+ subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)",
+ (unsigned) subpattern_len,
+ name + i);
+ subpattern = sv_2mortal(subpattern);
+ subpattern_re = re_compile(subpattern, 0);
+ assert(subpattern_re); /* Should have died if didn't compile
+ successfully */
+
+ /* For each legal property value, see if the supplied pattern
+ * matches it. */
+ while (*prop_values) {
+ const char * const entry = *prop_values;
+ const Size_t len = strlen(entry);
+ SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP);
+
+ if (pregexec(subpattern_re,
+ (char *) entry,
+ (char *) entry + len,
+ (char *) entry, 0,
+ entry_sv,
+ 0))
+ { /* Here, matched. Add to the returned list */
+ Size_t total_len = j + len;
+ SV * sub_invlist = NULL;
+ char * this_string;
+
+ /* We know this is a legal \p{property=value}. Call
+ * the function to return the list of code points that
+ * match it */
+ Newxz(this_string, total_len + 1, char);
+ Copy(lookup_name, this_string, j, char);
+ my_strlcat(this_string, entry, total_len + 1);
+ SAVEFREEPV(this_string);
+ sub_invlist = parse_uniprop_string(this_string,
+ total_len,
+ is_utf8,
+ to_fold,
+ runtime,
+ deferrable,
+ user_defined_ptr,
+ msg,
+ level + 1);
+ _invlist_union(prop_definition, sub_invlist,
+ &prop_definition);
+ }
+
+ prop_values++; /* Next iteration, look at next propvalue */
+ } /* End of looking through property values; (the data
+ structure is terminated by a NULL ptr) */
+
+ SvREFCNT_dec_NN(subpattern_re);
+
+ if (prop_definition) {
+ return prop_definition;
+ }
+
+ sv_catpvs(msg, "No Unicode property value wildcard matches:");
+ goto append_name_to_msg;
+ }
+
+ /* Here's how khw thinks we should proceed to handle the properties
+ * not yet done: Bidi Mirroring Glyph
+ Bidi Paired Bracket
+ Case Folding (both full and simple)
+ Decomposition Mapping
+ Equivalent Unified Ideograph
+ Name
+ Name Alias
+ Lowercase Mapping (both full and simple)
+ NFKC Case Fold
+ Titlecase Mapping (both full and simple)
+ Uppercase Mapping (both full and simple)
+ * Move the part that looks at the property values into a perl
+ * script, like utf8_heavy.pl is done. This makes things somewhat
+ * easier, but most importantly, it avoids always adding all these
+ * strings to the memory usage when the feature is little-used.
+ *
+ * The property values would all be concatenated into a single
+ * string per property with each value on a separate line, and the
+ * code point it's for on alternating lines. Then we match the
+ * user's input pattern m//mg, without having to worry about their
+ * uses of '^' and '$'. Only the values that aren't the default
+ * would be in the strings. Code points would be in UTF-8. The
+ * search pattern that we would construct would look like
+ * (?: \n (code-point_re) \n (?aam: user-re ) \n )
+ * And so $1 would contain the code point that matched the user-re.
+ * For properties where the default is the code point itself, such
+ * as any of the case changing mappings, the string would otherwise
+ * consist of all Unicode code points in UTF-8 strung together.
+ * This would be impractical. So instead, examine their compiled
+ * pattern, looking at the ssc. If none, reject the pattern as an
+ * error. Otherwise run the pattern against every code point in
+ * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets
+ * And it might be good to create an API to return the ssc.
+ *
+ * For the name properties, a new function could be created in
+ * charnames which essentially does the same thing as above,
+ * sharing Name.pl with the other charname functions. Don't know
+ * about loose name matching, or algorithmically determined names.
+ * Decomposition.pl similarly.
+ *
+ * It might be that a new pattern modifier would have to be
+ * created, like /t for resTricTed, which changed the behavior of
+ * some constructs in their subpattern, like \A. */
+ } /* End of is a wildcard subppattern */
+
+
/* Certain properties whose values are numeric need special handling.
* They may optionally be prefixed by 'is'. Ignore that prefix for the
* purposes of checking if this is one of those properties */
* for this property in the hash. So we have the go ahead to
* expand the definition ourselves. */
+ PUSHSTACKi(PERLSI_MAGIC);
ENTER;
/* Create a temporary placeholder in the hash to detect recursion
* handle it */
prop_definition = handle_user_defined_property(name, name_len,
is_utf8, to_fold, runtime,
+ deferrable,
POPs, user_defined_ptr,
msg,
level);
}
- /* Here, we have the results of the expansion. Replace the
- * placeholder with them. We need exclusive access to the hash,
- * and we can't let anyone else in, between when we delete the
- * placeholder and add the permanent entry */
+ /* Here, we have the results of the expansion. Delete the
+ * placeholder, and if the definition is now known, replace it with
+ * that definition. We need exclusive access to the hash, and we
+ * can't let anyone else in, between when we delete the placeholder
+ * and add the permanent entry */
USER_PROP_MUTEX_LOCK;
S_delete_recursion_entry(aTHX_ SvPVX(fq_name));
FREETMPS;
LEAVE;
+ POPSTACK;
if (prop_definition) {
* compile time, it might just be that the subroutine for that
* property hasn't been encountered yet, but at runtime, it's
* an error to try to use an undefined one */
- if (runtime) {
+ if (! deferrable) {
if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
sv_catpvs(msg, "Unknown user-defined property name");
goto append_name_to_msg;
/* Create and return the inversion list */
prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]);
+ sv_2mortal(prop_definition);
+
+
+ /* See if there is a private use override to add to this definition */
+ {
+ COPHH * hinthash = (IN_PERL_COMPILETIME)
+ ? CopHINTHASH_get(&PL_compiling)
+ : CopHINTHASH_get(PL_curcop);
+ SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0);
+
+ if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) {
+
+ /* See if there is an element in the hints hash for this table */
+ SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index);
+ const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup));
+
+ if (pos) {
+ bool dummy;
+ SV * pu_definition;
+ SV * pu_invlist;
+ SV * expanded_prop_definition =
+ sv_2mortal(invlist_clone(prop_definition, NULL));
+
+ /* If so, it's definition is the string from here to the next
+ * \a character. And its format is the same as a user-defined
+ * property */
+ pos += SvCUR(pu_lookup);
+ pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos);
+ pu_invlist = handle_user_defined_property(lookup_name,
+ lookup_len,
+ 0, /* Not UTF-8 */
+ 0, /* Not folded */
+ runtime,
+ deferrable,
+ pu_definition,
+ &dummy,
+ msg,
+ level);
+ if (TAINT_get) {
+ if (SvCUR(msg) > 0) sv_catpvs(msg, "; ");
+ sv_catpvs(msg, "Insecure private-use override");
+ goto append_name_to_msg;
+ }
+
+ /* For now, as a safety measure, make sure that it doesn't
+ * override non-private use code points */
+ _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist);
+
+ /* Add it to the list to be returned */
+ _invlist_union(prop_definition, pu_invlist,
+ &expanded_prop_definition);
+ prop_definition = expanded_prop_definition;
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental");
+ }
+ }
+ }
+
if (invert_return) {
_invlist_invert(prop_definition);
}
- sv_2mortal(prop_definition);
return prop_definition;