corresponding to copy_start */
SSize_t whilem_seen; /* number of WHILEM in this expr */
regnode *emit_start; /* Start of emitted-code area */
- regnode *emit_bound; /* First regnode outside of the
- allocated space */
regnode_offset emit; /* Code-emit pointer */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
SSize_t size; /* Number of regnode equivalents in
pattern */
- I32 npar; /* Capture buffer count, (OPEN) plus
- one. ("par" 0 is the whole
- pattern)*/
+
+ /* position beyond 'precomp' of the warning message furthest away from
+ * 'precomp'. During the parse, no warnings are raised for any problems
+ * earlier in the parse than this position. This works if warnings are
+ * raised the first time a given spot is parsed, and if only one
+ * independent warning is raised for any given spot */
+ Size_t latest_warn_offset;
+
+ I32 npar; /* Capture buffer count so far in the
+ parse, (OPEN) plus one. ("par" 0 is
+ the whole pattern)*/
+ I32 total_par; /* During initial parse, is either 0,
+ or -1; the latter indicating a
+ reparse is needed. After that pass,
+ it is what 'npar' became after the
+ pass. Hence, it being > 0 indicates
+ we are in a reparse situation */
I32 nestroot; /* root parens we are in - used by
accept */
- I32 extralen;
I32 seen_zerolen;
regnode_offset *open_parens; /* offsets to open parens */
regnode_offset *close_parens; /* offsets to close parens */
#define RExC_mysv2 (pRExC_state->mysv2)
#endif
- bool seen_unfolded_sharp_s;
+ bool seen_d_op;
bool strict;
bool study_started;
bool in_script_run;
- bool pass1;
+ bool use_BRANCHJ;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_start (pRExC_state->start)
#define RExC_end (pRExC_state->end)
#define RExC_parse (pRExC_state->parse)
+#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
#define RExC_whilem_seen (pRExC_state->whilem_seen)
+#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
+ under /d from /u ? */
-/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
- * EXACTF node, hence was parsed under /di rules. If later in the parse,
- * something forces the pattern into using /ui rules, the sharp s should be
- * folded into the sequence 'ss', which takes up more space than previously
- * calculated. This means that the sizing pass needs to be restarted. (The
- * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
- * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
- * so there is no need to resize [perl #125990]. */
-#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
#ifdef RE_TRACK_PATTERN_OFFSETS
# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
others */
#endif
#define RExC_emit (pRExC_state->emit)
-#define RExC_pass1 (pRExC_state->pass1)
#define RExC_emit_start (pRExC_state->emit_start)
-#define RExC_emit_bound (pRExC_state->emit_bound)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
#define RExC_size (pRExC_state->size)
#define RExC_maxlen (pRExC_state->maxlen)
#define RExC_npar (pRExC_state->npar)
+#define RExC_total_parens (pRExC_state->total_par)
#define RExC_nestroot (pRExC_state->nestroot)
-#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_uni_semantics (pRExC_state->uni_semantics)
#define RExC_study_started (pRExC_state->study_started)
#define RExC_warn_text (pRExC_state->warn_text)
#define RExC_in_script_run (pRExC_state->in_script_run)
+#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
* Flags to be passed up and down.
*/
#define WORST 0 /* Worst case. */
-#define HASWIDTH 0x01 /* Known to match non-null strings. */
+#define HASWIDTH 0x01 /* Known to not match null strings, could match
+ non-null ones. */
/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
* character. (There needs to be a case: in the switch statement in regexec.c
#define REQUIRE_UTF8(flagp) STMT_START { \
if (!UTF) { \
- assert(PASS1); \
*flagp = RESTART_PARSE|NEED_UTF8; \
return 0; \
} \
} STMT_END
-/* Change from /d into /u rules, and restart the parse if we've already seen
- * something whose size would increase as a result, by setting *flagp and
- * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
- * we've changed to /u during the parse. */
+/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
+ * a flag that indicates we've changed to /u during the parse. */
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
- assert(PASS1); \
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
RExC_uni_semantics = 1; \
- if (RExC_seen_unfolded_sharp_s) { \
+ if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \
+ /* No need to restart the parse if we haven't seen \
+ * anything that differs between /u and /d, and no need \
+ * to restart immediately if we're going to reparse \
+ * anyway to count parens */ \
*flagp |= RESTART_PARSE; \
return restart_retval; \
} \
} \
} STMT_END
-/* Executes a return statement with the value 'X', if 'flags' contains any of
- * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
- * flags */
-#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
+#define BRANCH_MAX_OFFSET U16_MAX
+#define REQUIRE_BRANCHJ(flagp, restart_retval) \
+ STMT_START { \
+ RExC_use_BRANCHJ = 1; \
+ if (LIKELY(RExC_total_parens >= 0)) { \
+ /* Restart the parse now: set RESTART_PARSE in '*flagp' \
+ * and return 'restart_retval' from the calling function. \
+ * This is skipped when RExC_total_parens is negative, \
+ * as there is no need to restart the parse immediately \
+ * if we're going to reparse anyway to count parens; \
+ * RExC_use_BRANCHJ has been set either way. */ \
+ *flagp |= RESTART_PARSE; \
+ return restart_retval; \
+ } \
+ } STMT_END
+
+#define REQUIRE_PARENS_PASS \
+ STMT_START { \
+ if (RExC_total_parens == 0) RExC_total_parens = -1; \
+ } STMT_END
+
+/* This is used to return failure (zero) early from the calling function if
+ * various flags in 'flags' are set. Two flags always cause a return:
+ * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
+ * additional flags that should cause a return; 0 if none. If the return will
+ * be done, '*flagp' is first set to be all of the flags that caused the
+ * return. */
+#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
STMT_START { \
if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
*(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
- return X; \
+ return 0; \
} \
} STMT_END
-#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
- RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
-
-#define RETURN_X_ON_RESTART(X, flags,flagp) \
- RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
-
-
-#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
- if (*(flagp) & (RESTART_PARSE|(extra))) return 0
-
#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
#define RETURN_FAIL_ON_RESTART(flags,flagp) \
- RETURN_X_ON_RESTART(0, flags,flagp)
+ RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
+ if (MUST_RESTART(*(flagp))) return 0
/* This converts the named class defined in regcomp.h to its equivalent class
* number defined in handy.h. */
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
== REGEX_DEPENDS_CHARSET)
+/* Use RExC_uni_semantics instead of this:
#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+*/
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
>= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
#define REPORT_LOCATION_ARGS(xC) \
UTF8fARG(UTF, \
(xI(xC) > eI) /* Don't run off end */ \
- ? eC - sC /* Length before the <--HERE */ \
+ ? eI - sI /* Length before the <--HERE */ \
: ((xI_offset(xC) >= 0) \
? xI_offset(xC) \
: (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
* past a nul byte. */
#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
+/* Set up to clean up after our imminent demise: passes RExC_rx_sv to
+ * SAVEFREESV, and RExC_open_parens and RExC_close_parens to SAVEFREEPV, each
+ * only if it is non-NULL.  Presumably this arranges for them to be freed when
+ * the croak unwinds, so that they are not leaked -- confirm against the
+ * SAVEFREESV/SAVEFREEPV documentation. */
+#define PREPARE_TO_DIE \
+ STMT_START { \
+ if (RExC_rx_sv) \
+ SAVEFREESV(RExC_rx_sv); \
+ if (RExC_open_parens) \
+ SAVEFREEPV(RExC_open_parens); \
+ if (RExC_close_parens) \
+ SAVEFREEPV(RExC_close_parens); \
+ } STMT_END
+
/*
* Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
* arg. Show regex, up to a maximum length. If it's too long, chop and add
const char *ellipses = ""; \
IV len = RExC_precomp_end - RExC_precomp; \
\
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
if (len > RegexLengthToShowInErrorMessages) { \
/* chop 10 shorter than the max, to ensure meaning of "..." */ \
len = RegexLengthToShowInErrorMessages - 10; \
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
*/
#define vFAIL(m) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
Simple_vFAIL(m); \
} STMT_END
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
*/
#define vFAIL2(m,a1) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
Simple_vFAIL2(m, a1); \
} STMT_END
* Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
*/
#define vFAIL3(m,a1,a2) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
Simple_vFAIL3(m, a1, a2); \
} STMT_END
} STMT_END
#define vFAIL4(m,a1,a2,a3) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
Simple_vFAIL4(m, a1, a2, a3); \
} STMT_END
/* A specialized version of vFAIL2 that works with UTF8f */
#define vFAIL2utf8f(m, a1) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
#define vFAIL3utf8f(m, a1, a2) STMT_START { \
- if (!SIZE_ONLY) \
- SAVEFREESV(RExC_rx_sv); \
+ PREPARE_TO_DIE; \
S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
REPORT_LOCATION_ARGS(RExC_parse)); \
} STMT_END
-/* Outputting warnings is generally deferred until the 2nd pass. This is
- * because the first pass can be restarted, for example if the pattern has to
- * be converted to UTF-8. If a warning had already been output earlier in the
- * pass, it would be re-output after the restart. Pass 2 is never restarted,
- * so the problem simply goes away if we defer the output to that pass. See
- * [perl #122671]. */
+/* Setting this to NULL is a signal to not output warnings */
+#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
+#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
+
+/* Since a warning can be generated multiple times as the input is reparsed, we
+ * output it the first time we come to that point in the parse, but suppress it
+ * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
+ * generate any warnings */
+#define TO_OUTPUT_WARNINGS(loc) \
+ ( RExC_copy_start_in_constructed \
+ && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
+
+/* After we've emitted a warning, we save the position in the input so we don't
+ * output it again */
+#define UPDATE_WARNINGS_LOC(loc) \
+ STMT_START { \
+ if (TO_OUTPUT_WARNINGS(loc)) { \
+ RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
+ } \
+ } STMT_END
+
+/* Common driver for raising a warning during the parse.  'loc' is the
+ * position in the pattern that the warning is about; 'warns' is the output of
+ * the packWARNx macro used in 'code'; 'code' is the statement that actually
+ * raises the warning.
+ *
+ * Croaks with a panic if RExC_copy_start_in_constructed is NULL, which is the
+ * signal that no warnings are expected.  Otherwise 'code' is executed only
+ * when TO_OUTPUT_WARNINGS(loc) is true.  In that case PREPARE_TO_DIE is
+ * invoked first if ckDEAD(warns) is true (presumably meaning the warning is
+ * fatal -- confirm), and UPDATE_WARNINGS_LOC(loc) is invoked afterwards to
+ * remember the position. */
#define _WARN_HELPER(loc, warns, code) \
STMT_START { \
- if (PASS2) { \
+ if (! RExC_copy_start_in_constructed) { \
+ Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
+ " expected at '%s'", \
+ __FILE__, __LINE__, loc); \
+ } \
+ if (TO_OUTPUT_WARNINGS(loc)) { \
+ if (ckDEAD(warns)) \
+ PREPARE_TO_DIE; \
code; \
+ UPDATE_WARNINGS_LOC(loc); \
} \
} STMT_END
#define Set_Node_Offset_Length(node,offset,len)
#define ProgLen(ri) ri->u.proglen
#define SetProgLen(ri,x) ri->u.proglen = x
+#define Track_Code(code)
#else
#define ProgLen(ri) ri->u.offsets[0]
#define SetProgLen(ri,x) ri->u.offsets[0] = x
#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
- if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
__LINE__, (int)(offset), (int)(byte))); \
if((offset) < 0) { \
} else { \
RExC_offsets[2*(offset)-1] = (byte); \
} \
- } \
} STMT_END
#define Set_Node_Offset(node,byte) \
#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
#define Set_Node_Length_To_R(node,len) STMT_START { \
- if (! SIZE_ONLY) { \
MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
__LINE__, (int)(node), (int)(len))); \
if((node) < 0) { \
} else { \
RExC_offsets[2*(node)] = (len); \
} \
- } \
} STMT_END
#define Set_Node_Length(node,len) \
Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
} STMT_END
+
+#define Track_Code(code) STMT_START { code } STMT_END
#endif
#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
list */
const U32 max_code_points = (LOC)
? 256
- : (( ! UNI_SEMANTICS
- || invlist_highest(ssc->invlist) < 256)
+ : (( ! RExC_uni_semantics
+ || invlist_highest(ssc->invlist) < 256)
? 128
: NON_OTHER_COUNT);
const U32 max_match = max_code_points / 2;
#endif
switch (flags) {
- case EXACT: case EXACTL: break;
+ case EXACT: case EXACT_ONLY8: case EXACTL: break;
case EXACTFAA:
case EXACTFU_SS:
case EXACTFU:
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT || flags == EXACTL)
+ if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
noper= noper_next;
}
- if ( noper < tail &&
- (
- OP(noper) == flags ||
- (
- flags == EXACTFU &&
- OP(noper) == EXACTFU_SS
- )
- )
- ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
uc= (U8*)STRING(noper);
e= uc + STR_LEN(noper);
} else {
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
noper= noper_next;
}
- if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
+ if ( noper < tail
+ && ( OP(noper) == flags
+ || (flags == EXACT && OP(noper) == EXACT_ONLY8)
+ || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
+ || OP(noper) == EXACTFU_SS))) )
+ {
const U8 *uc= (U8*)STRING(noper);
const U8 *e= uc + STR_LEN(noper);
optimisation.
*/
while( optimize < jumper ) {
-#ifdef RE_TRACK_PATTERN_OFFSETS
- mjd_nodelen += Node_Length((optimize));
-#endif
+ Track_Code( mjd_nodelen += Node_Length((optimize)); );
OP( optimize ) = OPTIMIZED;
Set_Node_Offset_Length(optimize, 0, 0);
optimize++;
U32 flags, regnode *val, U32 depth)
{
/* Merge several consecutive EXACTish nodes into one. */
+
regnode *n = regnext(scan);
U32 stringok = 1;
regnode *next = scan + NODE_SZ_STR(scan);
#endif
DEBUG_PEEP("join", scan, depth, 0);
+ assert(PL_regkind[OP(scan)] == EXACT);
+
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
* EXACT ones that are mergeable to the current one. */
- while (n
- && (PL_regkind[OP(n)] == NOTHING
- || (stringok && OP(n) == OP(scan)))
+ while ( n
+ && ( PL_regkind[OP(n)] == NOTHING
+ || (stringok && PL_regkind[OP(n)] == EXACT))
&& NEXT_OFF(n)
&& NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
{
if (oldl + STR_LEN(n) > U8_MAX)
break;
+ /* Joining something that requires UTF-8 with something that
+ * doesn't, means the result requires UTF-8. */
+ if (OP(scan) == EXACT && (OP(n) == EXACT_ONLY8)) {
+ OP(scan) = EXACT_ONLY8;
+ }
+ else if (OP(scan) == EXACT_ONLY8 && (OP(n) == EXACT)) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_ONLY8)) {
+ OP(scan) = EXACTFU_ONLY8;
+ }
+ else if ((OP(scan) == EXACTFU_ONLY8) && (OP(n) == EXACTFU)) {
+ ; /* join is compatible, no need to change OP */
+ }
+ else if (OP(scan) == EXACTFU) {
+ if (OP(n) != EXACTFU) {
+
+ /* Here the first node is EXACTFU and the second isn't.
+ * Normally EXACTFU nodes are compatible for joining only
+ * with EXACTFU_ONLY8 nodes (already handled), and other
+ * EXACTFU nodes. But under /di, certain temporary
+ * EXACTFS_foo_U nodes are generated, which are compatible.
+ * We check for this case here. These need to be resolved
+ * to either EXACTFU or EXACTF at joining time. They have
+ * nothing in them that would forbid them from being the
+ * more desirable EXACTFU nodes except that they begin
+ * and/or end with a single [Ss]. The reason this is
+ * problematic is because they could be joined in this loop
+ * with an adjacent node that ends and/or begins with [Ss]
+ * which would then form the sequence 'ss', which matches
+ * differently under /di than /ui, in which case EXACTFU
+ * can't be used. If the 'ss' sequence doesn't get formed,
+ * the nodes get absorbed into any adjacent EXACTFU node.
+ * And if the only adjacent node is EXACTF, they get
+ * absorbed into that, under the theory that a longer node
+ * is better than two shorter ones, even if one is EXACTFU.
+ * Note that EXACTFU_ONLY8 is generated only for UTF-8
+ * patterns, and the EXACTFS_foo_U ones only for non-UTF-8.
+ * */
+
+ if (OP(n) == EXACTFS_E_U || OP(n) == EXACTFS_BE_U) {
+
+ /* Here the joined node would end with 's'. If the
+ * node following the combination is an EXACTF one,
+ * it's better to join this EXACTFS_fooE_U with that
+ * one, leaving the current one in 'scan' be the more
+ * desirable EXACTFU */
+ if (OP(nnext) == EXACTF) {
+ break;
+ }
+ OP(scan) = EXACTFS_E_U;
+ }
+ else if (OP(n) != EXACTFS_B_U) {
+ break; /* This would be an incompatible join; stop */
+ }
+ }
+ }
+ else if (OP(scan) == EXACTF) {
+ if (OP(n) != EXACTF) {
+
+ /* Here the first node is EXACTF and the second isn't.
+ * EXACTF nodes are compatible for joining only with other
+ * EXACTF nodes, and the EXACTFS_foo_U nodes. But the
+ * latter nodes can be also joined with EXACTFU ones, and
+ * that is a better outcome, so if the node following 'n'
+ * is EXACTFU, quit now so that those two can be joined
+ * later */
+ if ( OP(n) != EXACTFS_B_U
+ && OP(n) != EXACTFS_E_U
+ && OP(n) != EXACTFS_BE_U)
+ {
+ break;
+ }
+ else if (OP(nnext) == EXACTFU) {
+ break;
+ }
+ else {
+ /* Here the next node can be joined with the EXACTF
+ * node, and become part of it. That they begin or end
+ * with 's' now doesn't matter. */
+ }
+ }
+ }
+ else if (OP(scan) == EXACTFS_B_U) {
+
+ /* Here, the first node begins, but does not end with 's'.
+ * That means it doesn't form 'ss' with the following node, so
+ * can become EXACTFU, and either stand on its own or be joined
+ * with a following EXACTFU. If the following is instead an
+ * EXACTF, the two can also be joined together as EXACTF */
+ if (OP(n) == EXACTF) {
+ OP(scan) = EXACTF;
+ }
+ else {
+ OP(scan) = EXACTFU;
+ if (OP(n) != EXACTFU) {
+ break;
+ }
+ }
+ }
+ else if (OP(scan) == EXACTFS_E_U || OP(scan) == EXACTFS_BE_U) {
+
+ /* Here, the first node ends with 's', and could become an
+ * EXACTFU (or be joined with a following EXACTFU) if that next
+ * node doesn't begin with 's'; otherwise it must become an
+ * EXACTF node. */
+ if (OP(n) == EXACTFS_B_U || OP(n) == EXACTFS_BE_U) {
+ OP(scan) = EXACTF;
+ }
+ else {
+ OP(scan) = EXACTFU;
+ if (OP(n) != EXACTFU) {
+ break;
+ }
+ }
+ }
+ else if (OP(scan) != OP(n)) {
+
+ /* The only other compatible joinings are the same node type */
+ break;
+ }
+
DEBUG_PEEP("merg", n, depth, 0);
merged++;
#endif
}
+ /* These temporary nodes can now be turned into EXACTFU, and must, as
+ * regexec.c doesn't handle them */
+ if ( OP(scan) == EXACTFS_B_U
+ || OP(scan) == EXACTFS_E_U
+ || OP(scan) == EXACTFS_BE_U)
+ {
+ OP(scan) = EXACTFU;
+ }
+
*min_subtract = 0;
*unfolded_multi_char = FALSE;
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT && OP(scan) != EXACTL) {
+ if (OP(scan) != EXACT && OP(scan) != EXACT_ONLY8 && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
U32 i;
U32 j;
for ( j = 0 ; j < recursed_depth ; j++ ) {
- for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
+ for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
if (
PAREN_TEST(RExC_study_chunk_recursed +
( j * RExC_study_chunk_recursed_bytes), i )
----------------+-----------
NOTHING | NOTHING
EXACT | EXACT
+ EXACT_ONLY8 | EXACT
EXACTFU | EXACTFU
+ EXACTFU_ONLY8 | EXACTFU
EXACTFU_SS | EXACTFU
- EXACTFAA | EXACTFAA
+ EXACTFAA | EXACTFAA
EXACTL | EXACTL
EXACTFLU8 | EXACTFLU8
*/
#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
? NOTHING \
- : ( EXACT == (X) ) \
+ : ( EXACT == (X) || EXACT_ONLY8 == (X) ) \
? EXACT \
- : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
+ : ( EXACTFU == (X) \
+ || EXACTFU_ONLY8 == (X) \
+ || EXACTFU_SS == (X) ) \
? EXACTFU \
- : ( EXACTFAA == (X) ) \
- ? EXACTFAA \
+ : ( EXACTFAA == (X) ) \
+ ? EXACTFAA \
: ( EXACTL == (X) ) \
? EXACTL \
- : ( EXACTFLU8 == (X) ) \
- ? EXACTFLU8 \
+ : ( EXACTFLU8 == (X) ) \
+ ? EXACTFLU8 \
: 0 )
/* dont use tail as the end marker for this traverse */
continue;
}
}
- else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
+ else if ( OP(scan) == EXACT
+ || OP(scan) == EXACT_ONLY8
+ || OP(scan) == EXACTL)
+ {
SSize_t l = STR_LEN(scan);
UV uc;
assert(l);
case PLUS:
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
- if (OP(next) == EXACT
+ if ( OP(next) == EXACT
+ || OP(next) == EXACT_ONLY8
|| OP(next) == EXACTL
|| (flags & SCF_DO_STCLASS))
{
min++;
/* FALLTHROUGH */
case STAR:
+ next = NEXTOPER(scan);
+
+ /* These temporary nodes can now be turned into EXACTFU, and
+ * must, as regexec.c doesn't handle them */
+ if ( OP(next) == EXACTFS_B_U
+ || OP(next) == EXACTFS_E_U
+ || OP(next) == EXACTFS_BE_U)
+ {
+ OP(next) = EXACTFU;
+ }
+
if (flags & SCF_DO_STCLASS) {
mincount = 0;
maxcount = REG_INFTY;
&& maxcount <= REG_INFTY/3) /* Complement check for big
count */
{
- /* Fatal warnings may leak the regexp without this: */
- SAVEFREESV(RExC_rx_sv);
- Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
- "Quantifier unexpected on zero-length expression "
- "in regex m/%" UTF8f "/",
- UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
- RExC_precomp));
- (void)ReREFCNT_inc(RExC_rx_sv);
- }
+ _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
+ "Quantifier unexpected on zero-length expression "
+ "in regex m/%" UTF8f "/",
+ UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
+ RExC_precomp)));
+ }
min += minnext * mincount;
is_inf_internal |= deltanext == SSize_t_MAX
(regnode_charclass *) scan);
break;
+ case NANYOFM:
case ANYOFM:
{
SV* cp_list = get_ANYOFM_contents(scan);
if (flags & SCF_DO_STCLASS_OR) {
- ssc_union(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_union(data->start_class, cp_list, invert);
}
else if (flags & SCF_DO_STCLASS_AND) {
- ssc_intersection(data->start_class,
- cp_list,
- FALSE /* don't invert */
- );
+ ssc_intersection(data->start_class, cp_list, invert);
}
SvREFCNT_dec_NN(cp_list);
return;
for (n = 0; n < cbs->count; n++) {
REGEXP *rx = cbs->cb[n].src_regex;
- cbs->cb[n].src_regex = NULL;
- SvREFCNT_dec(rx);
+ if (rx) {
+ cbs->cb[n].src_regex = NULL;
+ SvREFCNT_dec_NN(rx);
+ }
}
Safefree(cbs->cb);
Safefree(cbs);
DEBUG_PARSE_r(Perl_re_printf( aTHX_
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- Newx(dst, *plen_p * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(dst, *plen_p + variant_under_utf8_count(src, src + *plen_p) + 1, U8);
d = dst;
while (s < *plen_p) {
&& n < pRExC_state->code_blocks->count
&& s == pRExC_state->code_blocks->cb[n].start)
{
- /* blank out literal code block */
- assert(pat[s] == '(');
- while (s <= pRExC_state->code_blocks->cb[n].end) {
- *p++ = '_';
+ /* blank out literal code blocks so that they aren't
+ * recompiled: eg change from/to:
+ * /(?{xyz})/
+ * /(?=====)/
+ * and
+ * /(??{xyz})/
+ * /(?======)/
+ * and
+ * /(?(?{xyz}))/
+ * /(?(?=====))/
+ */
+ assert(pat[s] == '(');
+ assert(pat[s+1] == '?');
+ *p++ = '(';
+ *p++ = '?';
+ s += 2;
+ while (s < pRExC_state->code_blocks->cb[n].end) {
+ *p++ = '=';
s++;
}
- s--;
+ *p++ = ')';
n++;
continue;
}
return TRUE;
}
+STATIC void
+S_set_regex_pv(pTHX_ RExC_state_t *pRExC_state, REGEXP *Rx)
+{
+    /* Calculates and sets in the compiled pattern 'Rx' the string to compile,
+     * properly wrapped with the right modifiers.  The result has the form
+     *
+     *      (?  [^]  [charset]  [p]  [std flags]  :  pattern  [\n]  )
+     *
+     * where each bracketed piece is emitted only when needed.  As side
+     * effects, 'Rx' is grown to hold the string, gets SvPOK turned on (and
+     * SVf_UTF8 when RExC_utf8 is set), has its SvCUR set to the length of the
+     * string, and RExC_rx->pre_prefix is set to the number of bytes that
+     * precede the pattern text within the string. */
+
+    bool has_p = ((RExC_rx->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+    bool has_charset = RExC_utf8 || (get_regex_charset(RExC_rx->extflags)
+                                                != REGEX_DEPENDS_CHARSET);
+
+    /* The caret is output if there are any defaults: if not all the STD
+     * flags are set, or if no character set specifier is needed */
+    bool has_default =
+                (((RExC_rx->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
+                || ! has_charset);
+    bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
+                                                == REG_RUN_ON_COMMENT_SEEN);
+    U8 reganch = (U8)((RExC_rx->extflags & RXf_PMf_STD_PMMOD)
+                                            >> RXf_PMf_STD_PMMOD_SHIFT);
+    const char *fptr = STD_PAT_MODS;        /*"msixxn"*/
+    char *p;
+    STRLEN pat_len = RExC_precomp_end - RExC_precomp;
+
+    /* We output all the necessary flags; we never output a minus, as all
+     * those are defaults, so are covered by the caret */
+    const STRLEN wraplen = pat_len + has_p + has_runon
+        + has_default       /* If needs a caret */
+        + PL_bitcount[reganch] /* 1 char for each set standard flag */
+
+            /* If needs a character set specifier */
+        + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
+        + (sizeof("(?:)") - 1);
+
+    PERL_ARGS_ASSERT_SET_REGEX_PV;
+
+    /* make sure PL_bitcount bounds not exceeded */
+    assert(sizeof(STD_PAT_MODS) <= 8);
+
+    p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
+    SvPOK_on(Rx);
+    if (RExC_utf8)
+        SvFLAGS(Rx) |= SVf_UTF8;
+    *p++='('; *p++='?';
+
+    /* If a default, cover it using the caret */
+    if (has_default) {
+        *p++= DEFAULT_PAT_MOD;
+    }
+    if (has_charset) {
+        STRLEN len;
+        const char* name;
+
+        name = get_regex_charset_name(RExC_rx->extflags, &len);
+
+        /* The condition is explicitly parenthesized so that this does not
+         * depend on how the strEQ macro happens to expand */
+        if (strEQ(name, DEPENDS_PAT_MODS)) {  /* /d under UTF-8 => /u */
+            assert(RExC_utf8);
+            name = UNICODE_PAT_MODS;
+            len = sizeof(UNICODE_PAT_MODS) - 1;
+        }
+        Copy(name, p, len, char);
+        p += len;
+    }
+    if (has_p)
+        *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
+    {
+        /* One character for each standard flag whose bit is set in
+         * 'reganch', walking the flag letters and the bits in step */
+        char ch;
+        while((ch = *fptr++)) {
+            if(reganch & 1)
+                *p++ = ch;
+            reganch >>= 1;
+        }
+    }
+
+    *p++ = ':';
+    Copy(RExC_precomp, p, pat_len, char);
+
+    /* 'p' now points at the start of the pattern text, so its distance from
+     * the start of the string is the length of the "(?...:" prefix.  Check
+     * that distance (not its negation, which could never fail the test)
+     * before storing it.  NOTE(review): the limit of 16 presumably reflects
+     * the width of the pre_prefix field -- confirm against its declaration */
+    assert ((p - RX_WRAPPED(Rx)) < 16);
+    RExC_rx->pre_prefix = p - RX_WRAPPED(Rx);
+    p += pat_len;
+
+    /* Adding a trailing \n causes this to compile properly:
+            my $R = qr / A B C # D E/x; /($R)/
+       Otherwise the parens are considered part of the comment */
+    if (has_runon)
+        *p++ = '\n';
+    *p++ = ')';
+    *p = 0;
+    SvCUR_set(Rx, p - RX_WRAPPED(Rx));
+}
+
+
/*
* Perl_re_op_compile - the perl internal RE engine's function to compile a
* regular expression into internal code.
* pm_flags field of the related PMOP. Currently we're only interested in
* PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
*
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code. So we cheat: we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it. (Note that it has to be in
- * one piece because free() must be able to free it all.) [NB: not true in perl]
+ * For many years this code had an initial sizing pass that calculated
+ * (sometimes incorrectly, leading to security holes) the size needed for the
+ * compiled pattern. That was changed by commit
+ * 7c932d07cab18751bfc7515b4320436273a459e2 in 5.29, which reallocs the size, a
+ * node at a time, as parsing goes along. Patches welcome to fix any obsolete
+ * references to this sizing pass.
+ *
+ * Now, an initial crude guess as to the size needed is made, based on the
+ * length of the pattern. Patches welcome to improve that guess. That amount
+ * of space is malloc'd and then immediately freed, and then clawed back node
+ * by node. This design is to minimize, to the extent possible, memory churn
+ * when doing the reallocs.
+ *
+ * A separate parentheses counting pass may be needed in some cases.
+ * (Previously the sizing pass did this.) Patches welcome to reduce the number
+ * of these cases.
+ *
+ * The existence of a sizing pass necessitated design decisions that are no
+ * longer needed. There are potential areas of simplification.
*
* Beware that the optimization-preparation code in here knows about some
* of the structure of the compiled regexp. [I'll say.]
bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags)
{
REGEXP *Rx; /* Capital 'R' means points to a REGEXP */
- struct regexp *r;
STRLEN plen;
char *exp;
regnode *scan;
/* ignore the utf8ness if the pattern is 0 length */
RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
- RExC_uni_semantics = 0;
- RExC_seen_unfolded_sharp_s = 0;
+ RExC_uni_semantics = RExC_utf8; /* UTF-8 implies unicode semantics;
+ otherwise we may find later this should
+ be 1 */
RExC_contains_locale = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
RExC_in_script_run = 0;
RExC_frame_head= NULL;
RExC_frame_last= NULL;
RExC_frame_count= 0;
+ RExC_latest_warn_offset = 0;
+ RExC_use_BRANCHJ = 0;
+ RExC_total_parens = 0;
+ RExC_open_parens = NULL;
+ RExC_close_parens = NULL;
+ RExC_paren_names = NULL;
+ RExC_size = 0;
+ RExC_seen_d_op = FALSE;
+#ifdef DEBUGGING
+ RExC_paren_name_list = NULL;
+#endif
DEBUG_r({
RExC_mysv1= sv_newmortal();
PL_colors[4], PL_colors[5], s);
});
- redo_parse:
/* we jump here if we have to recompile, e.g., from upgrading the pattern
* to utf8 */
)
runtime_code = S_has_runtime_code(aTHX_ pRExC_state, exp, plen);
+ redo_parse:
/* return old regex if pattern hasn't changed */
/* XXX: note in the below we have to check the flags as well as the
* pattern.
return old_re;
}
+ /* Allocate the pattern's SV */
+ RExC_rx_sv = Rx = (REGEXP*) newSV_type(SVt_REGEXP);
+ RExC_rx = ReANY(Rx);
+ if ( RExC_rx == NULL )
+ FAIL("Regexp out of space");
+
rx_flags = orig_rx_flags;
- if ( initial_charset == REGEX_DEPENDS_CHARSET
- && (RExC_utf8 ||RExC_uni_semantics))
- {
+ if (initial_charset == REGEX_DEPENDS_CHARSET && RExC_uni_semantics) {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
}
- RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
- RExC_flags = rx_flags;
RExC_pm_flags = pm_flags;
if (runtime_code) {
RExC_maxlen = 0;
RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
- RExC_extralen = 0;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
RExC_in_multi_char_class = 0;
- /* First pass: determine size, legality. */
- RExC_pass1 = TRUE;
- RExC_parse = exp;
- RExC_start = RExC_copy_start_in_constructed = exp;
- RExC_end = exp + plen;
- RExC_precomp_end = RExC_end;
- RExC_naughty = 0;
- RExC_npar = 1;
+ RExC_start = RExC_copy_start_in_constructed = RExC_copy_start_in_input = RExC_precomp = exp;
+ RExC_precomp_end = RExC_end = exp + plen;
RExC_nestroot = 0;
- RExC_size = 0L;
- RExC_emit = 1;
RExC_whilem_seen = 0;
- RExC_open_parens = 0;
- RExC_close_parens = 0;
RExC_end_op = NULL;
- RExC_paren_names = NULL;
-#ifdef DEBUGGING
- RExC_paren_name_list = NULL;
-#endif
RExC_recurse = NULL;
RExC_study_chunk_recursed = NULL;
RExC_study_chunk_recursed_bytes= 0;
RExC_recurse_count = 0;
pRExC_state->code_index = 0;
- /* We allocate scratch space as large as the largest node, for use in the
- * first pass. Since many functions return RExC_emit on success, and '0'
- * if an error, RExC_emit must never be 0, so we set it to 1 and double
- * the scratch space */
- Newxc(RExC_emit_start, 2 * sizeof(regnode_ssc), char, regnode);
- SAVEFREEPV(RExC_emit_start);
+ /* Initialize the string in the compiled pattern. This is so that there is
+ * something to output if necessary */
+ set_regex_pv(pRExC_state, Rx);
+
+ DEBUG_PARSE_r({
+ Perl_re_printf( aTHX_
+ "Starting parse and generation\n");
+ RExC_lastnum=0;
+ RExC_lastparse=NULL;
+ });
+
+ /* Allocate space and zero-initialize. The whole allocation is zeroed
+ below, so anything assigned has to
+ happen after that */
+ if (! RExC_size) {
+
+ /* On the first pass of the parse, we guess how big this will be. Then
+ * we grow in one operation to that amount and then give it back. As
+ * we go along, we re-allocate what we need.
+ *
+ * XXX Currently the guess is essentially that the pattern will be an
+ * EXACT node with one byte input, one byte output. This is crude, and
+ * better heuristics are welcome.
+ *
+ * On any subsequent passes, we guess what we actually computed in the
+ * latest earlier pass. Such a pass probably didn't complete so is
+ * missing stuff. We could improve those guesses by knowing where the
+ * parse stopped, and use the length so far plus apply the above
+ * assumption to what's left. */
+ RExC_size = STR_SZ(RExC_end - RExC_start);
+ }
+
+ Newxc(RExC_rxi, sizeof(regexp_internal) + RExC_size, char, regexp_internal);
+ if ( RExC_rxi == NULL )
+ FAIL("Regexp out of space");
+
+ Zero(RExC_rxi, sizeof(regexp_internal) + RExC_size, char);
+ RXi_SET( RExC_rx, RExC_rxi );
+
+ /* We start from 0 (or start over from 0 in the case this is a reparse).
+ * The first node parsed will give back any excess memory we have allocated
+ * so far. */
+ RExC_size = 0;
+
+ /* non-zero initialization begins here */
+ RExC_rx->engine= eng;
+ RExC_rx->extflags = rx_flags;
+ RXp_COMPFLAGS(RExC_rx) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+
+ if (pm_flags & PMf_IS_QR) {
+ RExC_rxi->code_blocks = pRExC_state->code_blocks;
+ if (RExC_rxi->code_blocks) {
+ RExC_rxi->code_blocks->refcnt++;
+ }
+ }
+
+ RExC_rx->intflags = 0;
+
+ RExC_flags = rx_flags; /* don't let top level (?i) bleed */
+ RExC_parse = exp;
/* This NUL is guaranteed because the pattern comes from an SV*, and the sv
* code makes sure the final byte is an uncounted NUL. But should this
* etc. So it is worth noting. */
assert(*RExC_end == '\0');
- DEBUG_PARSE_r(
- Perl_re_printf( aTHX_ "Starting first pass (sizing)\n");
- RExC_lastnum=0;
- RExC_lastparse=NULL;
- );
+ RExC_naughty = 0;
+ RExC_npar = 1;
+ RExC_emit_start = RExC_rxi->program;
+ pRExC_state->code_index = 0;
+
+ *((char*) RExC_emit_start) = (char) REG_MAGIC;
+ RExC_emit = 1;
+
+ /* Do the parse */
+ if (reg(pRExC_state, 0, &flags, 1)) {
+
+ /* Success! But if RExC_total_parens < 0, we need to redo the parse
+ * knowing how many parens there actually are */
+ if (RExC_total_parens < 0) {
+ flags |= RESTART_PARSE;
+ }
+
+ /* We have that number in RExC_npar */
+ RExC_total_parens = RExC_npar;
+ }
+ else if (! MUST_RESTART(flags)) {
+ ReREFCNT_dec(Rx);
+ Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile, flags=%#" UVxf, (UV) flags);
+ }
+
+ /* Here, we either have success, or we have to redo the parse for some reason */
+ if (MUST_RESTART(flags)) {
- if (reg(pRExC_state, 0, &flags, 1) == 0) {
/* It's possible to write a regexp in ascii that represents Unicode
codepoints outside of the byte range, such as via \x{100}. If we
detect such a sequence we have to convert the entire pattern to utf8
at least some part of the pattern, and therefore must convert the whole
thing.
-- dmq */
- if (MUST_RESTART(flags)) {
- if (flags & NEED_UTF8) {
- S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
- pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
- DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
+ if (flags & NEED_UTF8) {
+
+ /* We have stored the offset of the final warning output so far.
+ * That must be adjusted. Any variant characters between the start
+ * of the pattern and this warning count for 2 bytes in the final,
+ * so just add them again */
+ if (UNLIKELY(RExC_latest_warn_offset > 0)) {
+ RExC_latest_warn_offset +=
+ variant_under_utf8_count((U8 *) exp, (U8 *) exp
+ + RExC_latest_warn_offset);
}
- else {
- DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
- }
-
- goto redo_parse;
+ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen,
+ pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0);
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse after upgrade\n"));
+ }
+ else {
+ DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n"));
}
- Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags);
- }
-
- DEBUG_PARSE_r({
- Perl_re_printf( aTHX_
- "Required size %" IVdf " nodes\n"
- "Starting second pass (creation)\n",
- (IV)RExC_size);
- RExC_lastnum=0;
- RExC_lastparse=NULL;
- });
-
- /* The first pass could have found things that force Unicode semantics */
- if ((RExC_utf8 || RExC_uni_semantics)
- && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
- {
- set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
- }
- /* Small enough for pointer-storage convention?
- If extralen==0, this means that we will not need long jumps. */
- if (RExC_size >= 0x10000L && RExC_extralen)
- RExC_size += RExC_extralen;
- else
- RExC_extralen = 0;
- if (RExC_whilem_seen > 15)
- RExC_whilem_seen = 15;
+ if (RExC_total_parens > 0) {
+ /* Make enough room for all the known parens, and zero it */
+ Renew(RExC_open_parens, RExC_total_parens, regnode_offset);
+ Zero(RExC_open_parens, RExC_total_parens, regnode_offset);
+ RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
- /* Allocate space and zero-initialize. Note, the two step process
- of zeroing when in debug mode, thus anything assigned has to
- happen after that */
- Rx = (REGEXP*) newSV_type(SVt_REGEXP);
- r = ReANY(Rx);
- Newxc(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
- char, regexp_internal);
- if ( r == NULL || RExC_rxi == NULL )
- FAIL("Regexp out of space");
-#ifdef DEBUGGING
- /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
- Zero(RExC_rxi, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
- char);
-#else
- /* bulk initialize base fields with 0. */
- Zero(RExC_rxi, sizeof(regexp_internal), char);
-#endif
+ Renew(RExC_close_parens, RExC_total_parens, regnode_offset);
+ Zero(RExC_close_parens, RExC_total_parens, regnode_offset);
+ }
+ else { /* Parse did not complete. Reinitialize the parentheses
+ structures */
+ RExC_total_parens = 0;
+ if (RExC_open_parens) {
+ Safefree(RExC_open_parens);
+ RExC_open_parens = NULL;
+ }
+ if (RExC_close_parens) {
+ Safefree(RExC_close_parens);
+ RExC_close_parens = NULL;
+ }
+ }
- /* non-zero initialization begins here */
- RXi_SET( r, RExC_rxi );
- r->engine= eng;
- r->extflags = rx_flags;
- RXp_COMPFLAGS(r) = orig_rx_flags & RXf_PMf_FLAGCOPYMASK;
+ /* Clean up what we did in this parse */
+ SvREFCNT_dec_NN(RExC_rx_sv);
- if (pm_flags & PMf_IS_QR) {
- RExC_rxi->code_blocks = pRExC_state->code_blocks;
- if (RExC_rxi->code_blocks)
- RExC_rxi->code_blocks->refcnt++;
+ goto redo_parse;
}
- {
- bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
- bool has_charset = (get_regex_charset(r->extflags)
- != REGEX_DEPENDS_CHARSET);
-
- /* The caret is output if there are any defaults: if not all the STD
- * flags are set, or if no character set specifier is needed */
- bool has_default =
- (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
- || ! has_charset);
- bool has_runon = ((RExC_seen & REG_RUN_ON_COMMENT_SEEN)
- == REG_RUN_ON_COMMENT_SEEN);
- U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD)
- >> RXf_PMf_STD_PMMOD_SHIFT);
- const char *fptr = STD_PAT_MODS; /*"msixxn"*/
- char *p;
-
- /* We output all the necessary flags; we never output a minus, as all
- * those are defaults, so are
- * covered by the caret */
- const STRLEN wraplen = plen + has_p + has_runon
- + has_default /* If needs a caret */
- + PL_bitcount[reganch] /* 1 char for each set standard flag */
-
- /* If needs a character set specifier */
- + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
- + (sizeof("(?:)") - 1);
-
- /* make sure PL_bitcount bounds not exceeded */
- assert(sizeof(STD_PAT_MODS) <= 8);
-
- p = sv_grow(MUTABLE_SV(Rx), wraplen + 1); /* +1 for the ending NUL */
- SvPOK_on(Rx);
- if (RExC_utf8)
- SvFLAGS(Rx) |= SVf_UTF8;
- *p++='('; *p++='?';
-
- /* If a default, cover it using the caret */
- if (has_default) {
- *p++= DEFAULT_PAT_MOD;
- }
- if (has_charset) {
- STRLEN len;
- const char* const name = get_regex_charset_name(r->extflags, &len);
- Copy(name, p, len, char);
- p += len;
- }
- if (has_p)
- *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
- {
- char ch;
- while((ch = *fptr++)) {
- if(reganch & 1)
- *p++ = ch;
- reganch >>= 1;
- }
- }
+ /* Here, we have successfully parsed and generated the pattern's program
+ * for the regex engine. We are ready to finish things up and look for
+ * optimizations. */
- *p++ = ':';
- Copy(RExC_precomp, p, plen, char);
- assert ((RX_WRAPPED(Rx) - p) < 16);
- r->pre_prefix = p - RX_WRAPPED(Rx);
- p += plen;
+ /* Update the string to compile, with correct modifiers, etc */
+ set_regex_pv(pRExC_state, Rx);
- /* Adding a trailing \n causes this to compile properly:
- my $R = qr / A B C # D E/x; /($R)/
- Otherwise the parens are considered part of the comment */
- if (has_runon)
- *p++ = '\n';
- *p++ = ')';
- *p = 0;
- SvCUR_set(Rx, p - RX_WRAPPED(Rx));
- }
+ RExC_rx->nparens = RExC_total_parens - 1;
+
+ /* Uses the upper 4 bits of the FLAGS field, so keep within that size */
+ if (RExC_whilem_seen > 15)
+ RExC_whilem_seen = 15;
- r->intflags = 0;
- r->nparens = RExC_npar - 1; /* set early to validate backrefs */
+ DEBUG_PARSE_r({
+ Perl_re_printf( aTHX_
+ "Required size %" IVdf " nodes\n", (IV)RExC_size);
+ RExC_lastnum=0;
+ RExC_lastparse=NULL;
+ });
- /* Useful during FAIL. */
#ifdef RE_TRACK_PATTERN_OFFSETS
- Newxz(RExC_offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
DEBUG_OFFSETS_r(Perl_re_printf( aTHX_
"%s %" UVuf " bytes for offset annotations.\n",
RExC_offsets ? "Got" : "Couldn't get",
- (UV)((2*RExC_size+1) * sizeof(U32))));
-#endif
- SetProgLen(RExC_rxi, RExC_size);
- RExC_rx_sv = Rx;
- RExC_rx = r;
+ (UV)((RExC_offsets[0] * 2 + 1))));
+ DEBUG_OFFSETS_r(if (RExC_offsets) {
+ const STRLEN len = RExC_offsets[0];
+ STRLEN i;
+ GET_RE_DEBUG_FLAGS_DECL;
+ Perl_re_printf( aTHX_
+ "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
+ for (i = 1; i <= len; i++) {
+ if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
+ Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
+ (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
+ }
+ Perl_re_printf( aTHX_ "\n");
+ });
- /* Second pass: emit code. */
- RExC_pass1 = FALSE;
- RExC_flags = rx_flags; /* don't let top level (?i) bleed */
- RExC_pm_flags = pm_flags;
- RExC_parse = exp;
- RExC_end = exp + plen;
- RExC_naughty = 0;
- RExC_emit_start = RExC_rxi->program;
- RExC_emit = 1;
- RExC_emit_bound = RExC_rxi->program + RExC_size + 1;
- pRExC_state->code_index = 0;
+#else
+ SetProgLen(RExC_rxi,RExC_size);
+#endif
- *((char*) RExC_emit_start) = (char) REG_MAGIC;
- /* setup various meta data about recursion, this all requires
- * RExC_npar to be correctly set, and a bit later on we clear it */
- if (RExC_seen & REG_RECURSE_SEEN) {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting up open/close parens\n",
- 22, "| |", (int)(0 * 2 + 1), ""));
-
- /* setup RExC_open_parens, which holds the address of each
- * OPEN tag, and to make things simpler for the 0 index
- * the start of the program - this is used later for offsets */
- Newxz(RExC_open_parens, RExC_npar, regnode_offset);
- SAVEFREEPV(RExC_open_parens);
- RExC_open_parens[0] = RExC_emit;
-
- /* setup RExC_close_parens, which holds the address of each
- * CLOSE tag, and to make things simpler for the 0 index
- * the end of the program - this is used later for offsets */
- Newxz(RExC_close_parens, RExC_npar, regnode_offset);
- SAVEFREEPV(RExC_close_parens);
- /* we dont know where end op starts yet, so we dont
- * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */
-
- /* Note, RExC_npar is 1 + the number of parens in a pattern.
- * So its 1 if there are no parens. */
- RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) +
- ((RExC_npar & 0x07) != 0);
- Newx(RExC_study_chunk_recursed,
- RExC_study_chunk_recursed_bytes * RExC_npar, U8);
- SAVEFREEPV(RExC_study_chunk_recursed);
- }
- RExC_npar = 1;
- if (reg(pRExC_state, 0, &flags, 1) == 0) {
- ReREFCNT_dec(Rx);
- Perl_croak(aTHX_ "panic: reg returned failure to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags);
- }
DEBUG_OPTIMISE_r(
Perl_re_printf( aTHX_ "Starting post parse optimization\n");
);
/* XXXX To minimize changes to RE engine we always allocate
3-units-long substrs field. */
- Newx(r->substrs, 1, struct reg_substr_data);
+ Newx(RExC_rx->substrs, 1, struct reg_substr_data);
if (RExC_recurse_count) {
Newx(RExC_recurse, RExC_recurse_count, regnode *);
SAVEFREEPV(RExC_recurse);
}
+ if (RExC_seen & REG_RECURSE_SEEN) {
+ /* Note, RExC_total_parens is 1 + the number of parens in a pattern.
+ * So it's 1 if there are no parens. */
+ RExC_study_chunk_recursed_bytes= (RExC_total_parens >> 3) +
+ ((RExC_total_parens & 0x07) != 0);
+ Newx(RExC_study_chunk_recursed,
+ RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
+ SAVEFREEPV(RExC_study_chunk_recursed);
+ }
+
reStudy:
- r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
+ RExC_rx->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
DEBUG_r(
RExC_study_chunk_recursed_count= 0;
);
- Zero(r->substrs, 1, struct reg_substr_data);
+ Zero(RExC_rx->substrs, 1, struct reg_substr_data);
if (RExC_study_chunk_recursed) {
Zero(RExC_study_chunk_recursed,
- RExC_study_chunk_recursed_bytes * RExC_npar, U8);
+ RExC_study_chunk_recursed_bytes * RExC_total_parens, U8);
}
#endif
/* Dig out information for optimizations. */
- r->extflags = RExC_flags; /* was pm_op */
+ RExC_rx->extflags = RExC_flags; /* was pm_op */
/*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
if (UTF)
SvUTF8_on(Rx); /* Unicode in it? */
RExC_rxi->regstclass = NULL;
if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
- r->intflags |= PREGf_NAUGHTY;
+ RExC_rx->intflags |= PREGf_NAUGHTY;
scan = RExC_rxi->program + 1; /* First BRANCH. */
/* testing for BRANCH here tells us whether there is "must appear"
DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
- if (OP(first) == EXACT || OP(first) == EXACTL)
+ if ( OP(first) == EXACT
+ || OP(first) == EXACT_ONLY8
+ || OP(first) == EXACTL)
+ {
NOOP; /* Empty, get anchored substr later. */
+ }
else
RExC_rxi->regstclass = first;
}
PL_regkind[OP(first)] == NBOUND)
RExC_rxi->regstclass = first;
else if (PL_regkind[OP(first)] == BOL) {
- r->intflags |= (OP(first) == MBOL
+ RExC_rx->intflags |= (OP(first) == MBOL
? PREGf_ANCH_MBOL
: PREGf_ANCH_SBOL);
first = NEXTOPER(first);
goto again;
}
else if (OP(first) == GPOS) {
- r->intflags |= PREGf_ANCH_GPOS;
+ RExC_rx->intflags |= PREGf_ANCH_GPOS;
first = NEXTOPER(first);
goto again;
}
!sawlookahead &&
(OP(first) == STAR &&
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
- !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
+ !(RExC_rx->intflags & PREGf_ANCH) && !pRExC_state->code_blocks)
{
/* turn .* into ^.* with an implied $*=1 */
const int type =
(OP(NEXTOPER(first)) == REG_ANY)
? PREGf_ANCH_MBOL
: PREGf_ANCH_SBOL;
- r->intflags |= (type | PREGf_IMPLICIT);
+ RExC_rx->intflags |= (type | PREGf_IMPLICIT);
first = NEXTOPER(first);
goto again;
}
&& (!sawopen || !RExC_sawback)
&& !pRExC_state->code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
- r->intflags |= PREGf_SKIP;
+ RExC_rx->intflags |= PREGf_SKIP;
/* Scan is after the zeroth branch, first is atomic matcher. */
#ifdef TRIE_STUDY_OPT
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
- if ( RExC_npar == 1 && !data.cur_is_floating
+ if ( RExC_total_parens == 1 && !data.cur_is_floating
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_VERBARG_SEEN)
&& !(RExC_seen & REG_GPOS_SEEN)
){
- r->extflags |= RXf_CHECK_ALL;
+ RExC_rx->extflags |= RXf_CHECK_ALL;
}
scan_commit(pRExC_state, &data,&minlen, 0);
== SvCUR(data.substrs[1].str)
)
&& S_setup_longest (aTHX_ pRExC_state,
- &(r->substrs->data[i]),
+ &(RExC_rx->substrs->data[i]),
&(data.substrs[i]),
longest_length[i]))
{
- r->substrs->data[i].min_offset =
+ RExC_rx->substrs->data[i].min_offset =
data.substrs[i].min_offset - data.substrs[i].lookbehind;
- r->substrs->data[i].max_offset = data.substrs[i].max_offset;
+ RExC_rx->substrs->data[i].max_offset = data.substrs[i].max_offset;
/* Don't offset infinity */
if (data.substrs[i].max_offset < SSize_t_MAX)
- r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
+ RExC_rx->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
}
else {
- r->substrs->data[i].substr = NULL;
- r->substrs->data[i].utf8_substr = NULL;
+ RExC_rx->substrs->data[i].substr = NULL;
+ RExC_rx->substrs->data[i].utf8_substr = NULL;
longest_length[i] = 0;
}
}
&& (OP(RExC_rxi->regstclass) == REG_ANY || OP(RExC_rxi->regstclass) == SANY))
RExC_rxi->regstclass = NULL;
- if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
- || r->substrs->data[0].min_offset)
+ if ((!(RExC_rx->substrs->data[0].substr || RExC_rx->substrs->data[0].utf8_substr)
+ || RExC_rx->substrs->data[0].min_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
(regnode_ssc*)RExC_rxi->data->data[n],
regnode_ssc);
RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
- r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
+ RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
+ regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
Perl_re_printf( aTHX_
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
/* A temporary algorithm prefers floated substr to fixed one of
* same length to dig more info. */
i = (longest_length[0] <= longest_length[1]);
- r->substrs->check_ix = i;
- r->check_end_shift = r->substrs->data[i].end_shift;
- r->check_substr = r->substrs->data[i].substr;
- r->check_utf8 = r->substrs->data[i].utf8_substr;
- r->check_offset_min = r->substrs->data[i].min_offset;
- r->check_offset_max = r->substrs->data[i].max_offset;
- if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
- r->intflags |= PREGf_NOSCAN;
-
- if ((r->check_substr || r->check_utf8) ) {
- r->extflags |= RXf_USE_INTUIT;
- if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
- r->extflags |= RXf_INTUIT_TAIL;
+ RExC_rx->substrs->check_ix = i;
+ RExC_rx->check_end_shift = RExC_rx->substrs->data[i].end_shift;
+ RExC_rx->check_substr = RExC_rx->substrs->data[i].substr;
+ RExC_rx->check_utf8 = RExC_rx->substrs->data[i].utf8_substr;
+ RExC_rx->check_offset_min = RExC_rx->substrs->data[i].min_offset;
+ RExC_rx->check_offset_max = RExC_rx->substrs->data[i].max_offset;
+ if (!i && (RExC_rx->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
+ RExC_rx->intflags |= PREGf_NOSCAN;
+
+ if ((RExC_rx->check_substr || RExC_rx->check_utf8) ) {
+ RExC_rx->extflags |= RXf_USE_INTUIT;
+ if (SvTAIL(RExC_rx->check_substr ? RExC_rx->check_substr : RExC_rx->check_utf8))
+ RExC_rx->extflags |= RXf_INTUIT_TAIL;
}
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
CHECK_RESTUDY_GOTO_butfirst(NOOP);
- r->check_substr = NULL;
- r->check_utf8 = NULL;
- r->substrs->data[0].substr = NULL;
- r->substrs->data[0].utf8_substr = NULL;
- r->substrs->data[1].substr = NULL;
- r->substrs->data[1].utf8_substr = NULL;
+ RExC_rx->check_substr = NULL;
+ RExC_rx->check_utf8 = NULL;
+ RExC_rx->substrs->data[0].substr = NULL;
+ RExC_rx->substrs->data[0].utf8_substr = NULL;
+ RExC_rx->substrs->data[1].substr = NULL;
+ RExC_rx->substrs->data[1].utf8_substr = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
(regnode_ssc*)RExC_rxi->data->data[n],
regnode_ssc);
RExC_rxi->regstclass = (regnode*)RExC_rxi->data->data[n];
- r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
+ RExC_rx->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
+ regprop(RExC_rx, sv, (regnode*)data.start_class, NULL, pRExC_state);
Perl_re_printf( aTHX_
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
}
if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) {
- r->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
- r->maxlen = REG_INFTY;
+ RExC_rx->extflags |= RXf_UNBOUNDED_QUANTIFIER_SEEN;
+ RExC_rx->maxlen = REG_INFTY;
}
else {
- r->maxlen = RExC_maxlen;
+ RExC_rx->maxlen = RExC_maxlen;
}
/* Guard against an embedded (?=) or (?<=) with a longer minlen than
the "real" pattern. */
DEBUG_OPTIMISE_r({
- Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n",
- (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen);
+ Perl_re_printf( aTHX_ "minlen: %" IVdf " RExC_rx->minlen:%" IVdf " maxlen:%" IVdf "\n",
+ (IV)minlen, (IV)RExC_rx->minlen, (IV)RExC_maxlen);
});
- r->minlenret = minlen;
- if (r->minlen < minlen)
- r->minlen = minlen;
+ RExC_rx->minlenret = minlen;
+ if (RExC_rx->minlen < minlen)
+ RExC_rx->minlen = minlen;
if (RExC_seen & REG_RECURSE_SEEN ) {
- r->intflags |= PREGf_RECURSE_SEEN;
- Newx(r->recurse_locinput, r->nparens + 1, char *);
+ RExC_rx->intflags |= PREGf_RECURSE_SEEN;
+ Newx(RExC_rx->recurse_locinput, RExC_rx->nparens + 1, char *);
}
if (RExC_seen & REG_GPOS_SEEN)
- r->intflags |= PREGf_GPOS_SEEN;
+ RExC_rx->intflags |= PREGf_GPOS_SEEN;
if (RExC_seen & REG_LOOKBEHIND_SEEN)
- r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
+ RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the
lookbehind */
if (pRExC_state->code_blocks)
- r->extflags |= RXf_EVAL_SEEN;
+ RExC_rx->extflags |= RXf_EVAL_SEEN;
if (RExC_seen & REG_VERBARG_SEEN)
{
- r->intflags |= PREGf_VERBARG_SEEN;
- r->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
+ RExC_rx->intflags |= PREGf_VERBARG_SEEN;
+ RExC_rx->extflags |= RXf_NO_INPLACE_SUBST; /* don't understand this! Yves */
}
if (RExC_seen & REG_CUTGROUP_SEEN)
- r->intflags |= PREGf_CUTGROUP_SEEN;
+ RExC_rx->intflags |= PREGf_CUTGROUP_SEEN;
if (pm_flags & PMf_USE_RE_EVAL)
- r->intflags |= PREGf_USE_RE_EVAL;
+ RExC_rx->intflags |= PREGf_USE_RE_EVAL;
if (RExC_paren_names)
- RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
+ RXp_PAREN_NAMES(RExC_rx) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
else
- RXp_PAREN_NAMES(r) = NULL;
+ RXp_PAREN_NAMES(RExC_rx) = NULL;
/* If we have seen an anchor in our pattern then we set the extflag RXf_IS_ANCHORED
* so it can be used in pp.c */
- if (r->intflags & PREGf_ANCH)
- r->extflags |= RXf_IS_ANCHORED;
+ if (RExC_rx->intflags & PREGf_ANCH)
+ RExC_rx->extflags |= RXf_IS_ANCHORED;
{
U8 nop = OP(next);
if (PL_regkind[fop] == NOTHING && nop == END)
- r->extflags |= RXf_NULL;
+ RExC_rx->extflags |= RXf_NULL;
else if ((fop == MBOL || (fop == SBOL && !first->flags)) && nop == END)
/* when fop is SBOL first->flags will be true only when it was
* produced by parsing /\A/, and not when parsing /^/. This is
* very important for the split code as there we want to
* treat /^/ as /^/m, but we do not want to treat /\A/ as /^/m.
* See rt #122761 for more details. -- Yves */
- r->extflags |= RXf_START_ONLY;
+ RExC_rx->extflags |= RXf_START_ONLY;
else if (fop == PLUS
&& PL_regkind[nop] == POSIXD && FLAGS(next) == _CC_SPACE
&& nop == END)
- r->extflags |= RXf_WHITE;
- else if ( r->extflags & RXf_SPLIT
- && (fop == EXACT || fop == EXACTL)
+ RExC_rx->extflags |= RXf_WHITE;
+ else if ( RExC_rx->extflags & RXf_SPLIT
+ && (fop == EXACT || fop == EXACT_ONLY8 || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& nop == END )
- r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
+ RExC_rx->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
}
if (RExC_contains_locale) {
- RXp_EXTFLAGS(r) |= RXf_TAINTED;
+ RXp_EXTFLAGS(RExC_rx) |= RXf_TAINTED;
}
#ifdef DEBUGGING
ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - REGNODE_OFFSET(scan));
}
- Newxz(r->offs, RExC_npar, regexp_paren_pair);
+ Newxz(RExC_rx->offs, RExC_total_parens, regexp_paren_pair);
/* assume we don't need to swap parens around before we match */
DEBUG_TEST_r({
Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n",
DEBUG_DUMP_r({
DEBUG_RExC_seen();
Perl_re_printf( aTHX_ "Final program:\n");
- regdump(r);
+ regdump(RExC_rx);
});
-#ifdef RE_TRACK_PATTERN_OFFSETS
- DEBUG_OFFSETS_r(if (RExC_offsets) {
- const STRLEN len = RExC_offsets[0];
- STRLEN i;
- GET_RE_DEBUG_FLAGS_DECL;
- Perl_re_printf( aTHX_
- "Offsets: [%" UVuf "]\n\t", (UV)RExC_offsets[0]);
- for (i = 1; i <= len; i++) {
- if (RExC_offsets[i*2-1] || RExC_offsets[i*2])
- Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ",
- (UV)i, (UV)RExC_offsets[i*2-1], (UV)RExC_offsets[i*2]);
- }
- Perl_re_printf( aTHX_ "\n");
- });
-#endif
+
+ if (RExC_open_parens) {
+ Safefree(RExC_open_parens);
+ RExC_open_parens = NULL;
+ }
+ if (RExC_close_parens) {
+ Safefree(RExC_close_parens);
+ RExC_close_parens = NULL;
+ }
#ifdef USE_ITHREADS
/* under ithreads the ?pat? PMf_USED flag on the pmop is simulated
S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
{
char *name_start = RExC_parse;
+ SV* sv_name;
PERL_ARGS_ASSERT_REG_SCAN_NAME;
character */
vFAIL("Group name must start with a non-digit word character");
}
- if ( flags ) {
- SV* sv_name
- = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
+ sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
SVs_TEMP | (UTF ? SVf_UTF8 : 0));
- if ( flags == REG_RSN_RETURN_NAME)
- return sv_name;
- else if (flags==REG_RSN_RETURN_DATA) {
- HE *he_str = NULL;
- SV *sv_dat = NULL;
- if ( ! sv_name ) /* should not happen*/
- Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
- if (RExC_paren_names)
- he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
- if ( he_str )
- sv_dat = HeVAL(he_str);
- if ( ! sv_dat ) /* Didn't find group */
+ if ( flags == REG_RSN_RETURN_NAME)
+ return sv_name;
+ else if (flags==REG_RSN_RETURN_DATA) {
+ HE *he_str = NULL;
+ SV *sv_dat = NULL;
+ if ( ! sv_name ) /* should not happen*/
+ Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
+ if (RExC_paren_names)
+ he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
+ if ( he_str )
+ sv_dat = HeVAL(he_str);
+ if ( ! sv_dat ) { /* Didn't find group */
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
vFAIL("Reference to nonexistent named group");
- return sv_dat;
- }
- else {
- Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
- (unsigned long) flags);
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
- NOT_REACHED; /* NOTREACHED */
+ return sv_dat;
}
- return NULL;
+
+ Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
+ (unsigned long) flags);
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
} else \
Perl_re_printf( aTHX_ "%16s",""); \
\
- if (SIZE_ONLY) \
- num = RExC_size + 1; \
- else \
- num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
+ num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \
if (RExC_lastnum!=num) \
Perl_re_printf( aTHX_ "|%4d", num); \
else \
invlist_iterinit(invlist);
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
- Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c",
+ Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFTY%c",
start, intra_range_delimiter,
inter_range_delimiter);
}
[0] 0x000A .. 0x000D
[2] 0x0085
[4] 0x2028 .. 0x2029
- [6] 0x3104 .. INFINITY
+ [6] 0x3104 .. INFTY
* This means that the first range of code points matched by the list are
* 0xA through 0xD; the second range contains only the single code point
* 0x85, etc. An inversion list is an array of UVs. Two array elements
while (invlist_iternext(invlist, &start, &end)) {
if (end == UV_MAX) {
Perl_dump_indent(aTHX_ level, file,
- "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n",
+ "%s[%" UVuf "] 0x%04" UVXf " .. INFTY\n",
indent, (UV)count, start);
}
else if (end != start) {
RExC_parse++;
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+ set_regex_charset(&RExC_flags, (RExC_uni_semantics)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET);
}
cs = get_regex_charset(RExC_flags);
if (cs == REGEX_DEPENDS_CHARSET
- && (RExC_utf8 || RExC_uni_semantics))
+ && (RExC_uni_semantics))
{
cs = REGEX_UNICODE_CHARSET;
}
* pattern (or target, not known until runtime) are
* utf8, or something in the pattern indicates unicode
* semantics */
- cs = (RExC_utf8 || RExC_uni_semantics)
+ cs = (RExC_uni_semantics)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET;
has_charset_modifier = DEPENDS_PAT_MOD;
NOT_REACHED; /*NOTREACHED*/
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
- if (PASS2 && ckWARN(WARN_REGEXP)) {
+ if (ckWARN(WARN_REGEXP)) {
const I32 wflagbit = *RExC_parse == 'o'
? WASTED_O
: WASTED_G;
break;
case CONTINUE_PAT_MOD: /* 'c' */
- if (PASS2 && ckWARN(WARN_REGEXP)) {
+ if (ckWARN(WARN_REGEXP)) {
if (! (wastedflags & WASTED_C) ) {
wastedflags |= WASTED_GC;
/* diag_listed_as: Useless (?-%s) - don't use /%s modifier in regex; marked by <-- HERE in m/%s/ */
regnode_offset ret;
char* name_start = RExC_parse;
U32 num = 0;
- SV *sv_dat = reg_scan_name(pRExC_state, SIZE_ONLY
- ? REG_RSN_RETURN_NULL
- : REG_RSN_RETURN_DATA);
+ SV *sv_dat = reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_HANDLE_NAMED_BACKREF;
vFAIL2("Sequence %.3s... not terminated", parse_start);
}
- if (!SIZE_ONLY) {
+ if (sv_dat) {
num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
- SvREFCNT_inc_simple_void(sv_dat);
+ SvREFCNT_inc_simple_void_NN(sv_dat);
}
RExC_sawback = 1;
ret = reganode(pRExC_state,
*
* Returns 0 otherwise, with *flagp set to indicate why:
* TRYAGAIN at the end of (?) that only sets flags.
- * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
+ * RESTART_PARSE if the parse needs to be restarted, or'd with
* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
* Otherwise would only return 0 if regbranch() returns 0, which cannot
* happen. */
ret = reg2Lanode(pRExC_state, op, 0, internal_argval);
}
RExC_seen |= REG_VERBARG_SEEN;
- if ( ! SIZE_ONLY ) {
- if (start_arg) {
- SV *sv = newSVpvn( start_arg,
- RExC_parse - start_arg);
- ARG(REGNODE_p(ret)) = add_data( pRExC_state,
- STR_WITH_LEN("S"));
- RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
- FLAGS(REGNODE_p(ret)) = 1;
- } else {
- FLAGS(REGNODE_p(ret)) = 0;
- }
- if ( internal_argval != -1 )
- ARG2L_SET(REGNODE_p(ret), internal_argval);
+ if (start_arg) {
+ SV *sv = newSVpvn( start_arg,
+ RExC_parse - start_arg);
+ ARG(REGNODE_p(ret)) = add_data( pRExC_state,
+ STR_WITH_LEN("S"));
+ RExC_rxi->data->data[ARG(REGNODE_p(ret))]=(void*)sv;
+ FLAGS(REGNODE_p(ret)) = 1;
+ } else {
+ FLAGS(REGNODE_p(ret)) = 0;
}
+ if ( internal_argval != -1 )
+ ARG2L_SET(REGNODE_p(ret), internal_argval);
nextchar(pRExC_state);
return ret;
}
/* FALLTHROUGH */
case '\'': /* (?'...') */
name_start = RExC_parse;
- svname = reg_scan_name(pRExC_state,
- SIZE_ONLY /* reverse test from the others */
- ? REG_RSN_RETURN_NAME
- : REG_RSN_RETURN_NULL);
+ svname = reg_scan_name(pRExC_state, REG_RSN_RETURN_NAME);
if ( RExC_parse == name_start
|| RExC_parse >= RExC_end
|| *RExC_parse != paren)
vFAIL2("Sequence (?%c... not terminated",
paren=='>' ? '<' : paren);
}
- if (SIZE_ONLY) {
+ {
HE *he_str;
SV *sv_dat = NULL;
if (!svname) /* shouldn't happen */
/* Yes this does cause a memory leak in debugging Perls
* */
if (!av_store(RExC_paren_name_list,
- RExC_npar, SvREFCNT_inc(svname)))
+ RExC_npar, SvREFCNT_inc_NN(svname)))
SvREFCNT_dec_NN(svname);
#endif
buffers in alternations share the same numbers */
paren = ':';
after_freeze = freeze_paren = RExC_npar;
+
+ /* XXX This construct currently requires an extra pass.
+ * Investigation would be required to see if that could be
+ * changed */
+ REQUIRE_PARENS_PASS;
break;
case ':': /* (?:...) */
case '>': /* (?>...) */
FAIL("Sequence (?R) not terminated");
num = 0;
RExC_seen |= REG_RECURSE_SEEN;
+
+ /* XXX These constructs currently require an extra pass.
+ * It probably could be changed */
+ REQUIRE_PARENS_PASS;
+
*flagp |= POSTPONED;
goto gen_recurse_regop;
/*notreached*/
parse_start = RExC_parse - 1;
named_recursion:
{
- SV *sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
- num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
+ SV *sv_dat = reg_scan_name(pRExC_state,
+ REG_RSN_RETURN_DATA);
+ num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
if (RExC_parse >= RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
*/
num = RExC_npar + num;
if (num < 1) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
+
+ /* It might be a forward reference; we can't fail until
+ * we know, by completing the parse to get all the
+ * groups, and then reparsing */
+ if (RExC_total_parens > 0) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
} else if ( paren == '+' ) {
num = RExC_npar + num - 1;
*/
ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count);
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens) {
- RExC_parse++;
- vFAIL("Reference to nonexistent group");
- }
- RExC_recurse_count++;
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- (UV)ARG(REGNODE_p(ret)),
- (IV)ARG2L(REGNODE_p(ret))));
+ if (num >= RExC_npar) {
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
+ if (num >= RExC_total_parens) {
+ RExC_parse++;
+ vFAIL("Reference to nonexistent group");
+ }
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
+ RExC_recurse_count++;
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Recurse #%" UVuf " to %" IVdf "\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ (UV)ARG(REGNODE_p(ret)),
+ (IV)ARG2L(REGNODE_p(ret))));
RExC_seen |= REG_RECURSE_SEEN;
Set_Node_Length(REGNODE_p(ret),
{
U32 n = 0;
struct reg_code_block *cb;
+ OP * o;
RExC_seen_zerolen++;
/* this is a pre-compiled code block (?{...}) */
cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index];
RExC_parse = RExC_start + cb->end;
- if (!SIZE_ONLY) {
- OP *o = cb->block;
- if (cb->src_regex) {
- n = add_data(pRExC_state, STR_WITH_LEN("rl"));
- RExC_rxi->data->data[n] =
- (void*)SvREFCNT_inc((SV*)cb->src_regex);
- RExC_rxi->data->data[n+1] = (void*)o;
- }
- else {
- n = add_data(pRExC_state,
- (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
- RExC_rxi->data->data[n] = (void*)o;
- }
- }
+ o = cb->block;
+ if (cb->src_regex) {
+ n = add_data(pRExC_state, STR_WITH_LEN("rl"));
+ RExC_rxi->data->data[n] =
+ (void*)SvREFCNT_inc((SV*)cb->src_regex);
+ RExC_rxi->data->data[n+1] = (void*)o;
+ }
+ else {
+ n = add_data(pRExC_state,
+ (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l", 1);
+ RExC_rxi->data->data[n] = (void*)o;
+ }
pRExC_state->code_index++;
nextchar(pRExC_state);
* return value */
RExC_flags & RXf_PMf_COMPILETIME
);
- if (!SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = 2;
- }
+ FLAGS(REGNODE_p(ret)) = 2;
REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
return ret;
regnode_offset tail;
ret = reg_node(pRExC_state, LOGICAL);
- if (!SIZE_ONLY)
- FLAGS(REGNODE_p(ret)) = 1;
+ FLAGS(REGNODE_p(ret)) = 1;
tail = reg(pRExC_state, 1, &flag, depth+1);
RETURN_FAIL_ON_RESTART(flag, flagp);
char ch = RExC_parse[0] == '<' ? '>' : '\'';
char *name_start= RExC_parse++;
U32 num = 0;
- SV *sv_dat=reg_scan_name(pRExC_state,
- SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
+ SV *sv_dat=reg_scan_name(pRExC_state, REG_RSN_RETURN_DATA);
if ( RExC_parse == name_start
|| RExC_parse >= RExC_end
|| *RExC_parse != ch)
(ch == '>' ? '<' : ch));
}
RExC_parse++;
- if (!SIZE_ONLY) {
+ if (sv_dat) {
num = add_data( pRExC_state, STR_WITH_LEN("S"));
RExC_rxi->data->data[num]=(void*)sv_dat;
- SvREFCNT_inc_simple_void(sv_dat);
+ SvREFCNT_inc_simple_void_NN(sv_dat);
}
ret = reganode(pRExC_state, NGROUPP, num);
goto insert_if_check_paren;
SV *sv_dat;
RExC_parse++;
sv_dat = reg_scan_name(pRExC_state,
- SIZE_ONLY
- ? REG_RSN_RETURN_NULL
- : REG_RSN_RETURN_DATA);
-
- /* we should only have a false sv_dat when
- * SIZE_ONLY is true, and we always have false
- * sv_dat when SIZE_ONLY is true.
- * reg_scan_name() will VFAIL() if the name is
- * unknown when SIZE_ONLY is false, and otherwise
- * will return something, and when SIZE_ONLY is
- * true, reg_scan_name() just parses the string,
- * and doesnt return anything. (in theory) */
- assert(SIZE_ONLY ? !sv_dat : !!sv_dat);
-
+ REG_RSN_RETURN_DATA);
if (sv_dat)
parno = 1 + *((I32 *)SvPVX(sv_dat));
}
}
else
REGTAIL(pRExC_state, ret, ender);
+#if 0 /* Removing this doesn't cause failures in the test suite -- khw */
RExC_size++; /* XXX WHY do we need this?!!
For large programs it seems to be required
but I can't figure out why. -- dmq*/
+#endif
return ret;
}
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
capturing_parens:
parno = RExC_npar;
RExC_npar++;
+ if (RExC_total_parens <= 0) {
+ /* If we are in our first pass through (and maybe only pass),
+ * we need to allocate memory for the capturing parentheses
+ * data structures. npar starts at 1, so the first time it
+ * reaches 2 is the first time there is anything to store in
+ * them. Above 2 means we extend what we already have */
+ if (RExC_npar == 2) {
+ /* Set up RExC_open_parens, which holds the address of each
+ * OPEN tag; to make things simpler, index 0 holds the start
+ * of the program. This is used later for offsets */
+ Newxz(RExC_open_parens, RExC_npar, regnode_offset);
+ RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */
+
+ /* Set up RExC_close_parens, which holds the address of each
+ * CLOSE tag; to make things simpler, index 0 holds the end
+ * of the program. This is used later for offsets
+ * */
+ Newxz(RExC_close_parens, RExC_npar, regnode_offset);
+ /* we don't know where end op starts yet, so we don't need to
+ * set RExC_close_parens[0] like we do RExC_open_parens[0]
+ * above */
+ }
+ else {
+ Renew(RExC_open_parens, RExC_npar, regnode_offset);
+ Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset);
+
+ Renew(RExC_close_parens, RExC_npar, regnode_offset);
+ Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset);
+ }
+ }
ret = reganode(pRExC_state, OPEN, parno);
- if (!SIZE_ONLY ){
- if (!RExC_nestroot)
- RExC_nestroot = parno;
- if (RExC_open_parens && !RExC_open_parens[parno])
- {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting open paren #%" IVdf " to %d\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
- RExC_open_parens[parno]= ret;
- }
- }
+ if (!RExC_nestroot)
+ RExC_nestroot = parno;
+ if (RExC_open_parens && !RExC_open_parens[parno])
+ {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting open paren #%" IVdf " to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ (IV)parno, REG_NODE_NUM(REGNODE_p(ret))));
+ RExC_open_parens[parno]= ret;
+ }
Set_Node_Length(REGNODE_p(ret), 1); /* MJD */
Set_Node_Offset(REGNODE_p(ret), RExC_parse); /* MJD */
FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags);
}
if (*RExC_parse == '|') {
- if (!SIZE_ONLY && RExC_extralen) {
+ if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, BRANCHJ, br, depth+1);
}
else { /* MJD */
Set_Node_Offset_To_R(br, parse_start-RExC_start);
}
have_branch = 1;
- if (SIZE_ONLY)
- RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
}
else if (paren == ':') {
*flagp |= flags&SIMPLE;
*flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
lastbr = br;
while (*RExC_parse == '|') {
- if (!SIZE_ONLY && RExC_extralen) {
+ if (RExC_use_BRANCHJ) {
ender = reganode(pRExC_state, LONGJMP, 0);
/* Append to the previous. */
REGNODE_OFFSET(NEXTOPER(NEXTOPER(REGNODE_p(lastbr)))),
ender);
}
- if (SIZE_ONLY)
- RExC_extralen += 2; /* Account for LONGJMP. */
nextchar(pRExC_state);
if (freeze_paren) {
if (RExC_npar > after_freeze)
break;
case 0:
ender = reg_node(pRExC_state, END);
- if (!SIZE_ONLY) {
- assert(!RExC_end_op); /* there can only be one! */
- RExC_end_op = REGNODE_p(ender);
- if (RExC_close_parens) {
- DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
- "%*s%*s Setting close paren #0 (END) to %d\n",
- 22, "| |", (int)(depth * 2 + 1), "",
- REG_NODE_NUM(REGNODE_p(ender))));
+ assert(!RExC_end_op); /* there can only be one! */
+ RExC_end_op = REGNODE_p(ender);
+ if (RExC_close_parens) {
+ DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_
+ "%*s%*s Setting close paren #0 (END) to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
+ REG_NODE_NUM(REGNODE_p(ender))));
- RExC_close_parens[0]= ender;
- }
+ RExC_close_parens[0]= ender;
}
break;
}
- DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ DEBUG_PARSE_r(
DEBUG_PARSE_MSG("lsbr");
regprop(RExC_rx, RExC_mysv1, REGNODE_p(lastbr), NULL, pRExC_state);
regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state);
(IV)REG_NODE_NUM(REGNODE_p(ender)),
(IV)(ender - lastbr)
);
- });
+ );
REGTAIL(pRExC_state, lastbr, ender);
- if (have_branch && !SIZE_ONLY) {
+ if (have_branch) {
char is_nothing= 1;
if (depth==1)
RExC_seen |= REG_TOP_LEVEL_BRANCHES_SEEN;
br= PL_regkind[OP(ret_as_regnode)] != BRANCH
? regnext(ret_as_regnode)
: ret_as_regnode;
- DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ DEBUG_PARSE_r(
DEBUG_PARSE_MSG("NADA");
regprop(RExC_rx, RExC_mysv1, ret_as_regnode,
NULL, pRExC_state);
(IV)REG_NODE_NUM(REGNODE_p(ender)),
(IV)(ender - ret)
);
- });
+ );
OP(br)= NOTHING;
if (OP(REGNODE_p(ender)) == TAIL) {
NEXT_OFF(br)= 0;
* On success, returns the offset at which any next node should be placed into
* the regex engine program being compiled.
*
- * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+ * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
* UTF-8
*/
if (first)
ret = 0;
else {
- if (!SIZE_ONLY && RExC_extralen)
+ if (RExC_use_BRANCHJ)
ret = reganode(pRExC_state, BRANCHJ, 0);
else {
ret = reg_node(pRExC_state, BRANCH);
}
}
- if (!first && SIZE_ONLY)
- RExC_extralen += 1; /* BRANCHJ */
-
*flagp = WORST; /* Tentatively. */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
/* FIXME adding one for every branch after the first is probably
* excessive now we have TRIE support. (hv) */
MARK_NAUGHTY(1);
+ if ( chain > (SSize_t) BRANCH_MAX_OFFSET
+ && ! RExC_use_BRANCHJ)
+ {
+ /* XXX We could just redo this branch, but figuring out what
+ * bookkeeping needs to be reset is a pain */
+ REQUIRE_BRANCHJ(flagp, 0);
+ }
REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
*
* Returns 0 otherwise, with *flagp set to indicate why:
* TRYAGAIN if regatom() returns 0 with TRYAGAIN.
- * RESTART_PARSE if the sizing scan needs to be restarted, or'd with
+ * RESTART_PARSE if the parse needs to be restarted, or'd with
* NEED_UTF8 if the pattern needs to be upgraded to UTF-8.
*/
STATIC regnode_offset
if (max < min) { /* If can't match, warn and optimize to fail
unconditionally */
reginsert(pRExC_state, OPFAIL, orig_emit, depth+1);
- if (PASS2) {
- ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
- NEXT_OFF(REGNODE_p(orig_emit)) =
- regarglen[OPFAIL] + NODE_STEP_REGNODE;
- }
+ ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+ NEXT_OFF(REGNODE_p(orig_emit)) =
+ regarglen[OPFAIL] + NODE_STEP_REGNODE;
return ret;
}
else if (min == max && *RExC_parse == '?')
FLAGS(REGNODE_p(w)) = 0;
REGTAIL(pRExC_state, ret, w);
- if (!SIZE_ONLY && RExC_extralen) {
+ if (RExC_use_BRANCHJ) {
reginsert(pRExC_state, LONGJMP, ret, depth+1);
reginsert(pRExC_state, NOTHING, ret, depth+1);
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over LONGJMP. */
Set_Node_Length(REGNODE_p(ret),
op == '{' ? (RExC_parse - parse_start) : 1);
- if (!SIZE_ONLY && RExC_extralen)
+ if (RExC_use_BRANCHJ)
NEXT_OFF(REGNODE_p(ret)) = 3; /* Go over NOTHING to
LONGJMP. */
REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
- if (SIZE_ONLY)
- RExC_whilem_seen++, RExC_extralen += 3;
+ RExC_whilem_seen++;
MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
FLAGS(REGNODE_p(ret)) = 0;
*flagp = WORST;
if (max > 0)
*flagp |= HASWIDTH;
- if (!SIZE_ONLY) {
- ARG1_SET(REGNODE_p(ret), (U16)min);
- ARG2_SET(REGNODE_p(ret), (U16)max);
- }
+ ARG1_SET(REGNODE_p(ret), (U16)min);
+ ARG2_SET(REGNODE_p(ret), (U16)max);
if (max == REG_INFTY)
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto do_curly;
}
nest_check:
- if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
- SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
+ if (!(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
ckWARN2reg(RExC_parse,
"%" UTF8f " matches null string many times",
UTF8fARG(UTF, (RExC_parse >= origparse
? RExC_parse - origparse
: 0),
origparse));
- (void)ReREFCNT_inc(RExC_rx_sv);
}
if (*RExC_parse == '?') {
* function calling S_reg().
*
* The final possibility is that it is premature to be calling this function;
- * that pass1 needs to be restarted. This can happen when this changes from
+ * the parse needs to be restarted. This can happen when this changes from
* /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The
* latter occurs only when the fourth possibility would otherwise be in
* effect, and is because one of those code points requires the pattern to be
* The error reporting mechanism doesn't work for 2 levels of this, but the
* code above has validated this new construct, so there should be no
* errors generated by the below. And this isn' an exact copy, so the
- * mechanism to seamlessly deal with this won't work. XXX Maybe should
- * turn off all warnings for safety? */
+ * mechanism to seamlessly deal with this won't work, so turn off warnings
+ * during it */
save_start = RExC_start;
orig_end = RExC_end;
RExC_parse = RExC_start = SvPVX(substitute_parse);
RExC_end = RExC_parse + SvCUR(substitute_parse);
+ TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
*node_p = reg(pRExC_state, 1, &flags, depth+1);
/* Restore the saved values */
+ RESTORE_WARNINGS;
RExC_start = save_start;
RExC_parse = endbrace;
RExC_end = orig_end;
SvREFCNT_dec_NN(substitute_parse);
if (! *node_p) {
- RETURN_X_ON_RESTART(FALSE, flags, flagp);
+ RETURN_FAIL_ON_RESTART(flags, flagp);
FAIL2("panic: reg returned failure to grok_bslash_N, flags=%#" UVxf,
(UV) flags);
}
*
* If <len> is zero, the function assumes that the node is to contain only
* the single character given by <code_point> and calculates what <len>
- * should be. In pass 1, it sizes the node appropriately. In pass 2, it
- * additionally will populate the node's STRING with <code_point> or its
+ * should be. It populates the node's STRING with <code_point> or its
* fold if folding.
*
* In both cases <*flagp> is appropriately set
PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
- /* Don't bother to check for downgrading in PASS1, as it doesn't make any
- * sizing difference, and is extra work that is thrown away */
- if (downgradable && ! PASS2) {
- downgradable = FALSE;
- }
-
if (! len_passed_in) {
if (UTF) {
if (UVCHR_IS_INVARIANT(code_point)) {
if (LOC || ! FOLD) { /* /l defers folding until runtime */
*character = (U8) code_point;
}
- else { /* Here is /i and not /l. (toFOLD() is defined on just
- ASCII, which isn't the same thing as INVARIANT on
- EBCDIC, but it works there, as the extra invariants
- fold to themselves) */
+ else { /* Here is /i and not /l. */
*character = toFOLD((U8) code_point);
/* We can downgrade to an EXACT node if this character
}
len = 1;
}
- else if (FOLD && (! LOC
+ else if (FOLD && ( ! LOC
|| ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
{ /* Folding, and ok to do so now */
UV folded = _to_uni_fold_flags(
}
}
- if (SIZE_ONLY) {
- RExC_size += STR_SZ(len);
+ if (downgradable) {
+ change_engine_size(pRExC_state, STR_SZ(len));
}
- else {
- RExC_emit += STR_SZ(len);
- STR_LEN(REGNODE_p(node)) = len;
- if (! len_passed_in) {
- Copy((char *) character, STRING(REGNODE_p(node)), len, char);
- }
+
+ RExC_emit += STR_SZ(len);
+ STR_LEN(REGNODE_p(node)) = len;
+ if (! len_passed_in) {
+ Copy((char *) character, STRING(REGNODE_p(node)), len, char);
}
*flagp |= HASWIDTH;
*flagp |= SIMPLE;
}
- /* The OP may not be well defined in PASS1 */
- if (PASS2 && OP(REGNODE_p(node)) == EXACTFL) {
+ if (OP(REGNODE_p(node)) == EXACTFL) {
RExC_contains_locale = 1;
}
}
at which any next regnode should be placed.
Returns 0, setting *flagp to TRYAGAIN if reg() returns 0 with TRYAGAIN.
- Returns 0, setting *flagp to RESTART_PARSE if the sizing scan needs to be
+ Returns 0, setting *flagp to RESTART_PARSE if the parse needs to be
restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to UTF-8
Otherwise does not return 0.
FALSE, /* don't silence non-portable warnings. */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
if (ret == 0) {
- RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, NEED_UTF8);
+ RETURN_FAIL_ON_RESTART_FLAGP(flagp);
FAIL2("panic: regclass returned failure to regatom, flags=%#" UVxf,
(UV) *flagp);
}
/* Special Escapes
This switch handles escape sequences that resolve to some kind
- of special regop and not to literal text. Escape sequnces that
+ of special regop and not to literal text. Escape sequences that
resolve to literal text are handled below in the switch marked
"Literal Escapes".
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, SBOL);
/* SBOL is shared with /^/ so we set the flags so we can tell
- * /\A/ from /^/ in split. We check ret because first pass we
- * have no regop struct to set the flags on. */
- if (PASS2)
- FLAGS(REGNODE_p(ret)) = 1;
+ * /\A/ from /^/ in split. */
+ FLAGS(REGNODE_p(ret)) = 1;
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'G':
RExC_seen |= REG_LOOKBEHIND_SEEN;
op = BOUND + charset;
- if (op == BOUNDL) {
+ if (op == BOUND) {
+ RExC_seen_d_op = TRUE;
+ }
+ else if (op == BOUNDL) {
RExC_contains_locale = 1;
}
*flagp |= SIMPLE;
if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') {
FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND;
- if (PASS2 && op > BOUNDA) { /* /aa is same as /a */
+ if (op > BOUNDA) { /* /aa is same as /a */
OP(REGNODE_p(ret)) = BOUNDA;
}
}
RExC_parse = endbrace;
REQUIRE_UNI_RULES(flagp, 0);
- if (PASS2 && op >= BOUNDA) { /* /aa is same as /a */
+ if (op >= BOUNDA) { /* /aa is same as /a */
OP(REGNODE_p(ret)) = BOUNDU;
length += 4;
}
}
- if (PASS2 && invert) {
+ if (invert) {
OP(REGNODE_p(ret)) += NBOUND - BOUND;
}
goto finish_meta_pat;
else if (op == POSIXL) {
RExC_contains_locale = 1;
}
+ else if (op == POSIXD) {
+ RExC_seen_d_op = TRUE;
+ }
join_posix_op_known:
}
ret = reg_node(pRExC_state, op);
- if (! SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
- }
+ FLAGS(REGNODE_p(ret)) = namedclass_to_classnum(arg);
*flagp |= HASWIDTH|SIMPLE;
/* FALLTHROUGH */
non-portables */
(bool) RExC_strict,
TRUE, /* Allow an optimized regnode result */
- NULL,
NULL);
RETURN_FAIL_ON_RESTART_FLAGP(flagp);
/* regclass() can only return RESTART_PARSE and NEED_UTF8 if
vFAIL("Unterminated \\g{...} pattern");
RExC_parse++;
}
- if (!SIZE_ONLY) {
- if (num > (I32)RExC_rx->nparens)
- vFAIL("Reference to nonexistent group");
+ if (num >= (I32)RExC_npar) {
+
+ /* It might be a forward reference; we can't fail until we
+ * know, by completing the parse to get all the groups, and
+ * then reparsing */
+ if (RExC_total_parens > 0) {
+ if (num >= RExC_total_parens) {
+ vFAIL("Reference to nonexistent group");
+ }
+ }
+ else {
+ REQUIRE_PARENS_PASS;
+ }
}
RExC_sawback = 1;
ret = reganode(pRExC_state,
? REFFL
: REFF),
num);
+ if (OP(REGNODE_p(ret)) == REFF) {
+ RExC_seen_d_op = TRUE;
+ }
*flagp |= HASWIDTH;
/* override incorrect value set in reganode MJD */
/* This allows us to fill a node with just enough spare so that if the final
* character folds, its expansion is guaranteed to fit */
#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
- char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1];
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
/* We start out as an EXACT node, even if under /i, until we find a
* character which is in a fold. The algorithm now segregates into
* separate nodes, characters that fold from those that don't under
- * /i. (This hopefull will create nodes that are fixed strings
- * even under /i, giving the optimizer something to grab onto to.)
+ * /i. (This hopefully will create nodes that are fixed strings
+ * even under /i, giving the optimizer something to grab on to.)
* So, if a node has something in it and the next character is in
* the opposite category, that node is closed up, and the function
* returns. Then regatom is called again, and a new node is
* created for the new category. */
U8 node_type = EXACT;
+ /* Assume the node will be fully used; the excess is given back at
+ * the end. We can't make any other length assumptions, as a byte
+ * input sequence could shrink down. */
+ Ptrdiff_t initial_size = STR_SZ(256);
+
bool next_is_quantifier;
char * oldp = NULL;
* Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they
* contain only above-Latin1 characters (hence must be in UTF8),
* which don't participate in folds with Latin1-range characters,
- * as the latter's folds aren't known until runtime. (We don't
- * need to figure this out until pass 2) */
- bool maybe_exactfu = PASS2;
+ * as the latter's folds aren't known until runtime. */
+ bool maybe_exactfu = FOLD;
- /* To see if RExC_uni_semantics changes during parsing of the node.
- * */
- bool uni_semantics_at_node_start;
+ /* An EXACTF node that otherwise could be turned into EXACTFU,
+ * can't be if it starts and/or ends with [Ss]. Because, during
+ * optimization it could be joined with another node that ends
+ * and/or starts with [Ss], creating the sequence 'ss', which needs
+ * to remain in an EXACTF node. This flag is used to signal this
+ * situation */
+ bool maybe_exactfs = FALSE;
+
+ /* Single-character EXACTish nodes are almost always SIMPLE. This
+ * allows us to override this as encountered */
+ U8 maybe_SIMPLE = SIMPLE;
+
+ /* Does this node contain something that can't match unless the
+ * target string is (also) in UTF-8 */
+ bool requires_utf8_target = FALSE;
+
+ bool has_micro_sign = FALSE;
- /* The node_type may change below, but since the size of the node
- * doesn't change, it works */
- ret = reg_node(pRExC_state, node_type);
+ /* Allocate an EXACT node. The node_type may change below to
+ * another EXACTish node, but since the size of the node doesn't
+ * change, it works */
+ ret = regnode_guts(pRExC_state, node_type, initial_size, "exact");
+ FILL_NODE(ret, node_type);
+ RExC_emit++;
- /* In pass1, folded, we use a temporary buffer instead of the
- * actual node, as the node doesn't exist yet */
- s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(REGNODE_p(ret));
+ s = STRING(REGNODE_p(ret));
s0 = s;
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
- uni_semantics_at_node_start = cBOOL(RExC_uni_semantics);
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
}
p = RExC_parse;
RExC_parse = parse_start;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
+
+ /* The \N{} means the pattern, if previously /d,
+ * becomes /u. That means it can't be an EXACTF node,
+ * but an EXACTFU */
+ if (node_type == EXACTF) {
+ node_type = EXACTFU;
+
+ /* If the node already contains something that
+ * differs between EXACTF and EXACTFU, reparse it
+ * as EXACTFU */
+ if (! maybe_exactfu) {
+ len = 0;
+ s = s0;
+ maybe_exactfu = FOLD; /* Prob. unnecessary */
+ goto reparse;
+ }
}
+
break;
case 'r':
ender = '\r';
RExC_end,
&result,
&error_msg,
- PASS2, /* out warnings */
+ TO_OUTPUT_WARNINGS(p),
(bool) RExC_strict,
TRUE, /* Output warnings
for non-
to exact spot of failure */
vFAIL(error_msg);
}
+ UPDATE_WARNINGS_LOC(p - 1);
ender = result;
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'x':
RExC_end,
&result,
&error_msg,
- PASS2, /* out warnings */
+ TO_OUTPUT_WARNINGS(p),
(bool) RExC_strict,
TRUE, /* Silence warnings
for non-
to exact spot of failure */
vFAIL(error_msg);
}
+ UPDATE_WARNINGS_LOC(p - 1);
ender = result;
if (ender < 0x100) {
}
#endif
}
- else {
- REQUIRE_UTF8(flagp);
- }
break;
}
case 'c':
p++;
- ender = grok_bslash_c(*p++, PASS2);
+ ender = grok_bslash_c(*p, TO_OUTPUT_WARNINGS(p));
+ UPDATE_WARNINGS_LOC(p);
+ p++;
break;
case '8': case '9': /* must be a backreference */
--p;
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
STRLEN numlen = 3;
ender = grok_oct(p, &numlen, &flags, NULL);
- if (ender > 0xff) {
- REQUIRE_UTF8(flagp);
- }
p += numlen;
- if ( numlen < 3 /* like \08, \178 */
- && isDIGIT(*p) && ckWARN(WARN_REGEXP))
+ if ( isDIGIT(*p) /* like \08, \178 */
+ && ckWARN(WARN_REGEXP)
+ && numlen < 3)
{
reg_warn_non_literal_string(
p + 1,
/* FALLTHROUGH */
default:
if (isALPHANUMERIC(*p)) {
- /* Include any left brace following the alpha to emphasize
- * that it could be part of an escape at some point
- * in the future */
- int len = (isALPHA(*p) && *(p + 1) == '{') ? 2 : 1;
- ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+ /* An alpha followed by '{' is going to fail next
+ * iteration, so don't output this warning in that
+ * case */
+ if (! isALPHA(*p) || *(p + 1) != '{') {
+ ckWARN2reg(p + 1, "Unrecognized escape \\%.1s"
+ " passed through", p);
+ }
}
goto normal_default;
} /* End of switch on '\' */
/* Here, have looked at the literal character, and <ender>
* contains its ordinal; <p> points to the character after it.
- * We need to check if the next non-ignored thing is a
+ * */
+
+ if (ender > 255) {
+ REQUIRE_UTF8(flagp);
+ }
+
+ /* We need to check if the next non-ignored thing is a
* quantifier. Move <p> to after anything that should be
* ignored, which, as a side effect, positions <p> for the next
* loop iteration */
if (! FOLD) { /* The simple case, just append the literal */
- /* In the sizing pass, we need only the size of the
- * character we are appending, hence we can delay getting
- * its representation until PASS2. */
- if (SIZE_ONLY) {
- if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
- const STRLEN unilen = UVCHR_SKIP(ender);
- s += unilen;
- added_len = unilen;
+ not_fold_common:
+ if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
+ *(s++) = (char) ender;
}
else {
- s++;
- }
- } else { /* PASS2 */
- not_fold_common:
- if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
added_len = (char *) new_s - s;
s = (char *) new_s;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
+ }
}
- else {
- *(s++) = (char) ender;
- }
- }
}
else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
* existing node, so can start a new node with this one */
if (! len) {
node_type = EXACTFL;
+ RExC_contains_locale = 1;
}
else if (node_type == EXACT) {
p = oldp;
/* This code point means we can't simplify things */
maybe_exactfu = FALSE;
- /* A problematic code point in this context means that its
- * fold isn't known until runtime, so we can't fold it now.
- * (The non-problematic code points are the above-Latin1
- * ones that fold to also all above-Latin1. Their folds
- * don't vary no matter what the locale is.) But here we
- * have characters whose fold depends on the locale.
- * Unlike the non-folding case above, we have to keep track
- * of these in the sizing pass, so that we can make sure we
- * don't split too-long nodes in the middle of a potential
- * multi-char fold. And unlike the regular fold case
- * handled in the else clauses below, we don't actually
- * fold and don't have special cases to consider. What we
- * do for both passes is the PASS2 code for non-folding */
+ /* Here, we are adding a problematic fold character.
+ * "Problematic" in this context means that its fold isn't
+ * known until runtime. (The non-problematic code points
+ * are the above-Latin1 ones that fold to also all
+ * above-Latin1. Their folds don't vary no matter what the
+ * locale is.) But here we have characters whose fold
+ * depends on the locale. We just add in the unfolded
+ * character, and wait until runtime to fold it */
goto not_fold_common;
}
- else /* A regular FOLD code point */
- if (! UTF)
+ else /* regular fold; see if actually is in a fold */
+ if ( (ender < 256 && ! IS_IN_SOME_FOLD_L1(ender))
+ || (ender > 255
+ && ! _invlist_contains_cp(PL_utf8_foldable, ender)))
{
- /* Here, are folding and are not UTF-8 encoded; therefore
- * the character must be in the range 0-255, and is not /l.
- * (Not /l because we already handled these under /l in
- * is_PROBLEMATIC_LOCALE_FOLD_cp) */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
-
- /* Start a new node for this non-folding character if
- * previous ones in the node were folded */
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
+ /* Here, folding, but the character isn't in a fold.
+ *
+ * Start a new node if previous characters in the node were
+ * folded */
+ if (len && node_type != EXACT) {
+ p = oldp;
+ goto loopdone;
+ }
- *(s++) = (char) ender;
+ /* Here, continuing a node with non-folded characters. Add
+ * this one */
+ goto not_fold_common;
+ }
+ else { /* Here, does participate in some fold */
+
+ /* If this is the first character in the node, change its
+ * type to folding. Otherwise, if this is the first
+ * folding character in the node, close up the existing
+ * node, so can start a new node with this one. */
+ if (! len) {
+ node_type = compute_EXACTish(pRExC_state);
+ }
+ else if (node_type == EXACT) {
+ p = oldp;
+ goto loopdone;
}
- else { /* Here, does participate in some fold */
-
- /* if this is the first character in the node, change
- * its type to folding. Otherwise, if this is the
- * first folding character in the node, close up the
- * existing node, so can start a new node with this
- * one. */
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
+
+ if (UTF) { /* For UTF-8, we add the folded value */
+ if (UVCHR_IS_INVARIANT(ender)) {
+ *(s)++ = (U8) toFOLD(ender);
}
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
+ else {
+ ender = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &added_len,
+ FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+ s += added_len;
+
+ if (ender > 255) {
+ requires_utf8_target = TRUE;
+ if (UNLIKELY(ender == GREEK_SMALL_LETTER_MU)) {
+ has_micro_sign = TRUE;
+ }
+ }
}
+ }
+ else {
- /* See if the character's fold differs between /d and
- * /u. On non-ancient Unicode versions, this includes
- * the multi-char fold SHARP S to 'ss' */
+ /* Here is non-UTF8; we don't normally store the folded
+ * value. First, see if the character's fold differs
+ * between /d and /u. */
+ if (PL_fold[ender] != PL_fold_latin1[ender]) {
+ maybe_exactfu = FALSE;
+ }
#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
|| (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
|| UNICODE_DOT_DOT_VERSION > 0)
- if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
-
- /* See comments for join_exact() as to why we fold
- * this non-UTF at compile time */
- if (node_type == EXACTFU) {
- *(s++) = 's';
+ /* On non-ancient Unicode versions, this includes the
+ * multi-char fold SHARP S to 'ss' */
- /* Let the code below add in the extra 's' */
- ender = 's';
- added_len = 2;
- }
- else if ( uni_semantics_at_node_start
- != RExC_uni_semantics)
- {
- /* Here, we are supossed to be using Unicode
- * rules, but this folding node is not. This
- * happens during pass 1 when the node started
- * out not under Unicode rules, but a \N{} was
- * encountered during the processing of it,
- * causing Unicode rules to be switched into.
- * Pass 1 continues uninterrupted, as by the
- * time we get to pass 2, we will know enough
- * to generate the correct folds. Except in
- * this one case, we need to restart the node,
- * because the fold of the sharp s requires 2
- * characters, and the sizing needs to account
- * for that. */
- p = oldp;
- goto loopdone;
- }
- else {
- RExC_seen_unfolded_sharp_s = 1;
- maybe_exactfu = FALSE;
- }
+ if (len == 0 && isALPHA_FOLD_EQ(ender, 's')) {
+ maybe_exactfs = TRUE; /* Node begins with 's' */
}
- else if ( len
- && isALPHA_FOLD_EQ(ender, 's')
- && isALPHA_FOLD_EQ(*(s-1), 's'))
+ else if ( UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)
+ || ( isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's')))
{
- maybe_exactfu = FALSE;
+ /* Here, we have one of the following:
+ * a) a SHARP S. This folds to 'ss' only under
+ * /u rules. If we are in that situation,
+ * fold the SHARP S to 'ss'. See the comments
+ * for join_exact() as to why we fold this
+ * non-UTF at compile time, and no others.
+ * b) 'ss'. When under /u, there's nothing
+ * special needed to be done here. The
+ * previous iteration handled the first 's',
+ * and this iteration will handle the second.
+ * If, on the other hand, it's not /u, we have
+ * to exclude the possibility of moving to /u,
+ * so that we won't generate an unwanted
+ * match, unless, at runtime, the target
+ * string is in UTF-8.
+ * */
+
+ maybe_exactfs = FALSE; /* Can't generate an
+ EXACTFS node */
+ maybe_exactfu = FALSE; /* Nor EXACTFU (unless we
+ already are in one) */
+ if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
+ maybe_SIMPLE = 0;
+ if (node_type == EXACTFU) {
+ *(s++) = 's';
+
+ /* Let the code below add in the extra 's' */
+ ender = 's';
+ added_len = 2;
+ }
+ }
}
- else
#endif
- if (PL_fold[ender] != PL_fold_latin1[ender]) {
- maybe_exactfu = FALSE;
+ else if (UNLIKELY(ender == MICRO_SIGN)) {
+ has_micro_sign = TRUE;
}
/* Even when folding, we store just the input
- * character, as we have an array that finds its fold
- * quickly */
+ * character. The bottom line reason to do this is
+ * because the fold for MICRO SIGN requires UTF-8. But
+ * there's no real performance penalty for not folding,
+ * as we have an array that finds any fold quickly. */
*(s++) = (char) ender;
}
- }
- else { /* FOLD, and UTF */
- /* Unlike the non-fold case, we do actually have to
- * calculate the fold in pass 1. This is for two reasons,
- * the folded length may be longer than the unfolded, and
- * we have to calculate how many EXACTish nodes it will
- * take; and we may run out of room in a node in the middle
- * of a potential multi-char fold, and have to back off
- * accordingly. */
-
- if (isASCII_uni(ender)) {
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! IS_IN_SOME_FOLD_L1(ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) ender;
- }
- else { /* Is in a fold */
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- *(s)++ = (U8) toFOLD(ender);
- }
- }
- else { /* Not ASCII */
- STRLEN foldlen;
-
- /* As above, we close up and start a new node if the
- * previous characters don't match the fold/non-fold
- * state of this one. And if this is the first
- * character in the node, and it folds, we change the
- * node away from being EXACT */
- if (! _invlist_contains_cp(PL_utf8_foldable, ender)) {
- if (len && node_type != EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- s = (char *) uvchr_to_utf8((U8 *) s, ender);
- added_len = UVCHR_SKIP(ender);
- }
- else {
-
- if (! len) {
- node_type = compute_EXACTish(pRExC_state);
- }
- else if (node_type == EXACT) {
- p = oldp;
- goto loopdone;
- }
-
- ender = _to_uni_fold_flags(
- ender,
- (U8 *) s,
- &foldlen,
- FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
- s += foldlen;
- added_len = foldlen;
- }
- }
- }
+ } /* End of adding current character to the node */
len += added_len;
if (len == 0) {
len = full_len;
- /* If the node ends in an 's' we make sure it stays EXACTF,
- * as if it turns into an EXACTFU, it could later get
- * joined with another 's' that would then wrongly match
- * the sharp s */
- if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
- {
- maybe_exactfu = FALSE;
- }
} else {
/* Here, the node does contain some characters that aren't
loopdone: /* Jumped to when encounters something that shouldn't be
in the node */
+ /* Free up any over-allocated space */
+ change_engine_size(pRExC_state, - (initial_size - STR_SZ(len)));
+
/* I (khw) don't know if you can get here with zero length, but the
* old code handled this situation by creating a zero-length EXACT
* node. Might as well be NOTHING instead */
OP(REGNODE_p(ret)) = NOTHING;
}
else {
- OP(REGNODE_p(ret)) = node_type;
/* If the node type is EXACT here, check to see if it
- * should be EXACTL. */
+ * should be EXACTL, or EXACT_ONLY8. */
if (node_type == EXACT) {
if (LOC) {
- OP(REGNODE_p(ret)) = EXACTL;
+ node_type = EXACTL;
+ }
+ else if (requires_utf8_target) {
+ node_type = EXACT_ONLY8;
}
}
if (FOLD) {
+ /* If the node ends in an 's' it can't now be changed into
+ * an EXACTFU, as the node could later get joined with another
+ * one that begins with 's' and that combination would
+ * then wrongly match the sharp s under /di. (Note that if
+ * it's already EXACTFU, this is irrelevant) If this is
+ * the only reason keeping it from being an EXACTFU, we
+ * create a special node type so that at joining time, we
+ * can turn it into an EXACTFU if no 'ss' is formed */
+ if (isALPHA_FOLD_EQ(ender, 's')) {
+ if (maybe_exactfu && node_type == EXACTF) {
+ node_type = (maybe_exactfs)
+ ? EXACTFS_BE_U
+ : EXACTFS_E_U;
+ }
+ maybe_exactfu = FALSE;
+ }
+
/* If 'maybe_exactfu' is set, then there are no code points
* that match differently depending on UTF8ness of the
* target string (for /u), or depending on locale for /l */
if (maybe_exactfu) {
if (node_type == EXACTF) {
- OP(REGNODE_p(ret)) = EXACTFU;
+ node_type = EXACTFU;
}
else if (node_type == EXACTFL) {
- OP(REGNODE_p(ret)) = EXACTFLU8;
+ node_type = EXACTFLU8;
+ }
+ }
+ else if (node_type == EXACTF) {
+ RExC_seen_d_op = TRUE;
+
+ /* If the only thing keeping this from being EXACTFU is
+ * that it begins with 's', change it to a special node
+ * type so that during the later join, we can easily
+ * check for, and do the change there if appropriate */
+ if (maybe_exactfs) {
+ node_type = EXACTFS_B_U;
}
}
+
+ /* The micro sign is the only below 256 character that
+ * folds to above 255 */
+ if ( node_type == EXACTFU
+ && requires_utf8_target
+ && LIKELY(! has_micro_sign))
+ {
+ node_type = EXACTFU_ONLY8;
+ }
}
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
- FALSE /* Don't look to see if could
- be turned into an EXACT
- node, as we have already
- computed that */
- );
+ OP(REGNODE_p(ret)) = node_type;
+ STR_LEN(REGNODE_p(ret)) = len;
+ RExC_emit += STR_SZ(len);
+
+ /* If the node isn't a single character, it can't be SIMPLE */
+ if (len > ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+ maybe_SIMPLE = 0;
+ }
+
+ *flagp |= HASWIDTH | maybe_SIMPLE;
}
- RExC_parse = p - 1;
- Set_Node_Cur_Length(REGNODE_p(ret), parse_start);
- RExC_parse = p;
+ Set_Node_Length(REGNODE_p(ret), p - parse_start - 1);
+ RExC_parse = p;
+
{
/* len is STRLEN which is unsigned, need to copy to signed */
IV iv = len;
/* Position parse to next real character */
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
FALSE /* Don't force to /x */ );
- if ( PASS2 && *RExC_parse == '{'
+ if ( *RExC_parse == '{'
&& OP(REGNODE_p(ret)) != SBOL && ! regcurly(RExC_parse))
{
if (RExC_strict || new_regcurly(RExC_parse, RExC_end)) {
* In b) there may be errors or warnings generated. If 'check_only' is
* TRUE, then any errors are discarded. Warnings are returned to the
* caller via an AV* created into '*posix_warnings' if it is not NULL. If
- * instead it is NULL, warnings are suppressed. This is done in all
- * passes. The reason for this is that the rest of the parsing is heavily
- * dependent on whether this routine found a valid posix class or not. If
- * it did, the closing ']' is absorbed as part of the class. If no class,
- * or an invalid one is found, any ']' will be considered the terminator of
- * the outer bracketed character class, leading to very different results.
- * In particular, a '(?[ ])' construct will likely have a syntax error if
- * the class is parsed other than intended, and this will happen in pass1,
- * before the warnings would normally be output. This mechanism allows the
- * caller to output those warnings in pass1 just before dieing, giving a
- * much better clue as to what is wrong.
+ * instead it is NULL, warnings are suppressed.
*
* The reason for this function, and its complexity is that a bracketed
* character class can contain just about anything. But it's easy to
const bool save_fold = FOLD; /* Temporary */
char *save_end, *save_parse; /* Temporaries */
const bool in_locale = LOC; /* we turn off /l during processing */
- AV* posix_warnings = NULL;
GET_RE_DEBUG_FLAGS_DECL;
* compile time values are valid in all runtime cases */
REQUIRE_UNI_RULES(flagp, 0);
- /* This will return only an ANYOF regnode, or (unlikely) something smaller
- * (such as EXACT). Thus we can skip most everything if just sizing. We
- * call regclass to handle '[]' so as to not have to reinvent its parsing
- * rules here (throwing away the size it computes each time). And, we exit
- * upon an unescaped ']' that isn't one ending a regclass. To do both
- * these things, we need to realize that something preceded by a backslash
- * is escaped, so we have to keep track of backslashes */
- if (SIZE_ONLY) {
- UV nest_depth = 0; /* how many nested (?[...]) constructs */
-
- while (RExC_parse < RExC_end) {
- SV* current = NULL;
-
- skip_to_be_ignored_text(pRExC_state, &RExC_parse,
- TRUE /* Force /x */ );
-
- switch (*RExC_parse) {
- case '(':
- if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
- nest_depth++, RExC_parse+=2;
- /* FALLTHROUGH */
- default:
- break;
- case '\\':
- /* Skip past this, so the next character gets skipped, after
- * the switch */
- RExC_parse++;
- if (*RExC_parse == 'c') {
- /* Skip the \cX notation for control characters */
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- }
- break;
-
- case '[':
- {
- /* See if this is a [:posix:] class. */
- bool is_posix_class = (OOB_NAMEDCLASS
- < handle_possible_posix(pRExC_state,
- RExC_parse + 1,
- NULL,
- NULL,
- TRUE /* checking only */));
- /* If it is a posix class, leave the parse pointer at the
- * '[' to fool regclass() into thinking it is part of a
- * '[[:posix:]]'. */
- if (! is_posix_class) {
- RExC_parse++;
- }
-
- /* regclass() can only return RESTART_PARSE and NEED_UTF8
- * if multi-char folds are allowed. */
- if (!regclass(pRExC_state, flagp, depth+1,
- is_posix_class, /* parse the whole char
- class only if not a
- posix class */
- FALSE, /* don't allow multi-char folds */
- TRUE, /* silence non-portable warnings. */
- TRUE, /* strict */
- FALSE, /* Require return to be an ANYOF */
- &current,
- &posix_warnings
- ))
- FAIL2("panic: regclass returned failure to handle_sets, "
- "flags=%#" UVxf, (UV) *flagp);
-
- /* function call leaves parse pointing to the ']', except
- * if we faked it */
- if (is_posix_class) {
- RExC_parse--;
- }
-
- SvREFCNT_dec(current); /* In case it returned something */
- break;
- }
-
- case ']':
- if (RExC_parse[1] == ')') {
- RExC_parse++;
- if (nest_depth--) break;
- node = reganode(pRExC_state, ANYOF, 0);
- nextchar(pRExC_state);
- Set_Node_Length(REGNODE_p(node),
- RExC_parse - oregcomp_parse + 1); /* MJD */
- if (in_locale) {
- set_regex_charset(&RExC_flags, REGEX_LOCALE_CHARSET);
- }
-
- return node;
- }
- /* We output the messages even if warnings are off, because we'll fail
- * the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
- }
- RExC_parse++;
- vFAIL("Unexpected ']' with no following ')' in (?[...");
- }
-
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- }
-
- /* We output the messages even if warnings are off, because we'll fail
- * the very next thing, and these give a likely diagnosis for that */
- if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
- }
-
- vFAIL("Syntax error in (?[...])");
- }
-
- /* Pass 2 only after this. */
ckWARNexperimental(RExC_parse,
WARN_EXPERIMENTAL__REGEX_SETS,
"The regex_sets feature is experimental");
skip_to_be_ignored_text(pRExC_state, &RExC_parse,
TRUE /* Force /x */ );
- if (RExC_parse >= RExC_end) {
- Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
+ if (RExC_parse >= RExC_end) { /* Fail */
+ break;
}
curchar = UCHARAT(RExC_parse);
if (UCHARAT(RExC_parse) != ')')
vFAIL("Expecting close paren for wrapper for nested extended charclass");
- RExC_parse++;
RExC_flags = save_flags;
goto handle_operand;
}
FALSE, /* don't silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- &current,
- NULL))
+ &current))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
TRUE, /* silence non-portable warnings. */
TRUE, /* strict */
FALSE, /* Require return to be an ANYOF */
- &current,
- NULL
- ))
+ &current))
{
FAIL2("panic: regclass returned failure to handle_sets, "
"flags=%#" UVxf, (UV) *flagp);
}
+ if (! current) {
+ break;
+ }
+
/* function call leaves parse pointing to the ']', except if we
* faked it */
if (is_posix_class) {
case ')':
if (av_tindex_skip_len_mg(fence_stack) < 0) {
+ if (UCHARAT(RExC_parse - 1) == ']') {
+ break;
+ }
RExC_parse++;
vFAIL("Unexpected ')'");
}
default:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
+ if (RExC_parse >= RExC_end) {
+ break;
+ }
vFAIL("Unexpected character");
handle_operand:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
} /* End of loop parsing through the construct */
+ vFAIL("Syntax error in (?[...])");
+
done:
+
+ if (RExC_parse >= RExC_end || RExC_parse[1] != ')') {
+ if (RExC_parse < RExC_end) {
+ RExC_parse++;
+ }
+
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
+ }
+
if (av_tindex_skip_len_mg(fence_stack) >= 0) {
vFAIL("Unmatched (");
}
RExC_parse = SvPV(result_string, len);
save_end = RExC_end;
RExC_end = RExC_parse + len;
+ TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE;
/* We turn off folding around the call, as the class we have constructed
* already has all folding taken into consideration, and we don't want
they're valid on this machine */
FALSE, /* similarly, no need for strict */
FALSE, /* Require return to be an ANYOF */
- NULL,
NULL
);
+ RESTORE_WARNINGS;
RExC_parse = save_parse + 1;
RExC_end = save_end;
SvREFCNT_dec_NN(final);
}
STATIC void
-S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings)
+S_output_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings)
{
- /* If the final parameter is NULL, output the elements of the array given
- * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are
- * pushed onto it, (creating if necessary) */
+ /* Output the elements of the array given by '*posix_warnings' as REGEXP
+ * warnings. Nothing is output, and the array is left untouched, when
+ * TO_OUTPUT_WARNINGS(RExC_parse) is false. Otherwise each element is
+ * shifted off the array in turn and handed to Perl_warner() under the
+ * WARN_REGEXP category. Once the array has been drained,
+ * UPDATE_WARNINGS_LOC(RExC_parse) is invoked. */
 SV * msg;
- const bool first_is_fatal = ! return_posix_warnings
- && ckDEAD(packWARN(WARN_REGEXP));
+ const bool first_is_fatal = ckDEAD(packWARN(WARN_REGEXP));
+
+ PERL_ARGS_ASSERT_OUTPUT_POSIX_WARNINGS;
- PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS;
+ if (! TO_OUTPUT_WARNINGS(RExC_parse)) {
+ return;
+ }
 while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) {
- if (return_posix_warnings) {
- if (! *return_posix_warnings) { /* mortalize to not leak if
- warnings are fatal */
- *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV());
- }
- av_push(*return_posix_warnings, msg);
- }
- else {
- if (first_is_fatal) { /* Avoid leaking this */
- av_undef(posix_warnings); /* This isn't necessary if the
- array is mortal, but is a
- fail-safe */
- (void) sv_2mortal(msg);
- if (PASS2) {
- SAVEFREESV(RExC_rx_sv);
- }
- }
- Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
- SvREFCNT_dec_NN(msg);
+ if (first_is_fatal) { /* Avoid leaking this */
+ av_undef(posix_warnings); /* This isn't necessary if the
+ array is mortal, but is a
+ fail-safe */
+ (void) sv_2mortal(msg);
+ PREPARE_TO_DIE;
 }
+ Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg));
+ SvREFCNT_dec_NN(msg);
 }
+
+ UPDATE_WARNINGS_LOC(RExC_parse);
}
STATIC AV *
const bool strict,
bool optimizable, /* ? Allow a non-ANYOF return
node */
- SV** ret_invlist, /* Return an inversion list, not a node */
- AV** return_posix_warnings
+ SV** ret_invlist /* Return an inversion list, not a node */
)
{
/* parse a bracketed class specification. Most of these will produce an
* On success, returns the offset at which any next node should be placed
* into the regex engine program being compiled.
*
- * Returns 0 otherwise, setting flagp to RESTART_PARSE if the sizing scan needs
+ * Returns 0 otherwise, setting flagp to RESTART_PARSE if the parse needs
* to be restarted, or'd with NEED_UTF8 if the pattern needs to be upgraded to
* UTF-8
*/
bool warn_super = ALWAYS_WARN_SUPER;
- const regnode_offset orig_emit = RExC_emit; /* Save the original RExC_emit in
- case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
- const SSize_t orig_size = RExC_size;
bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
/* This variable is used to mark where the end in the input is of something
char *not_posix_region_end = RExC_parse - 1;
AV* posix_warnings = NULL;
- const bool do_posix_warnings = return_posix_warnings
- || (PASS2 && ckWARN(WARN_REGEXP));
+ const bool do_posix_warnings = ckWARN(WARN_REGEXP);
U8 op = END; /* The returned node-type, initialized to an impossible
one. */
U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */
U32 posixl = 0; /* bit field of posix classes matched under /l */
+ bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */
GET_RE_DEBUG_FLAGS_DECL;
PERL_UNUSED_ARG(depth);
#endif
+
+ /* If wants an inversion list returned, we can't optimize to something
+ * else. */
+ if (ret_invlist) {
+ optimizable = FALSE;
+ }
+
DEBUG_PARSE("clas");
#if UNICODE_MAJOR_VERSION < 3 /* no multifolds in early Unicode */ \
allow_multi_folds = FALSE;
#endif
- if (SIZE_ONLY) {
- listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
- }
- else {
- listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
- initial_listsv_len = SvCUR(listsv);
- SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
- }
+ listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
+ initial_listsv_len = SvCUR(listsv);
+ SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
SKIP_BRACKETED_WHITE_SPACE(skip_white, RExC_parse);
&not_posix_region_end,
NULL,
TRUE /* checking only */);
- if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
- SAVEFREESV(RExC_rx_sv);
+ if (maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) {
ckWARN4reg(not_posix_region_end,
"POSIX syntax [%c %c] belongs inside character classes%s",
*RExC_parse, *RExC_parse,
: " (but this one isn't fully valid)")
: ""
);
- (void)ReREFCNT_inc(RExC_rx_sv);
}
}
{
/* Warnings about posix class issues are considered tentative until
* we are far enough along in the parse that we can no longer
- * change our mind, at which point we either output them or add
- * them, if it has so specified, to what gets returned to the
- * caller. This is done each time through the loop so that a later
- * class won't zap them before they have been dealt with. */
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ * change our mind, at which point we output them. This is done
+ * each time through the loop so that a later class won't zap them
+ * before they have been dealt with. */
+ output_posix_warnings(pRExC_state, posix_warnings);
}
if (RExC_parse >= stop_ptr) {
RExC_parse--;
vFAIL("\\N{} in inverted character class or as a range end-point is restricted to one character");
}
- else {
- ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
- }
+ ckWARNreg(RExC_parse, "Using just the first character returned by \\N{} in character class");
break; /* <value> contains the first code
point. Drop out of the switch to
process it */
|_CORE_SWASH_INIT_ACCEPT_INVLIST;
SvREFCNT_dec(swash); /* Free any left-overs */
+
+ /* \p means they want Unicode semantics */
+ REQUIRE_UNI_RULES(flagp, 0);
+
if (RExC_parse >= RExC_end)
vFAIL2("Empty \\%c", (U8)value);
if (*RExC_parse == '{') {
e = RExC_parse;
n = 1;
}
- if (!SIZE_ONLY) {
+ {
char* name = RExC_parse;
char* base_name; /* name after any packages are stripped */
char* lookup_name = NULL;
}
}
}
- } /* End of actually getting the values in pass 2 */
+ }
RExC_parse = e + 1;
namedclass = ANYOF_UNIPROP; /* no official name, but it's
named */
-
- /* \p means they want Unicode semantics */
- REQUIRE_UNI_RULES(flagp, 0);
}
break;
case 'n': value = '\n'; break;
RExC_end,
&value,
&error_msg,
- PASS2, /* warnings only in
- pass 2 */
+ TO_OUTPUT_WARNINGS(RExC_parse),
strict,
silence_non_portable,
UTF);
if (! valid) {
vFAIL(error_msg);
}
+ UPDATE_WARNINGS_LOC(RExC_parse - 1);
}
non_portable_endpoint++;
break;
RExC_end,
&value,
&error_msg,
- PASS2, /* Output warnings */
+ TO_OUTPUT_WARNINGS(RExC_parse),
strict,
silence_non_portable,
UTF);
if (! valid) {
vFAIL(error_msg);
}
+ UPDATE_WARNINGS_LOC(RExC_parse - 1);
}
non_portable_endpoint++;
break;
case 'c':
- value = grok_bslash_c(*RExC_parse++, PASS2);
+ value = grok_bslash_c(*RExC_parse, TO_OUTPUT_WARNINGS(RExC_parse));
+ UPDATE_WARNINGS_LOC(RExC_parse);
+ RExC_parse++;
non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
vFAIL("Need exactly 3 octal digits");
}
- else if (! SIZE_ONLY /* like \08, \178 */
- && numlen < 3
+ else if ( numlen < 3 /* like \08, \178 */
&& RExC_parse < RExC_end
&& isDIGIT(*RExC_parse)
&& ckWARN(WARN_REGEXP))
{
- SAVEFREESV(RExC_rx_sv);
reg_warn_non_literal_string(
RExC_parse + 1,
form_short_octal_warning(RExC_parse, numlen));
- (void)ReREFCNT_inc(RExC_rx_sv);
}
}
non_portable_endpoint++;
}
default:
/* Allow \_ to not give an error */
- if (!SIZE_ONLY && isWORDCHAR(value) && value != '_') {
+ if (isWORDCHAR(value) && value != '_') {
if (strict) {
vFAIL2("Unrecognized escape \\%c in character class",
(int)value);
}
else {
- SAVEFREESV(RExC_rx_sv);
ckWARN2reg(RExC_parse,
"Unrecognized escape \\%c in character class passed through",
(int)value);
- (void)ReREFCNT_inc(RExC_rx_sv);
}
}
break;
* literal, as is the character that began the false range, i.e.
* the 'a' in the examples */
if (range) {
- if (!SIZE_ONLY) {
- const int w = (RExC_parse >= rangebegin)
- ? RExC_parse - rangebegin
- : 0;
- if (strict) {
- vFAIL2utf8f(
- "False [] range \"%" UTF8f "\"",
- UTF8fARG(UTF, w, rangebegin));
- }
- else {
- SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */
- ckWARN2reg(RExC_parse,
- "False [] range \"%" UTF8f "\"",
- UTF8fARG(UTF, w, rangebegin));
- (void)ReREFCNT_inc(RExC_rx_sv);
- cp_list = add_cp_to_invlist(cp_list, '-');
- cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
- prevvalue);
- }
- }
+ const int w = (RExC_parse >= rangebegin)
+ ? RExC_parse - rangebegin
+ : 0;
+ if (strict) {
+ vFAIL2utf8f(
+ "False [] range \"%" UTF8f "\"",
+ UTF8fARG(UTF, w, rangebegin));
+ }
+ else {
+ ckWARN2reg(RExC_parse,
+ "False [] range \"%" UTF8f "\"",
+ UTF8fARG(UTF, w, rangebegin));
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list,
+ prevvalue);
+ }
range = 0; /* this was not a true range */
element_count += 2; /* So counts for three values */
&& classnum != _CC_ASCII
#endif
) {
+ SV* scratch_list = NULL;
+
/* What the Posix classes (like \w, [:space:]) match in locale
* isn't knowable under locale until actual match time. Room
* must be reserved (one time per outer bracketed class) to
POSIXL_SET(posixl, namedclass);
/* The above-Latin1 characters are not subject to locale rules.
- * Just add them, in the second pass, to the
- * unconditionally-matched list */
- if (! SIZE_ONLY) {
- SV* scratch_list = NULL;
-
- /* Get the list of the above-Latin1 code points this
- * matches */
- _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
- PL_XPosix_ptrs[classnum],
-
- /* Odd numbers are complements, like
- * NDIGIT, NASCII, ... */
- namedclass % 2 != 0,
- &scratch_list);
- /* Checking if 'cp_list' is NULL first saves an extra
- * clone. Its reference count will be decremented at the
- * next union, etc, or if this is the only instance, at the
- * end of the routine */
- if (! cp_list) {
- cp_list = scratch_list;
- }
- else {
- _invlist_union(cp_list, scratch_list, &cp_list);
- SvREFCNT_dec_NN(scratch_list);
- }
- continue; /* Go get next character */
+ * Just add them to the unconditionally-matched list */
+
+ /* Get the list of the above-Latin1 code points this matches */
+ _invlist_intersection_maybe_complement_2nd(PL_AboveLatin1,
+ PL_XPosix_ptrs[classnum],
+
+ /* Odd numbers are complements, like
+ * NDIGIT, NASCII, ... */
+ namedclass % 2 != 0,
+ &scratch_list);
+ /* Checking if 'cp_list' is NULL first saves an extra clone.
+ * Its reference count will be decremented at the next union,
+ * etc, or if this is the only instance, at the end of the
+ * routine */
+ if (! cp_list) {
+ cp_list = scratch_list;
}
+ else {
+ _invlist_union(cp_list, scratch_list, &cp_list);
+ SvREFCNT_dec_NN(scratch_list);
+ }
+ continue; /* Go get next character */
}
- else if (! SIZE_ONLY) {
+ else {
- /* Here, not in pass1 (in that pass we skip calculating the
- * contents of this class), and is not /l, or is a POSIX class
- * for which /l doesn't matter (or is a Unicode property, which
- * is skipped here). */
+ /* Here, is not /l, or is a POSIX class for which /l doesn't
+ * matter (or is a Unicode property, which is skipped here). */
if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */
if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */
&cp_list);
}
}
- else if ( UNI_SEMANTICS
+ else if ( RExC_uni_semantics
|| AT_LEAST_ASCII_RESTRICTED
|| classnum == _CC_ASCII
|| (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT
/* a bad range like \w-, [:word:]- ? */
if (namedclass > OOB_NAMEDCLASS) {
- if (strict || (PASS2 && ckWARN(WARN_REGEXP))) {
+ if (strict || ckWARN(WARN_REGEXP)) {
const int w = RExC_parse >= rangebegin
? RExC_parse - rangebegin
: 0;
w, w, rangebegin);
}
}
- if (!SIZE_ONLY) {
- cp_list = add_cp_to_invlist(cp_list, '-');
- }
+ cp_list = add_cp_to_invlist(cp_list, '-');
element_count++;
} else
range = 1; /* yeah, it's a range! */
* <prevvalue> is the beginning of the range, if any; or <value> if
* not. */
- /* non-Latin1 code point implies unicode semantics. Must be set in
- * pass1 so is there for the whole of pass 2 */
+ /* non-Latin1 code point implies unicode semantics. */
if (value > 255) {
REQUIRE_UNI_RULES(flagp, 0);
}
}
}
- if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+ if (strict && ckWARN(WARN_REGEXP)) {
if (range) {
/* If the range starts above 255, everything is portable and
}
/* Deal with this element of the class */
- if (! SIZE_ONLY) {
#ifndef EBCDIC
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- prevvalue, value);
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
#else
- /* On non-ASCII platforms, for ranges that span all of 0..255, and
- * ones that don't require special handling, we can just add the
- * range like we do for ASCII platforms */
- if ((UNLIKELY(prevvalue == 0) && value >= 255)
- || ! (prevvalue < 256
- && (unicode_range
- || (! non_portable_endpoint
- && ((isLOWER_A(prevvalue) && isLOWER_A(value))
- || (isUPPER_A(prevvalue)
- && isUPPER_A(value)))))))
- {
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- prevvalue, value);
+ /* On non-ASCII platforms, for ranges that span all of 0..255, and ones
+ * that don't require special handling, we can just add the range like
+ * we do for ASCII platforms */
+ if ((UNLIKELY(prevvalue == 0) && value >= 255)
+ || ! (prevvalue < 256
+ && (unicode_range
+ || (! non_portable_endpoint
+ && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue)
+ && isUPPER_A(value)))))))
+ {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
+ }
+ else {
+ /* Here, requires special handling. This can be because it is a
+ * range whose code points are considered to be Unicode, and so
+ * must be individually translated into native, or because it's a
+ * subrange of 'A-Z' or 'a-z' which each aren't contiguous in
+ * EBCDIC, but we have defined them to include only the "expected"
+ * upper or lower case ASCII alphabetics. Subranges above 255 are
+ * the same in native and Unicode, so can be added as a range */
+ U8 start = NATIVE_TO_LATIN1(prevvalue);
+ unsigned j;
+ U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
+ for (j = start; j <= end; j++) {
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
}
- else {
- /* Here, requires special handling. This can be because it is
- * a range whose code points are considered to be Unicode, and
- * so must be individually translated into native, or because
- * its a subrange of 'A-Z' or 'a-z' which each aren't
- * contiguous in EBCDIC, but we have defined them to include
- * only the "expected" upper or lower case ASCII alphabetics.
- * Subranges above 255 are the same in native and Unicode, so
- * can be added as a range */
- U8 start = NATIVE_TO_LATIN1(prevvalue);
- unsigned j;
- U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
- for (j = start; j <= end; j++) {
- cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
- }
- if (value > 255) {
- cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
- 256, value);
- }
+ if (value > 255) {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ 256, value);
}
-#endif
}
+#endif
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
-
if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
- output_or_return_posix_warnings(pRExC_state, posix_warnings,
- return_posix_warnings);
+ output_posix_warnings(pRExC_state, posix_warnings);
}
/* If anything in the class expands to more than one character, we have to
return ret;
}
- /* Here, we've gone through the entire class and dealt with multi-char
- * folds. We are now in a position that we can do some checks to see if we
- * can optimize this ANYOF node into a simpler one, even in Pass 1.
- * Currently we only do two checks:
- * 1) is in the unlikely event that the user has specified both, eg. \w and
- * \W under /l, then the class matches everything. (This optimization
- * is done only to make the optimizer code run later work.)
- * 2) if the character class contains only a single element (including a
- * single range), we see if there is an equivalent node for it.
- * Other checks are possible */
- if ( optimizable
- && ! ret_invlist /* Can't optimize if returning the constructed
- inversion list */
- && (UNLIKELY(posixl_matches_all) || element_count == 1))
- {
- U8 op = END;
- U8 arg = 0;
-
- if (UNLIKELY(posixl_matches_all)) {
- op = SANY;
- }
- else if (namedclass > OOB_NAMEDCLASS) { /* this is a single named
- class, like \w or [:digit:]
- or \p{foo} */
-
- /* All named classes are mapped into POSIXish nodes, with its FLAG
- * argument giving which class it is */
- switch ((I32)namedclass) {
- case ANYOF_UNIPROP:
- break;
-
- /* These don't depend on the charset modifiers. They always
- * match under /u rules */
- case ANYOF_NHORIZWS:
- case ANYOF_HORIZWS:
- namedclass = ANYOF_BLANK + namedclass - ANYOF_HORIZWS;
- /* FALLTHROUGH */
-
- case ANYOF_NVERTWS:
- case ANYOF_VERTWS:
- op = POSIXU;
- goto join_posix;
-
- /* The actual POSIXish node for all the rest depends on the
- * charset modifier. The ones in the first set depend only on
- * ASCII or, if available on this platform, also locale */
-
- case ANYOF_ASCII:
- case ANYOF_NASCII:
-
-#ifdef HAS_ISASCII
- if (LOC) {
- op = POSIXL;
- goto join_posix;
- }
-#endif
- /* (named_class - ANYOF_ASCII) is 0 or 1. xor'ing with
- * invert converts that to 1 or 0 */
- op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
- break;
-
- /* The following don't have any matches in the upper Latin1
- * range, hence /d is equivalent to /u for them. Making it /u
- * saves some branches at runtime */
- case ANYOF_DIGIT:
- case ANYOF_NDIGIT:
- case ANYOF_XDIGIT:
- case ANYOF_NXDIGIT:
- if (! DEPENDS_SEMANTICS) {
- goto treat_as_default;
- }
-
- op = POSIXU;
- goto join_posix;
-
- /* The following change to CASED under /i */
- case ANYOF_LOWER:
- case ANYOF_NLOWER:
- case ANYOF_UPPER:
- case ANYOF_NUPPER:
- if (FOLD) {
- namedclass = ANYOF_CASED + (namedclass % 2);
- }
- /* FALLTHROUGH */
-
- /* The rest have more possibilities depending on the charset.
- * We take advantage of the enum ordering of the charset
- * modifiers to get the exact node type, */
- default:
- treat_as_default:
- op = POSIXD + get_regex_charset(RExC_flags);
- if (op > POSIXA) { /* /aa is same as /a */
- op = POSIXA;
- }
-
- join_posix:
- /* The odd numbered ones are the complements of the
- * next-lower even number one */
- if (namedclass % 2 == 1) {
- invert = ! invert;
- namedclass--;
- }
- arg = namedclass_to_classnum(namedclass);
- break;
- }
- }
- else if (value == prevvalue) {
-
- /* Here, the class consists of just a single code point */
-
- if (invert) {
- if (! LOC && value == '\n') {
- op = REG_ANY; /* Optimize [^\n] */
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
- }
- }
- else if (value < 256 || UTF) {
-
- /* Optimize a single value into an EXACTish node, but not if it
- * would require converting the pattern to UTF-8. */
- op = compute_EXACTish(pRExC_state);
- }
- } /* Otherwise is a range */
- else if (! LOC) { /* locale could vary these */
- if (prevvalue == '0') {
- if (value == '9') {
- arg = _CC_DIGIT;
- op = POSIXA;
- }
- }
- else if (! FOLD || ASCII_FOLD_RESTRICTED) {
- /* We can optimize A-Z or a-z, but not if they could match
- * something like the KELVIN SIGN under /i. */
- if (prevvalue == 'A') {
- if (value == 'Z'
-#ifdef EBCDIC
- && ! non_portable_endpoint
-#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
- op = POSIXA;
- }
- }
- else if (prevvalue == 'a') {
- if (value == 'z'
-#ifdef EBCDIC
- && ! non_portable_endpoint
-#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
- op = POSIXA;
- }
- }
- }
- }
-
- /* Here, we have changed <op> away from its initial value iff we found
- * an optimization */
- if (op != END) {
-
- /* Emit the calculated regnode,
- * which should correspond to the beginning, not current, state of
- * the parse */
- const char * cur_parse = RExC_parse;
- RExC_parse = (char *)orig_parse;
- if (PL_regkind[op] == POSIXD) {
- if (op == POSIXL) {
- RExC_contains_locale = 1;
- }
- if (invert) {
- op += NPOSIXD - POSIXD;
- }
- }
-
- ret = reg_node(pRExC_state, op);
-
- if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
- if (! SIZE_ONLY) {
- FLAGS(REGNODE_p(ret)) = arg;
- }
- *flagp |= HASWIDTH|SIMPLE;
- }
- else if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
- TRUE /* downgradable to EXACT */
- );
- }
- else {
- *flagp |= HASWIDTH|SIMPLE;
- }
-
- RExC_parse = (char *) cur_parse;
-
- SvREFCNT_dec(posixes);
- SvREFCNT_dec(nposixes);
- SvREFCNT_dec(simple_posixes);
- SvREFCNT_dec(cp_list);
- SvREFCNT_dec(cp_foldable_list);
- return ret;
- }
- }
-
- /* Assume we are going to generate an ANYOF-type node. */
- op = (posixl)
- ? ANYOFPOSIXL
- : (LOC)
- ? ANYOFL
- : ANYOF;
- ret = reganode(pRExC_state, op, 0);
-
- if (SIZE_ONLY) {
- return ret;
- }
-
- /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
-
- ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
- if (posixl) {
- ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
- }
-
/* If folding, we calculate all characters that could fold to or from the
* ones already on the list */
if (cp_foldable_list) {
_invlist_subtract(only_non_utf8_list, cp_list,
&only_non_utf8_list);
if (_invlist_len(only_non_utf8_list) != 0) {
- ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ anyof_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
}
SvREFCNT_dec_NN(only_non_utf8_list);
}
}
if (warn_super) {
- ANYOF_FLAGS(REGNODE_p(ret))
+ anyof_flags
|= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
/* Because an ANYOF node is the only one that warns, this node
}
}
if (only_utf8_locale_list) {
- ANYOF_FLAGS(REGNODE_p(ret))
- |= ANYOFL_FOLD
- |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
+ anyof_flags
+ |= ANYOFL_FOLD
+ | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
}
else if (cp_list) { /* Look to see if a 0-255 code point is in list */
UV start, end;
invlist_iterinit(cp_list);
if (invlist_iternext(cp_list, &start, &end) && start < 256) {
- ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOFL_FOLD;
+ anyof_flags |= ANYOFL_FOLD;
}
invlist_iterfinish(cp_list);
}
}
else if ( DEPENDS_SEMANTICS
&& ( has_upper_latin1_only_utf8_matches
- || (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
+ || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
{
- OP(REGNODE_p(ret)) = ANYOFD;
+ use_anyofd = TRUE;
+ RExC_seen_d_op = TRUE;
optimizable = FALSE;
}
-
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
if ( cp_list
&& invert
- && OP(REGNODE_p(ret)) != ANYOFD
- && ! (ANYOF_FLAGS(REGNODE_p(ret)) & (ANYOF_LOCALE_FLAGS))
+ && ! use_anyofd
+ && ! (anyof_flags & (ANYOF_LOCALE_FLAGS))
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
_invlist_invert(cp_list);
}
if (ret_invlist) {
- assert(cp_list);
-
*ret_invlist = cp_list;
SvREFCNT_dec(swash);
- /* Discard the generated node */
- if (SIZE_ONLY) {
- RExC_size = orig_size;
- }
- else {
- RExC_emit = orig_emit;
- }
- return orig_emit;
+ return RExC_emit;
}
/* Some character classes are equivalent to other nodes. Such nodes take
* up less room and generally fewer operations to execute than ANYOF nodes.
- * Above, we checked for and optimized into some such equivalents for
- * certain common classes that are easy to test. Getting to this point in
- * the code means that the class didn't get optimized there. Since this
- * code is only executed in Pass 2, it is too late to save space--it has
- * been allocated in Pass 1, and currently isn't given back. XXX Why not?
- * But turning things into an EXACTish node can allow the optimizer to join
- * it to any adjacent such nodes. And if the class is equivalent to things
- * like /./, expensive run-time swashes can be avoided. Now that we have
- * more complete information, we can find things necessarily missed by the
- * earlier code. */
-
- if (optimizable && cp_list && ! invert) {
- UV start, end;
- U8 op = END; /* The optimzation node-type */
+ * */
+
+ if (optimizable) {
int posix_class = -1; /* Illegal value */
- const char * cur_parse= RExC_parse;
U8 ANYOFM_mask = 0xFF;
U32 anode_arg = 0;
+ UV start, end;
- invlist_iterinit(cp_list);
- if (! invlist_iternext(cp_list, &start, &end)) {
-
- /* Here, the list is empty. This happens, for example, when a
- * Unicode property that doesn't match anything is the only element
- * in the character class (perluniprops.pod notes such properties).
- * */
- op = OPFAIL;
- *flagp |= HASWIDTH|SIMPLE;
+ if (UNLIKELY(posixl_matches_all)) {
+ op = SANY;
}
- else if (start == end) { /* The range is a single code point */
- if (! invlist_iternext(cp_list, &start, &end)
+ else if (cp_list && ! invert) {
- /* Don't do this optimization if it would require changing
- * the pattern to UTF-8 */
- && (start < 256 || UTF))
- {
- /* Here, the list contains a single code point. Can optimize
- * into an EXACTish node */
+ invlist_iterinit(cp_list);
+ if (! invlist_iternext(cp_list, &start, &end)) {
- value = start;
+ /* Here, the list is empty. This happens, for example, when a
+ * Unicode property that doesn't match anything is the only
+ * element in the character class (perluniprops.pod notes such
+ * properties). */
+ op = OPFAIL;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (start == end) { /* The range is a single code point */
+ if (! invlist_iternext(cp_list, &start, &end)
- if (! FOLD) {
- op = (LOC)
- ? EXACTL
- : EXACT;
- }
- else if (LOC) {
+ /* Don't do this optimization if it would require
+ * changing the pattern to UTF-8 */
+ && (start < 256 || UTF))
+ {
+ /* Here, the list contains a single code point. Can
+ * optimize into an EXACTish node */
- /* A locale node under folding with one code point can be
- * an EXACTFL, as its fold won't be calculated until
- * runtime */
- op = EXACTFL;
- }
- else {
+ value = start;
- /* Here, we are generally folding, but there is only one
- * code point to match. If we have to, we use an EXACT
- * node, but it would be better for joining with adjacent
- * nodes in the optimization pass if we used the same
- * EXACTFish node that any such are likely to be. We can
- * do this iff the code point doesn't participate in any
- * folds. For example, an EXACTF of a colon is the same as
- * an EXACT one, since nothing folds to or from a colon. */
- if (value < 256) {
- if (IS_IN_SOME_FOLD_L1(value)) {
- op = EXACT;
- }
+ if (! FOLD) {
+ op = (LOC)
+ ? EXACTL
+ : EXACT;
+ }
+ else if (LOC) {
+
+ /* A locale node under folding with one code point can
+ * be an EXACTFL, as its fold won't be calculated until
+ * runtime */
+ op = EXACTFL;
}
else {
- if (_invlist_contains_cp(PL_utf8_foldable, value)) {
- op = EXACT;
+
+ /* Here, we are generally folding, but there is only
+ * one code point to match. If we have to, we use an
+ * EXACT node, but it would be better for joining with
+ * adjacent nodes in the optimization phase if we used
+ * the same EXACTFish node that any such are likely to
+ * be. We can do this iff the code point doesn't
+ * participate in any folds. For example, an EXACTF of
+ * a colon is the same as an EXACT one, since nothing
+ * folds to or from a colon. */
+ if (value < 256) {
+ if (IS_IN_SOME_FOLD_L1(value)) {
+ op = EXACT;
+ }
+ }
+ else {
+ if (_invlist_contains_cp(PL_utf8_foldable, value)) {
+ op = EXACT;
+ }
}
- }
- /* If we haven't found the node type, above, it means we
- * can use the prevailing one */
- if (op == END) {
- op = compute_EXACTish(pRExC_state);
+ /* If we haven't found the node type, above, it means
+ * we can use the prevailing one */
+ if (op == END) {
+ op = compute_EXACTish(pRExC_state);
+ }
}
}
+ } /* End of first range contains just a single code point */
+ else if (start == 0) {
+ if (end == UV_MAX) {
+ op = SANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ MARK_NAUGHTY(1);
+ }
+ else if (end == '\n' - 1
+ && invlist_iternext(cp_list, &start, &end)
+ && start == '\n' + 1 && end == UV_MAX)
+ {
+ op = REG_ANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ MARK_NAUGHTY(1);
+ }
}
- } /* End of first range contains just a single code point */
- else if (start == 0) {
- if (end == UV_MAX) {
- op = SANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
- }
- else if (end == '\n' - 1
- && invlist_iternext(cp_list, &start, &end)
- && start == '\n' + 1 && end == UV_MAX)
- {
- op = REG_ANY;
- *flagp |= HASWIDTH|SIMPLE;
- MARK_NAUGHTY(1);
- }
- }
- invlist_iterfinish(cp_list);
+ invlist_iterfinish(cp_list);
- if (op == END) {
+ if (op == END) {
- /* Here, didn't find an optimization. See if this matches any of
- * the POSIX classes. First try ASCII */
+ /* Here, didn't find an optimization. See if this matches any
+ * of the POSIX classes. First try ASCII */
- if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
- op = ASCII;
- *flagp |= HASWIDTH|SIMPLE;
- }
- else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
- op = NASCII;
- *flagp |= HASWIDTH|SIMPLE;
- }
- else if (invlist_highest(cp_list) >= 0x2029) {
-
- /* Then try the other POSIX classes. The POSIXA ones are about
- * the same speed as ANYOF ops, but the ones that have
- * above-Latin1 code point matches are somewhat faster than
- * ANYOF. So optimize those, but don't bother with the POSIXA
- * ones nor [:cntrl:] which has no above-Latin1 matches. If
- * this ANYOF node has a lower highest possible matching code
- * point than any of the XPosix ones, we know that it can't
- * possibly be the same as any of them, so we can avoid
- * executing this code. The 0x2029 above for the lowest max
- * was determined by manual inspection of the classes, and
- * comes from \v. Suppose Unicode in a later version adds a
- * higher code point to \v. All that means is that this code
- * can be executed unnecessarily. It will still give the
- * correct answer. */
-
- for (posix_class = 0;
- posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
- posix_class++)
- {
- int try_inverted;
+ if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
+ op = ASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
+ op = NASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else {
- if (posix_class == _CC_CNTRL) {
- continue;
- }
+ /* Then try the other POSIX classes. The POSIXA ones are
+ * about the same speed as ANYOF ops, but take less room;
+ * the ones that have above-Latin1 code point matches are
+ * somewhat faster than ANYOF. */
- for (try_inverted = 0; try_inverted < 2; try_inverted++) {
+ for (posix_class = 0;
+ posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
+ posix_class++)
+ {
+ int try_inverted;
- /* Check if matches normal or inverted */
- if (_invlistEQ(cp_list,
- PL_XPosix_ptrs[posix_class],
- try_inverted))
+ for (try_inverted = 0; try_inverted < 2; try_inverted++)
{
- op = (try_inverted)
- ? NPOSIXU
- : POSIXU;
- *flagp |= HASWIDTH|SIMPLE;
- goto found_posix;
+
+ /* Check if matches POSIXA, normal or inverted */
+ if (PL_Posix_ptrs[posix_class]) {
+ if (_invlistEQ(cp_list,
+ PL_Posix_ptrs[posix_class],
+ try_inverted))
+ {
+ op = (try_inverted)
+ ? NPOSIXA
+ : POSIXA;
+ *flagp |= HASWIDTH|SIMPLE;
+ goto found_posix;
+ }
+ }
+
+ /* Check if matches POSIXU, normal or inverted */
+ if (_invlistEQ(cp_list,
+ PL_XPosix_ptrs[posix_class],
+ try_inverted))
+ {
+ op = (try_inverted)
+ ? NPOSIXU
+ : POSIXU;
+ *flagp |= HASWIDTH|SIMPLE;
+ goto found_posix;
+ }
}
}
- }
- found_posix: ;
- }
-
- /* If it didn't match a POSIX class, it might be able to be turned
- * into an ANYOFM node. Compare two different bytes, bit-by-bit.
- * In some positions, the bits in each will be 1; and in other
- * positions both will be 0; and in some positions the bit will be
- * 1 in one byte, and 0 in the other. Let 'n' be the number of
- * positions where the bits differ. We create a mask which has
- * exactly 'n' 0 bits, each in a position where the two bytes
- * differ. Now take the set of all bytes that when ANDed with the
- * mask yield the same result. That set has 2**n elements, and is
- * representable by just two 8 bit numbers: the result and the
- * mask. Importantly, matching the set can be vectorized by
- * creating a word full of the result bytes, and a word full of the
- * mask bytes, yielding a significant speed up. Here, see if this
- * node matches such a set. As a concrete example consider [01],
- * and the byte representing '0' which is 0x30 on ASCII machines.
- * It has the bits 0011 0000. Take the mask 1111 1110. If we AND
- * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed
- * yield something else. So [01], which is a common usage, is
- * optimizable into ANYOFM, and can benefit from the speed up. We
- * can only do this on UTF-8 invariant bytes, because the variance
- * would throw this off. */
- if ( op == END
- && invlist_highest(cp_list) <=
+ found_posix: ;
+ }
+
+ /* If it didn't match a POSIX class, it might be able to be
+ * turned into an ANYOFM node. Compare two different bytes,
+ * bit-by-bit. In some positions, the bits in each will be 1;
+ * and in other positions both will be 0; and in some positions
+ * the bit will be 1 in one byte, and 0 in the other. Let 'n'
+ * be the number of positions where the bits differ. We create
+ * a mask which has exactly 'n' 0 bits, each in a position
+ * where the two bytes differ. Now take the set of all bytes
+ * that when ANDed with the mask yield the same result. That
+ * set has 2**n elements, and is representable by just two 8
+ * bit numbers: the result and the mask. Importantly, matching
+ * the set can be vectorized by creating a word full of the
+ * result bytes, and a word full of the mask bytes, yielding a
+ * significant speed up. Here, see if this node matches such a
+ * set. As a concrete example consider [01], and the byte
+ * representing '0' which is 0x30 on ASCII machines. It has
+ * the bits 0011 0000. Take the mask 1111 1110. If we AND
+ * 0x31 and 0x30 with that mask we get 0x30. Any other bytes
+ * ANDed yield something else. So [01], which is a common
+ * usage, is optimizable into ANYOFM, and can benefit from the
+ * speed up. We can only do this on UTF-8 invariant bytes,
+ * because the variance would throw this off. */
+ if (op == END) {
+ PERL_UINT_FAST8_T inverted = 0;
#ifdef EBCDIC
- 0xFF
+ const PERL_UINT_FAST8_T max_permissible = 0xFF;
#else
- 0x7F
+ const PERL_UINT_FAST8_T max_permissible = 0x7F;
#endif
- ) {
- Size_t cp_count = 0;
- bool first_time = TRUE;
- unsigned int lowest_cp = 0xFF;
- U8 bits_differing = 0;
-
- /* Only needed on EBCDIC, as there, variants and non- are mixed
- * together. Could #ifdef it out on ASCII, but probably the
- * compiler will optimize it out */
- bool has_variant = FALSE;
-
- /* Go through the bytes and find the bit positions that differ */
- invlist_iterinit(cp_list);
- while (invlist_iternext(cp_list, &start, &end)) {
- unsigned int i = start;
-
- cp_count += end - start + 1;
-
- if (first_time) {
- if (! UVCHR_IS_INVARIANT(i)) {
- has_variant = TRUE;
- continue;
- }
+ if (invlist_highest(cp_list) > max_permissible) {
+ _invlist_invert(cp_list);
+ inverted = 1;
+ }
- first_time = FALSE;
- lowest_cp = start;
+ if (invlist_highest(cp_list) <= max_permissible) {
+ Size_t cp_count = 0;
+ bool first_time = TRUE;
+ unsigned int lowest_cp = 0xFF;
+ U8 bits_differing = 0;
- i++;
- }
+ /* Only needed on EBCDIC, as there, variants and non- are mixed
+ * together. Could #ifdef it out on ASCII, but probably the
+ * compiler will optimize it out */
+ bool has_variant = FALSE;
- /* Find the bit positions that differ from the lowest code
- * point in the node. Keep track of all such positions by
- * OR'ing */
- for (; i <= end; i++) {
- if (! UVCHR_IS_INVARIANT(i)) {
- has_variant = TRUE;
- continue;
+ /* Go through the bytes and find the bit positions that differ */
+ invlist_iterinit(cp_list);
+ while (invlist_iternext(cp_list, &start, &end)) {
+ unsigned int i = start;
+
+ cp_count += end - start + 1;
+
+ if (first_time) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ first_time = FALSE;
+ lowest_cp = start;
+
+ i++;
}
- bits_differing |= i ^ lowest_cp;
+ /* Find the bit positions that differ from the lowest
+ * code point in the node. Keep track of all such
+ * positions by OR'ing */
+ for (; i <= end; i++) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ bits_differing |= i ^ lowest_cp;
+ }
}
- }
- invlist_iterfinish(cp_list);
-
- /* At the end of the loop, we count how many bits differ from
- * the bits in lowest code point, call the count 'd'. If the
- * set we found contains 2**d elements, it is the closure of
- * all code points that differ only in those bit positions. To
- * convince yourself of that, first note that the number in the
- * closure must be a power of 2, which we test for. The only
- * way we could have that count and it be some differing set,
- * is if we got some code points that don't differ from the
- * lowest code point in any position, but do differ from each
- * other in some other position. That means one code point has
- * a 1 in that position, and another has a 0. But that would
- * mean that one of them differs from the lowest code point in
- * that position, which possibility we've already excluded. */
- if ( ! has_variant
- && cp_count == 1U << PL_bitcount[bits_differing])
- {
- assert(cp_count > 1);
- op = ANYOFM;
+ invlist_iterfinish(cp_list);
+
+ /* At the end of the loop, we count how many bits differ
+ * from the bits in lowest code point, call the count 'd'.
+ * If the set we found contains 2**d elements, it is the
+ * closure of all code points that differ only in those bit
+ * positions. To convince yourself of that, first note
+ * that the number in the closure must be a power of 2,
+ * which we test for. The only way we could have that
+ * count and it be some differing set, is if we got some
+ * code points that don't differ from the lowest code point
+ * in any position, but do differ from each other in some
+ * other position. That means one code point has a 1 in
+ * that position, and another has a 0. But that would mean
+ * that one of them differs from the lowest code point in
+ * that position, which possibility we've already excluded.
+ * */
+ if ( ! has_variant
+ && cp_count == 1U << PL_bitcount[bits_differing])
+ {
+ assert(inverted || cp_count > 1);
+ op = ANYOFM + inverted;;
- /* We need to make the bits that differ be 0's */
- ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
+ /* We need to make the bits that differ be 0's */
+ ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS
+ */
- /* The argument is the lowest code point */
- anode_arg = lowest_cp;
- *flagp |= HASWIDTH|SIMPLE;
+ /* The argument is the lowest code point */
+ anode_arg = lowest_cp;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
}
+ if (inverted) {
+ _invlist_invert(cp_list);
+ }
+ }
}
}
if (op != END) {
- RExC_parse = (char *)orig_parse;
- RExC_emit = orig_emit;
-
if (regarglen[op]) {
ret = reganode(pRExC_state, op, anode_arg);
} else {
ret = reg_node(pRExC_state, op);
}
-
- RExC_parse = (char *)cur_parse;
+ Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
+ RExC_parse - orig_parse);;
if (PL_regkind[op] == EXACT) {
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
SvREFCNT_dec_NN(cp_list);
return ret;
}
- }
+ } /* End of seeing if can optimize it into a different node */
+
+ /* It's going to be an ANYOF node. */
+ op = (use_anyofd)
+ ? ANYOFD
+ : ((posixl)
+ ? ANYOFPOSIXL
+ : ((LOC)
+ ? ANYOFL
+ : ANYOF));
+ ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof");
+ FILL_NODE(ret, op); /* We set the argument later */
+ RExC_emit += 1 + regarglen[op];
+ ANYOF_FLAGS(REGNODE_p(ret)) = anyof_flags;
/* Here, <cp_list> contains all the code points we can determine at
* compile time that match under all conditions. Go through it, and
populate_ANYOF_from_invlist(REGNODE_p(ret), &cp_list);
+ if (posixl) {
+ ANYOF_POSIXL_SET_TO_BITMAP(REGNODE_p(ret), posixl);
+ }
+
if (invert) {
ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_INVERT;
}
}
}
+STATIC void
+S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size)
+{ /* Change the amount of memory allocated to the regex program being
+ * compiled.  'size' is a delta, counted in regnodes, that is added to
+ * RExC_size; the RExC_rxi allocation is then resized to match the new
+ * RExC_size.  Fails with "Regexp out of space" if RExC_rxi is NULL
+ * after the resize. */
+ PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE;
+
+ RExC_size += size;
+
+ Renewc(RExC_rxi,
+ sizeof(regexp_internal) + (RExC_size + 1) * sizeof(regnode),
+ /* +1 for REG_MAGIC */
+ char,
+ regexp_internal);
+ if ( RExC_rxi == NULL )
+ FAIL("Regexp out of space");
+ RXi_SET(RExC_rx, RExC_rxi);
+ /* Re-derive the start of the program from the resized structure
+ * (presumably because Renewc can move the allocation -- TODO confirm
+ * against the Renewc documentation) */
+ RExC_emit_start = RExC_rxi->program;
+ if (size > 0) { /* Growing: clear 'size' regnodes starting at RExC_emit */
+ Zero(REGNODE_p(RExC_emit), size, regnode);
+ }
+ /* When pattern offsets are tracked, the offsets array is resized in
+ * step: two U32s per regnode, plus element 0, which is set to the
+ * regnode count (RExC_size) */
+#ifdef RE_TRACK_PATTERN_OFFSETS
+ Renew(RExC_offsets, 2*RExC_size+1, U32);
+ if (size > 0) { /* Growing: clear the 2*size elements at the tail.
+ * Assuming the array held 2*(RExC_size - size) + 1 elements before
+ * this call, that is the index of the first new element. */
+ Zero(RExC_offsets + 2*(RExC_size - size) + 1, 2 * size, U32);
+ }
+ RExC_offsets[0] = RExC_size;
+#endif
+}
+
STATIC regnode_offset
S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_size, const char* const name)
{
- /* Allocate a regnode for 'op', with 'extra_size' extra space. In pass1,
- * it aligns and increments RExC_size; in pass2, RExC_emit
+ /* Allocate a regnode for 'op', with 'extra_size' extra space. It aligns
+ * and increments RExC_size and RExC_emit
*
- * It returns the renode's offset into the regex engine program (meaningful
- * only in pass2 */
+ * It returns the regnode's offset into the regex engine program */
const regnode_offset ret = RExC_emit;
PERL_ARGS_ASSERT_REGNODE_GUTS;
- assert(extra_size >= regarglen[op]);
-
- if (SIZE_ONLY) {
- SIZE_ALIGN(RExC_size);
- RExC_size += 1 + extra_size;
- return(ret);
- }
- if (REGNODE_p(RExC_emit) >= RExC_emit_bound)
- Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, (void*)REGNODE_p(RExC_emit), (void*)RExC_emit_bound);
-
+ SIZE_ALIGN(RExC_size);
+ change_engine_size(pRExC_state, (Ptrdiff_t) 1 + extra_size);
NODE_ALIGN_FILL(REGNODE_p(ret));
#ifndef RE_TRACK_PATTERN_OFFSETS
PERL_UNUSED_ARG(name);
+ PERL_UNUSED_ARG(op);
#else
+ assert(extra_size >= regarglen[op] || PL_regkind[op] == ANYOF);
+
if (RExC_offsets) { /* MJD */
MJD_OFFSET_DEBUG(
("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n",
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REG_NODE;
assert(regarglen[op] == 0);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE(ptr, op);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE(ptr, op);
+ RExC_emit = ptr;
return(ret);
}
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REGANODE;
/* ANYOF are special cased to allow non-length 1 args */
- assert(regarglen[op] == 1 || PL_regkind[op] == ANYOF);
+ assert(regarglen[op] == 1);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE_ARG(ptr, op, arg);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE_ARG(ptr, op, arg);
+ RExC_emit = ptr;
return(ret);
}
/* emit a node with U32 and I32 arguments */
const regnode_offset ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
+ regnode_offset ptr = ret;
PERL_ARGS_ASSERT_REG2LANODE;
assert(regarglen[op] == 2);
- if (PASS2) {
- regnode_offset ptr = ret;
- FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
- RExC_emit = ptr;
- }
+ FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2);
+ RExC_emit = ptr;
return(ret);
}
* IMPORTANT NOTE - it is the *callers* responsibility to correctly
* set up NEXT_OFF() of the inserted node if needed. Something like this:
*
-* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
-* if (PASS2)
-* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
+* reginsert(pRExC, OPFAIL, orig_emit, depth+1);
+* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
* ALSO NOTE - FLAGS(newly-inserted-operator) will be set to 0 as well.
*/
STATIC void
-S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op,
- regnode_offset operand, U32 depth)
+S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op,
+ const regnode_offset operand, const U32 depth)
{
regnode *src;
regnode *dst;
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
DEBUG_PARSE_FMT("inst"," - %s", PL_reg_name[op]);
- if (SIZE_ONLY) {
- RExC_size += size;
- return;
- }
assert(!RExC_study_started); /* I believe we should never use reginsert once we have started
studying. If this is wrong then we need to adjust RExC_recurse
below like we do with RExC_open_parens/RExC_close_parens. */
+ change_engine_size(pRExC_state, (Ptrdiff_t) size);
src = REGNODE_p(RExC_emit);
RExC_emit += size;
dst = REGNODE_p(RExC_emit);
PERL_UNUSED_ARG(depth);
#endif
- if (SIZE_ONLY)
- return;
-
/* Find last node. */
scan = (regnode_offset) p;
for (;;) {
PERL_ARGS_ASSERT_REGTAIL_STUDY;
- if (SIZE_ONLY)
- return exact;
-
/* Find last node. */
scan = p;
if ( exact ) {
switch (OP(REGNODE_p(scan))) {
case EXACT:
+ case EXACT_ONLY8:
case EXACTL:
case EXACTF:
+ case EXACTFS_B_U:
+ case EXACTFS_E_U:
+ case EXACTFS_BE_U:
case EXACTFAA_NO_TRIE:
case EXACTFAA:
case EXACTFU:
+ case EXACTFU_ONLY8:
case EXACTFLU8:
case EXACTFU_SS:
case EXACTFL:
STATIC SV*
S_get_ANYOFM_contents(pTHX_ const regnode * n) {
- /* Returns an inversion list of all the code points matched by the ANYOFM
- * node 'n' */
+ /* Returns an inversion list of all the code points matched by the
+ * ANYOFM/NANYOFM node 'n' */
SV * cp_list = _new_invlist(-1);
const U8 lowest = (U8) ARG(n);
}
}
+ if (OP(n) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
return cp_list;
}
SV * cp_list = get_ANYOFM_contents(o);
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ if (OP(o) == NANYOFM) {
+ _invlist_invert(cp_list);
+ }
+
put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
PERL_ARGS_ASSERT_PREGFREE2;
+ if (! r)
+ return;
+
if (r->mother_re) {
ReREFCNT_dec(r->mother_re);
} else {
PERL_ARGS_ASSERT_REGFREE_INTERNAL;
+ if (! ri) {
+ return;
+ }
+
DEBUG_COMPILE_r({
if (!PL_colorset)
reginitcolors();
PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
UNI__PERL_FOLDS_TO_MULTI_CHAR]);
+ PL_InMultiCharFold = _new_invlist_C_array(_Perl_Is_In_Multi_Char_Fold_invlist);
PL_NonL1NonFinalFold = _new_invlist_C_array(
NonL1_Perl_Non_Final_Folds_invlist);