regnode *next_regnode; /* next node to process when last is reached */
U32 prev_recursed_depth;
I32 stopparen; /* what stopparen do we use */
- U32 is_top_frame; /* what flags do we use? */
struct scan_frame *this_prev_frame; /* this previous frame */
struct scan_frame *prev_frame; /* previous frame */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
SSize_t size; /* Code size. */
- I32 npar; /* Capture buffer count, (OPEN) plus
+ I32 npar; /* Capture buffer count, (OPEN) plus
one. ("par" 0 is the whole
pattern)*/
I32 nestroot; /* root parens we are in - used by
bool seen_unfolded_sharp_s;
bool strict;
bool study_started;
+ bool in_script_run;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_strict (pRExC_state->strict)
#define RExC_study_started (pRExC_state->study_started)
#define RExC_warn_text (pRExC_state->warn_text)
+#define RExC_in_script_run (pRExC_state->in_script_run)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
For each string some basic information is maintained:
- - offset or min_offset
+ - min_offset
This is the position the string must appear at, or not before.
It also implicitly (when combined with minlenp) tells us how many
characters must match before the string we are searching for.
Only used for floating strings. This is the rightmost point that
the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
+ For fixed strings, it is equal to min_offset.
- minlenp
A pointer to the minimum number of characters of the pattern that the
*/
+struct scan_data_substrs {
+ SV *str; /* longest substring found in pattern */
+ SSize_t min_offset; /* earliest point in string it can appear */
+ SSize_t max_offset; /* latest point in string it can appear */
+ SSize_t *minlenp; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind; /* is the pos of the string modified by LB */
+ I32 flags; /* per substring SF_* and SCF_* flags */
+};
+
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
SSize_t last_end; /* min value, <0 unless valid. */
SSize_t last_start_min;
SSize_t last_start_max;
- SV **longest; /* Either &l_fixed, or &l_float. */
- SV *longest_fixed; /* longest fixed string found in pattern */
- SSize_t offset_fixed; /* offset where it starts */
- SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
- I32 lookbehind_fixed; /* is the position of the string modfied by LB */
- SV *longest_float; /* longest floating string found in pattern */
- SSize_t offset_float_min; /* earliest point in string it can appear */
- SSize_t offset_float_max; /* latest point in string it can appear */
- SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
- SSize_t lookbehind_float; /* is the pos of the string modified by LB */
- I32 flags;
+ U8 cur_is_floating; /* whether the last_* values should be set as
+ * the next fixed (0) or floating (1)
+ * substring */
+
+ /* [0] is longest fixed substring so far, [1] is longest float so far */
+ struct scan_data_substrs substrs[2];
+
+ I32 flags; /* common SF_* and SCF_* flags */
I32 whilem_c;
SSize_t *last_closep;
regnode_ssc *start_class;
* Forward declarations for pregcomp()'s friends.
*/
-static const scan_data_t zero_scan_data =
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
+static const scan_data_t zero_scan_data = {
+ 0, 0, NULL, 0, 0, 0, 0,
+ {
+ { NULL, 0, 0, 0, 0, 0 },
+ { NULL, 0, 0, 0, 0, 0 },
+ },
+ 0, 0, NULL, NULL
+};
+
+/* study flags */
-#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
#define SF_BEFORE_SEOL 0x0001
#define SF_BEFORE_MEOL 0x0002
-#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
-#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
-
-#define SF_FIX_SHIFT_EOL (+2)
-#define SF_FL_SHIFT_EOL (+4)
-
-#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
-#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
+#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
-#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
-#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
#define SF_IS_INF 0x0040
#define SF_HAS_PAR 0x0080
#define SF_IN_PAR 0x0100
UTF8fARG(UTF, \
(xI(xC) > eC) /* Don't run off end */ \
? eC - sC /* Length before the <--HERE */ \
- : xI_offset(xC), \
+ : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ), \
sC), /* The input pattern printed up to the <--HERE */ \
UTF8fARG(UTF, \
(xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
-#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
- if ( ( flags ) ) { \
- Perl_re_printf( aTHX_ "%s", open_str); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
- Perl_re_printf( aTHX_ "%s", close_str); \
- }
-
-
-#define DEBUG_STUDYDATA(str,data,depth) \
-DEBUG_OPTIMISE_MORE_r(if(data){ \
- Perl_re_indentf( aTHX_ "" str "Pos:%" IVdf "/%" IVdf \
- " Flags: 0x%" UVXf, \
- depth, \
- (IV)((data)->pos_min), \
- (IV)((data)->pos_delta), \
- (UV)((data)->flags) \
- ); \
- DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
- Perl_re_printf( aTHX_ \
- " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \
- (IV)((data)->whilem_c), \
- (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
- is_inf ? "INF " : "" \
- ); \
- if ((data)->last_found) \
- Perl_re_printf( aTHX_ \
- "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \
- " %sFixed:'%s' @ %" IVdf \
- " %sFloat: '%s' @ %" IVdf "/%" IVdf, \
- SvPVX_const((data)->last_found), \
- (IV)((data)->last_end), \
- (IV)((data)->last_start_min), \
- (IV)((data)->last_start_max), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
- SvPVX_const((data)->longest_fixed), \
- (IV)((data)->offset_fixed), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_float)) ? "*" : "", \
- SvPVX_const((data)->longest_float), \
- (IV)((data)->offset_float_min), \
- (IV)((data)->offset_float_max) \
- ); \
- Perl_re_printf( aTHX_ "\n"); \
-});
+
+#ifdef DEBUGGING
+static void
+S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
+ const char *close_str)
+{
+ if (!flags)
+ return;
+
+ Perl_re_printf( aTHX_ "%s", open_str);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
+ Perl_re_printf( aTHX_ "%s", close_str);
+}
+
+
+static void
+S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
+ U32 depth, int is_inf)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_MORE_r({
+ if (!data)
+ return;
+ Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ depth,
+ where,
+ (IV)data->pos_min,
+ (IV)data->pos_delta,
+ (UV)data->flags
+ );
+
+ S_debug_show_study_flags(aTHX_ data->flags," [","]");
+
+ Perl_re_printf( aTHX_
+ " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
+ (IV)data->whilem_c,
+ (IV)(data->last_closep ? *((data)->last_closep) : -1),
+ is_inf ? "INF " : ""
+ );
+
+ if (data->last_found) {
+ int i;
+ Perl_re_printf(aTHX_
+ "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
+ SvPVX_const(data->last_found),
+ (IV)data->last_end,
+ (IV)data->last_start_min,
+ (IV)data->last_start_max
+ );
+
+ for (i = 0; i < 2; i++) {
+ Perl_re_printf(aTHX_
+ " %s%s: '%s' @ %" IVdf "/%" IVdf,
+ data->cur_is_floating == i ? "*" : "",
+ i ? "Float" : "Fixed",
+ SvPVX_const(data->substrs[i].str),
+ (IV)data->substrs[i].min_offset,
+ (IV)data->substrs[i].max_offset
+ );
+ S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
+ }
+ }
+
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+static void
+S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
+ regnode *scan, U32 depth, U32 flags)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_r({
+ regnode *Next;
+
+ if (!scan)
+ return;
+ Next = regnext(scan);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
+ Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
+ depth,
+ str,
+ REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
+ Next ? (REG_NODE_NUM(Next)) : 0 );
+ S_debug_show_study_flags(aTHX_ flags," [ ","]");
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
+ S_debug_studydata(aTHX_ where, data, depth, is_inf)
+
+# define DEBUG_PEEP(str, scan, depth, flags) \
+ S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
+
+#else
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
+# define DEBUG_PEEP(str, scan, depth, flags) NOOP
+#endif
/* =========================================================
push(UV key,item* curr)
{
item* head;
- Newxz(head, 1, item);
+ Newx(head, 1, item);
head->key = key;
head->value = 0;
head->next = curr;
PERL_ARGS_ASSERT_EDIT_DISTANCE;
/* intialize matrix start values */
- Newxz(scores, ( (x + 2) * (y + 2)), UV);
+ Newx(scores, ( (x + 2) * (y + 2)), UV);
scores[0] = score_ceil;
scores[1 * (y + 2) + 0] = score_ceil;
scores[0 * (y + 2) + 1] = score_ceil;
}
/* Mark that we cannot extend a found fixed substring at this point.
- Update the longest found anchored substring and the longest found
+ Update the longest found anchored substring or the longest found
floating substrings if needed. */
STATIC void
SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
- const STRLEN old_l = CHR_SVLEN(*data->longest);
+ SV * const longest_sv = data->substrs[data->cur_is_floating].str;
+ const STRLEN old_l = CHR_SVLEN(longest_sv);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_SCAN_COMMIT;
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
- SvSetMagicSV(*data->longest, data->last_found);
- if (*data->longest == data->longest_fixed) {
- data->offset_fixed = l ? data->last_start_min : data->pos_min;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
- else
- data->flags &= ~SF_FIX_BEFORE_EOL;
- data->minlen_fixed=minlenp;
- data->lookbehind_fixed=0;
- }
- else { /* *data->longest == data->longest_float */
- data->offset_float_min = l ? data->last_start_min : data->pos_min;
- data->offset_float_max = (l
+ const U8 i = data->cur_is_floating;
+ SvSetMagicSV(longest_sv, data->last_found);
+ data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
+
+ if (!i) /* fixed */
+ data->substrs[0].max_offset = data->substrs[0].min_offset;
+ else { /* float */
+ data->substrs[1].max_offset = (l
? data->last_start_max
: (data->pos_delta > SSize_t_MAX - data->pos_min
? SSize_t_MAX
: data->pos_min + data->pos_delta));
if (is_inf
- || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
- data->offset_float_max = SSize_t_MAX;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
- else
- data->flags &= ~SF_FL_BEFORE_EOL;
- data->minlen_float=minlenp;
- data->lookbehind_float=0;
- }
+ || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
+ data->substrs[1].max_offset = SSize_t_MAX;
+ }
+
+ if (data->flags & SF_BEFORE_EOL)
+ data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
+ else
+ data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].minlenp = minlenp;
+ data->substrs[i].lookbehind = 0;
}
+
SvCUR_set(data->last_found, 0);
{
SV * const sv = data->last_found;
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("commit: ",data,0);
+ DEBUG_STUDYDATA("commit", data, 0, is_inf);
}
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
regnode_charclass_posixl temp;
int add = 1; /* To calculate the index of the complement */
+ Zero(&temp, 1, regnode_charclass_posixl);
ANYOF_POSIXL_ZERO(&temp);
for (i = 0; i < ANYOF_MAX; i++) {
assert(i % 2 != 0
} STMT_END
#define TRIE_LIST_NEW(state) STMT_START { \
- Newxz( trie->states[ state ].trans.list, \
+ Newx( trie->states[ state ].trans.list, \
4, reg_trie_trans_le ); \
TRIE_LIST_CUR( state ) = 1; \
TRIE_LIST_LEN( state ) = 4; \
/* we just use folder as a flag in utf8 */
const U8 * folder = NULL;
+ /* in the below add_data call we are storing either 'tu' or 'tuaa'
+ * which stands for one trie structure, one hash, optionally followed
+ * by two arrays */
#ifdef DEBUGGING
- const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
AV *trie_words = NULL;
/* along with revcharmap, this only used during construction but both are
* useful during debugging so we store them in the struct when debugging.
aho->trie=trie_offset;
aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
Copy( trie->states, aho->states, numstates, reg_trie_state );
- Newxz( q, numstates, U32);
+ Newx( q, numstates, U32);
aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
aho->refcount = 1;
fail = aho->fail;
}
-#define DEBUG_PEEP(str,scan,depth) \
- DEBUG_OPTIMISE_r({if (scan){ \
- regnode *Next = regnext(scan); \
- regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
- Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \
- depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
- Next ? (REG_NODE_NUM(Next)) : 0 );\
- DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
- Perl_re_printf( aTHX_ "\n"); \
- }});
-
/* The below joins as many adjacent EXACTish nodes as possible into a single
* one. The regop may be changed if the node(s) contain certain sequences that
* require special handling. The joining is only done if:
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(val);
#endif
- DEBUG_PEEP("join",scan,depth);
+ DEBUG_PEEP("join", scan, depth, 0);
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
* EXACT ones that are mergeable to the current one. */
if (OP(n) == TAIL || n > next)
stringok = 0;
if (PL_regkind[OP(n)] == NOTHING) {
- DEBUG_PEEP("skip:",n,depth);
+ DEBUG_PEEP("skip:", n, depth, 0);
NEXT_OFF(scan) += NEXT_OFF(n);
next = n + NODE_STEP_REGNODE;
#ifdef DEBUGGING
if (oldl + STR_LEN(n) > U8_MAX)
break;
- DEBUG_PEEP("merg",n,depth);
+ DEBUG_PEEP("merg", n, depth, 0);
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
#ifdef EXPERIMENTAL_INPLACESCAN
if (flags && !NEXT_OFF(n)) {
- DEBUG_PEEP("atch", val, depth);
+ DEBUG_PEEP("atch", val, depth, 0);
if (reg_off_by_arg[OP(n)]) {
ARG_SET(n, val - n);
}
n++;
}
#endif
- DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
+ DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
return stopnow;
}
} while (f);
}
-
+/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
SSize_t *minlenp, SSize_t *deltap,
PERL_ARGS_ASSERT_STUDY_CHUNK;
RExC_study_started= 1;
+ Zero(&data_fake, 1, scan_data_t);
if ( depth == 0 ) {
while (first_non_open && OP(first_non_open) == OPEN)
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
/* Peephole optimizer: */
- DEBUG_STUDYDATA("Peep:", data, depth);
- DEBUG_PEEP("Peep", scan, depth);
+ DEBUG_STUDYDATA("Peep", data, depth, is_inf);
+ DEBUG_PEEP("Peep", scan, depth, flags);
/* The reason we do this here is that we need to deal with things like
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
scan = regnext(scan);
assert( OP(scan) == IFTHEN );
- DEBUG_PEEP("expect IFTHEN", scan, depth);
+ DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
data_fake.last_closep= &fake_last_close;
minlen = *minlenp;
next = regnext(scan);
scan = NEXTOPER(NEXTOPER(scan));
- DEBUG_PEEP("scan", scan, depth);
- DEBUG_PEEP("next", next, depth);
+ DEBUG_PEEP("scan", scan, depth, flags);
+ DEBUG_PEEP("next", next, depth, flags);
/* we suppose the run is continuous, last=next...
* NOTE we dont use the return here! */
+ /* DEFINEP study_chunk() recursion */
(void)study_chunk(pRExC_state, &scan, &minlen,
&deltanext, next, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
I32 f = 0;
regnode_ssc this_class;
- DEBUG_PEEP("Branch", scan, depth);
+ DEBUG_PEEP("Branch", scan, depth, flags);
num++;
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
+ /* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
recursed_depth, NULL, f,depth+1);
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
min += min1;
if (delta == SSize_t_MAX
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("gosub-set:", data,depth);
+ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("gosub-inf:", data,depth);
+ DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
newframe->prev_recursed_depth = recursed_depth;
newframe->this_prev_frame= frame;
- DEBUG_STUDYDATA("frame-new:",data,depth);
- DEBUG_PEEP("fnew", scan, depth);
+ DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
+ DEBUG_PEEP("fnew", scan, depth, flags);
frame = newframe;
scan = start;
else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
SSize_t l = STR_LEN(scan);
UV uc;
+ assert(l);
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
}
data->pos_delta += min_subtract;
if (min_subtract) {
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
/* Cannot extend fixed substrings */
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
scan = regnext(scan);
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
+ /* recurse study_chunk() on loop bodies */
minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
last, data, stopparen, recursed_depth, NULL,
(mincount == 0
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
/* ? quantifier ok, except for (?{ ... }) */
&& (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
}
#endif
/* Optimize again: */
+ /* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, stopparen, recursed_depth, NULL, 0,depth+1);
}
? SSize_t_MAX
: data->pos_min + data->pos_delta - last_chrs;
}
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
SvREFCNT_dec(last_str);
}
if (flags & SCF_DO_SUBSTR) {
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
data->pos_min += 1;
data->pos_delta += 1;
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
else if (REGNODE_SIMPLE(OP(scan))) {
(regnode_charclass *) scan);
break;
+ case ANYOFM:
+ {
+ SV* cp_list = get_ANYOFM_contents(scan);
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class,
+ cp_list,
+ FALSE /* don't invert */
+ );
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class,
+ cp_list,
+ FALSE /* don't invert */
+ );
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NPOSIXL:
invert = 1;
/* FALLTHROUGH */
}
break;
+ case NASCII:
+ invert = 1;
+ /* FALLTHROUGH */
+ case ASCII:
+ my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
+
+ /* This can be handled as a Posix class */
+ goto join_posix_and_ascii;
+
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
/* FALLTHROUGH */
case POSIXA:
- if (FLAGS(scan) == _CC_ASCII) {
- my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
- }
- else {
- _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
- PL_XPosix_ptrs[_CC_ASCII],
- &my_invlist);
- }
- goto join_posix;
+ assert(FLAGS(scan) != _CC_ASCII);
+ _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
+ PL_XPosix_ptrs[_CC_ASCII],
+ &my_invlist);
+ goto join_posix_and_ascii;
case NPOSIXD:
case NPOSIXU:
&my_invlist);
}
- join_posix:
+ join_posix_and_ascii:
if (flags & SCF_DO_STCLASS_AND) {
ssc_intersection(data->start_class, my_invlist, invert);
f |= SCF_WHILEM_VISITED_POS;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
+
+ /* recurse study_chunk() for lookahead body */
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
else
data_fake.last_closep = &fake;
data_fake.flags = 0;
+ data_fake.substrs[0].flags = 0;
+ data_fake.substrs[1].flags = 0;
data_fake.pos_delta = delta;
if (is_inf)
data_fake.flags |= SF_IS_INF;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
+ /* positive lookahead study_chunk() recursion */
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
data->flags |= SF_HAS_EVAL;
data->whilem_c = data_fake.whilem_c;
if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
+ int i;
if (RExC_rx->minlen<*minnextp)
RExC_rx->minlen=*minnextp;
scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
SvREFCNT_dec_NN(data_fake.last_found);
- if ( data_fake.minlen_fixed != minlenp )
- {
- data->offset_fixed= data_fake.offset_fixed;
- data->minlen_fixed= data_fake.minlen_fixed;
- data->lookbehind_fixed+= scan->flags;
- }
- if ( data_fake.minlen_float != minlenp )
- {
- data->minlen_float= data_fake.minlen_float;
- data->offset_float_min=data_fake.offset_float_min;
- data->offset_float_max=data_fake.offset_float_max;
- data->lookbehind_float+= scan->flags;
+ for (i = 0; i < 2; i++) {
+ if (data_fake.substrs[i].minlenp != minlenp) {
+ data->substrs[i].min_offset =
+ data_fake.substrs[i].min_offset;
+ data->substrs[i].max_offset =
+ data_fake.substrs[i].max_offset;
+ data->substrs[i].minlenp =
+ data_fake.substrs[i].minlenp;
+ data->substrs[i].lookbehind += scan->flags;
+ }
}
}
}
}
#endif
}
+
else if (OP(scan) == OPEN) {
if (stopparen != (I32)ARG(scan))
pars++;
{
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
/* We go from the jump point to the branch that follows
it. Note this means we need the vestigal unused
branches even though they arent otherwise used. */
+ /* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
stopparen, recursed_depth, NULL, f,depth+1);
data->pos_min += min1;
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
min += min1;
- if (delta != SSize_t_MAX)
- delta += max1 - min1;
+ if (delta != SSize_t_MAX) {
+ if (SSize_t_MAX - (max1 - min1) >= delta)
+ delta += max1 - min1;
+ else
+ delta = SSize_t_MAX;
+ }
if (flags & SCF_DO_STCLASS_OR) {
ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
if (min1) {
data->pos_min += trie->minlen;
data->pos_delta += (trie->maxlen - trie->minlen);
if (trie->maxlen != trie->minlen)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
if (trie->jump) /* no more substrings -- for now /grr*/
flags &= ~SCF_DO_SUBSTR;
/* we need to unwind recursion. */
depth = depth - 1;
- DEBUG_STUDYDATA("frame-end:",data,depth);
- DEBUG_PEEP("fend", scan, depth);
+ DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
+ DEBUG_PEEP("fend", scan, depth, flags);
/* restore previous context */
last = frame->last_regnode;
}
assert(!frame);
- DEBUG_STUDYDATA("pre-fin:",data,depth);
+ DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
*scanp = scan;
*deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
- DEBUG_STUDYDATA("post-fin:",data,depth);
+ DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
{
SSize_t final_minlen= min < stopmin ? min : stopmin;
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
- SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
- SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
- STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
+ struct reg_substr_datum *rsd,
+ struct scan_data_substrs *sub,
+ STRLEN longest_length)
{
/* This is the common code for setting up the floating and fixed length
* string data extracted from Perl_re_op_compile() below. Returns a boolean
I32 t;
SSize_t ml;
+ bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
+ bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
/* copy the information about the longest from the reg_scan_data
over to the program. */
- if (SvUTF8(sv_longest)) {
- *rx_utf8 = sv_longest;
- *rx_substr = NULL;
+ if (SvUTF8(sub->str)) {
+ rsd->substr = NULL;
+ rsd->utf8_substr = sub->str;
} else {
- *rx_substr = sv_longest;
- *rx_utf8 = NULL;
+ rsd->substr = sub->str;
+ rsd->utf8_substr = NULL;
}
/* end_shift is how many chars that must be matched that
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (SSize_t)longest_length;
- *rx_end_shift = ml - offset
+ ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
+ rsd->end_shift = ml - sub->min_offset
- longest_length
/* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
* intead? - DAPM
- + (SvTAIL(sv_longest) != 0)
+ + (SvTAIL(sub->str) != 0)
*/
- + lookbehind;
+ + sub->lookbehind;
t = (eol/* Can't have SEOL and MULTI */
&& (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+ fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
return TRUE;
}
if ( ! dump_len_string
|| ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
{
- PL_dump_re_max_len = 0;
+ PL_dump_re_max_len = 60; /* A reasonable default */
}
#endif
}
RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
+ RExC_in_script_run = 0;
RExC_study_started = 0;
pRExC_state->runtime_code_qr = NULL;
RExC_frame_head= NULL;
});
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
/* make sure PL_bitcount bounds not exceeded */
assert(sizeof(STD_PAT_MODS) <= 8);
- Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
- r->xpv_len_u.xpvlenu_pv = p;
+ p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
+ SvPOK_on(rx);
if (RExC_utf8)
SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
if (RExC_recurse_count) {
- Newxz(RExC_recurse,RExC_recurse_count,regnode *);
+ Newx(RExC_recurse,RExC_recurse_count,regnode *);
SAVEFREEPV(RExC_recurse);
}
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
*/
SSize_t fake;
- STRLEN longest_float_length, longest_fixed_length;
+ STRLEN longest_length[2];
regnode_ssc ch_class; /* pointed to by data */
int stclass_flag;
SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
+ int i;
+
/*
* Skip introductions and multiplicators >= 1
* so that we can extract the 'meat' of the pattern that must
/* Starting-point info. */
again:
- DEBUG_PEEP("first:",first,0);
+ DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT || OP(first) == EXACTL)
* earlier string may buy us something the later one won't.]
*/
- data.longest_fixed = newSVpvs("");
- data.longest_float = newSVpvs("");
+ data.substrs[0].str = newSVpvs("");
+ data.substrs[1].str = newSVpvs("");
data.last_found = newSVpvs("");
- data.longest = &(data.longest_fixed);
+ data.cur_is_floating = 0; /* initially any found substring is fixed */
ENTER_with_name("study_chunk");
- SAVEFREESV(data.longest_fixed);
- SAVEFREESV(data.longest_float);
+ SAVEFREESV(data.substrs[0].str);
+ SAVEFREESV(data.substrs[1].str);
SAVEFREESV(data.last_found);
first = scan;
if (!ri->regstclass) {
data.last_closep = &last_close;
DEBUG_RExC_seen();
+ /*
+ * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
+ * (NO top level branches)
+ */
minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
scan + RExC_size, /* Up to end */
&data, -1, 0, NULL,
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
- if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
+ if ( RExC_npar == 1 && !data.cur_is_floating
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_VERBARG_SEEN)
}
scan_commit(pRExC_state, &data,&minlen,0);
- longest_float_length = CHR_SVLEN(data.longest_float);
-
- if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- && S_setup_longest (aTHX_ pRExC_state,
- data.longest_float,
- &(r->float_utf8),
- &(r->float_substr),
- &(r->float_end_shift),
- data.lookbehind_float,
- data.offset_float_min,
- data.minlen_float,
- longest_float_length,
- cBOOL(data.flags & SF_FL_BEFORE_EOL),
- cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
- {
- r->float_min_offset = data.offset_float_min - data.lookbehind_float;
- r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
- r->float_max_offset -= data.lookbehind_float;
- SvREFCNT_inc_simple_void_NN(data.longest_float);
- }
- else {
- r->float_substr = r->float_utf8 = NULL;
- longest_float_length = 0;
- }
- longest_fixed_length = CHR_SVLEN(data.longest_fixed);
-
- if (S_setup_longest (aTHX_ pRExC_state,
- data.longest_fixed,
- &(r->anchored_utf8),
- &(r->anchored_substr),
- &(r->anchored_end_shift),
- data.lookbehind_fixed,
- data.offset_fixed,
- data.minlen_fixed,
- longest_fixed_length,
- cBOOL(data.flags & SF_FIX_BEFORE_EOL),
- cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
- {
- r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
- SvREFCNT_inc_simple_void_NN(data.longest_fixed);
- }
- else {
- r->anchored_substr = r->anchored_utf8 = NULL;
- longest_fixed_length = 0;
- }
+ /* XXX this is done in reverse order because that's the way the
+ * code was before it was parameterised. Don't know whether it
+ * actually needs doing in reverse order. DAPM */
+ for (i = 1; i >= 0; i--) {
+ longest_length[i] = CHR_SVLEN(data.substrs[i].str);
+
+ if ( !( i
+ && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
+ && data.substrs[0].min_offset
+ == data.substrs[1].min_offset
+ && SvCUR(data.substrs[0].str)
+ == SvCUR(data.substrs[1].str)
+ )
+ && S_setup_longest (aTHX_ pRExC_state,
+ &(r->substrs->data[i]),
+ &(data.substrs[i]),
+ longest_length[i]))
+ {
+ r->substrs->data[i].min_offset =
+ data.substrs[i].min_offset - data.substrs[i].lookbehind;
+
+ r->substrs->data[i].max_offset = data.substrs[i].max_offset;
+ /* Don't offset infinity */
+ if (data.substrs[i].max_offset < SSize_t_MAX)
+ r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
+ SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
+ }
+ else {
+ r->substrs->data[i].substr = NULL;
+ r->substrs->data[i].utf8_substr = NULL;
+ longest_length[i] = 0;
+ }
+ }
+
LEAVE_with_name("study_chunk");
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
- if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
+ || r->substrs->data[0].min_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
data.start_class = NULL;
}
- /* A temporary algorithm prefers floated substr to fixed one to dig
- * more info. */
- if (longest_fixed_length > longest_float_length) {
- r->substrs->check_ix = 0;
- r->check_end_shift = r->anchored_end_shift;
- r->check_substr = r->anchored_substr;
- r->check_utf8 = r->anchored_utf8;
- r->check_offset_min = r->check_offset_max = r->anchored_offset;
- if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
- r->intflags |= PREGf_NOSCAN;
- }
- else {
- r->substrs->check_ix = 1;
- r->check_end_shift = r->float_end_shift;
- r->check_substr = r->float_substr;
- r->check_utf8 = r->float_utf8;
- r->check_offset_min = r->float_min_offset;
- r->check_offset_max = r->float_max_offset;
- }
+ /* A temporary algorithm prefers floated substr to fixed one of
+ * same length to dig more info. */
+ i = (longest_length[0] <= longest_length[1]);
+ r->substrs->check_ix = i;
+ r->check_end_shift = r->substrs->data[i].end_shift;
+ r->check_substr = r->substrs->data[i].substr;
+ r->check_utf8 = r->substrs->data[i].utf8_substr;
+ r->check_offset_min = r->substrs->data[i].min_offset;
+ r->check_offset_max = r->substrs->data[i].max_offset;
+ if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
+ r->intflags |= PREGf_NOSCAN;
+
if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
- r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
- if ( (STRLEN)minlen < longest_float_length )
- minlen= longest_float_length;
- if ( (STRLEN)minlen < longest_fixed_length )
- minlen= longest_fixed_length;
+ if ( (STRLEN)minlen < longest_length[1] )
+ minlen= longest_length[1];
+ if ( (STRLEN)minlen < longest_length[0] )
+ minlen= longest_length[0];
*/
}
else {
data.last_closep = &last_close;
DEBUG_RExC_seen();
+ /*
+ * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
+ * (patterns WITH top level branches)
+ */
minlen = study_chunk(pRExC_state,
&scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
CHECK_RESTUDY_GOTO_butfirst(NOOP);
- r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
- = r->float_substr = r->float_utf8 = NULL;
+ r->check_substr = NULL;
+ r->check_utf8 = NULL;
+ r->substrs->data[0].substr = NULL;
+ r->substrs->data[0].utf8_substr = NULL;
+ r->substrs->data[1].substr = NULL;
+ r->substrs->data[1].utf8_substr = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
if (RExC_seen & REG_RECURSE_SEEN ) {
r->intflags |= PREGf_RECURSE_SEEN;
- Newxz(r->recurse_locinput, r->nparens + 1, char *);
+ Newx(r->recurse_locinput, r->nparens + 1, char *);
}
if (RExC_seen & REG_GPOS_SEEN)
r->intflags |= PREGf_GPOS_SEEN;
* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
* intervening space, as the sequence is a token, and a token should be
* indivisible */
- bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+ bool has_intervening_patws = (paren == 2 || paren == 's')
+ && *(RExC_parse - 1) != '(';
if (RExC_parse >= RExC_end) {
vFAIL("Unmatched (");
}
- if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+ if (paren == 's') {
+
+ /* A nested script run is a no-op besides clustering */
+ if (RExC_in_script_run) {
+ paren = ':';
+ nextchar(pRExC_state);
+ ret = NULL;
+ goto parse_rest;
+ }
+ RExC_in_script_run = 1;
+
+ ret = reg_node(pRExC_state, SROPEN);
+ is_open = 1;
+ }
+ else if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse + 1;
STRLEN verb_len;
char *start_arg = NULL;
if (RExC_parse >= RExC_end) {
goto unterminated_verb_pattern;
}
+
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
while ( RExC_parse < RExC_end && *RExC_parse != ')' )
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
nextchar(pRExC_state);
return ret;
}
+ else if (*RExC_parse == '+') { /* (+...) */
+ RExC_parse++;
+
+ if (has_intervening_patws) {
+ /* XXX Note that a potential gotcha is that outside of /x '( +
+ * ...)' means to match a space at least once ... This is a
+ * problem elsewhere too */
+ vFAIL("In '(+...)', the '(' and '+' must be adjacent");
+ }
+
+ if (! memBEGINPs(RExC_parse, (STRLEN) (RExC_end - RExC_parse),
+ "script_run:"))
+ {
+ RExC_parse += strcspn(RExC_parse, ":)");
+ vFAIL("Unknown (+ pattern");
+ }
+ else {
+
+ /* This indicates Unicode rules. */
+ REQUIRE_UNI_RULES(flagp, NULL);
+
+ RExC_parse += sizeof("script_run:") - 1;
+
+ if (PASS2) {
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
+ "The script_run feature is experimental"
+ REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
+ }
+
+ ret = reg(pRExC_state, 's', &flags, depth+1);
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return NULL;
+ }
+
+ return ret;
+ }
+ }
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
ret = reganode(pRExC_state,NGROUPP,num);
goto insert_if_check_paren;
}
- else if (RExC_end - RExC_parse >= DEFINE_len
- && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
+ else if (memBEGINs(RExC_parse,
+ (STRLEN) (RExC_end - RExC_parse),
+ "DEFINE"))
{
ret = reganode(pRExC_state,DEFINEP,0);
RExC_parse += DEFINE_len;
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
- return handle_regex_sets(pRExC_state, NULL, flagp, depth,
+ return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
oregcomp_parse);
case 0: /* A NUL */
RExC_parse--; /* for vFAIL to print correctly */
goto parse_rest;
} /* end switch */
}
- else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
+ else {
+ if (*RExC_parse == '{' && PASS2) {
+ ckWARNregdep(RExC_parse + 1,
+ "Unescaped left brace in regex is "
+ "deprecated here (and will be fatal "
+ "in Perl 5.32), passed through");
+ }
+ /* Not bothering to indent here, as the above 'else' is temporary
+ * */
+ if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
capturing_parens:
parno = RExC_npar;
RExC_npar++;
paren = ':';
ret = NULL;
}
+ }
}
else /* ! paren */
ret = NULL;
Set_Node_Offset(ender,RExC_parse+1); /* MJD */
Set_Node_Length(ender,1); /* MJD */
break;
+ case 's':
+ ender = reg_node(pRExC_state, SRCLOSE);
+ RExC_in_script_run = 0;
+ break;
case '<':
case ',':
case '=':
if ((flags&SIMPLE)) {
if (min == 0 && max == REG_INFTY) {
reginsert(pRExC_state, STAR, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
}
if (min == 1 && max == REG_INFTY) {
reginsert(pRExC_state, PLUS, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(3);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
ender = reg_node(pRExC_state, SUCCEED);
REGTAIL(pRExC_state, ret, ender);
reginsert(pRExC_state, SUSPEND, ret, depth+1);
- ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
}
RExC_parse++; /* Skip past the '{' */
- if (! (endbrace = strchr(RExC_parse, '}'))) { /* no trailing brace */
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
+ if (! endbrace) { /* no trailing brace */
vFAIL2("Missing right brace on \\%c{}", 'N');
}
- else if(!(endbrace == RExC_parse /* nothing between the {} */
- || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
- && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
- error msg) */
+ else if (!( endbrace == RExC_parse /* nothing between the {} */
+ || memBEGINs(RExC_parse, /* U+ (bad hex is checked below
+ for a better error msg) */
+ (STRLEN) (RExC_end - RExC_parse),
+ "U+")))
{
RExC_parse = endbrace; /* position msg's '<--HERE' */
vFAIL("\\N{NAME} must be resolved by the lexer");
/* Convert code point from hex */
length_of_hex = (STRLEN)(endchar - RExC_parse);
grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
+ | PERL_SCAN_DISALLOW_PREFIX
- /* No errors in the first pass (See [perl
- * #122671].) We let the code below find the
- * errors when there are multiple chars. */
- | ((SIZE_ONLY)
- ? PERL_SCAN_SILENT_ILLDIGIT
- : 0);
+ /* No errors in the first pass (See [perl
+ * #122671].) We let the code below find the
+ * errors when there are multiple chars. */
+ | ((SIZE_ONLY)
+ ? PERL_SCAN_SILENT_ILLDIGIT
+ : 0);
/* This routine is the one place where both single- and double-quotish
* \N{U+xxxx} are evaluated. The value is a Unicode code point which
* must be converted to native. */
*code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
- &length_of_hex,
- &grok_hex_flags,
- NULL));
+ &length_of_hex,
+ &grok_hex_flags,
+ NULL));
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check. Don't do the check
}
/* Fail if caller doesn't want to handle a multi-code-point sequence.
- * But don't backup up the pointer if the caller want to know how many
+ * But don't backup up the pointer if the caller wants to know how many
* code points there are (they can then handle things) */
if (! node_p) {
if (! cp_count) {
}
sv_catpv(substitute_parse, ")");
- RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
- len);
+ len = SvCUR(substitute_parse);
/* Don't allow empty number */
if (len < (STRLEN) 8) {
RExC_parse = endbrace;
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
+
+ RExC_parse = RExC_start = RExC_adjusted_start
+ = SvPV_nolen(substitute_parse);
RExC_end = RExC_parse + len;
/* The values are Unicode, and therefore not subject to recoding, but
RExC_recode_x_to_native = 1;
#endif
- if (node_p) {
- if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
- if (flags & (RESTART_PASS1|NEED_UTF8)) {
- *flagp = flags & (RESTART_PASS1|NEED_UTF8);
- return FALSE;
- }
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
- (UV) flags);
- }
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
- }
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
/* Restore the saved values */
RExC_start = RExC_adjusted_start = save_start;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
-
SvREFCNT_dec_NN(substitute_parse);
+
+ if (! *node_p) {
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return FALSE;
+ }
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
+ (UV) flags);
+ }
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+
nextchar(pRExC_state);
return TRUE;
else {
STRLEN length;
char name = *RExC_parse;
- char * endbrace;
+ char * endbrace = NULL;
RExC_parse += 2;
- endbrace = strchr(RExC_parse, '}');
+ if (RExC_parse < RExC_end) {
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
+ }
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
}*/
switch (*RExC_parse) {
case 'g':
- if (length != 1
- && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ if ( length != 1
+ && (memNEs(RExC_parse + 1, length - 1, "cb")))
{
goto bad_bound_type;
}
char *p;
char *s;
#define MAX_NODE_STRING_SIZE 127
- char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1];
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
U8 node_type = compute_EXACTish(pRExC_state);
goto loopdone;
}
p = RExC_parse;
+ RExC_parse = parse_start;
if (ender > 0xff) {
REQUIRE_UTF8(flagp);
}
const char* error_msg;
bool valid = grok_bslash_o(&p,
+ RExC_end,
&result,
&error_msg,
PASS2, /* out warnings */
const char* error_msg;
bool valid = grok_bslash_x(&p,
+ RExC_end,
&result,
&error_msg,
PASS2, /* out warnings */
* character we are appending, hence we can delay getting
* its representation until PASS2. */
if (SIZE_ONLY) {
- if (UTF) {
+ if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
const STRLEN unilen = UVCHR_SKIP(ender);
s += unilen;
}
} else { /* PASS2 */
not_fold_common:
- if (UTF) {
+ if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
len += (char *) new_s - s - 1;
s = (char *) new_s;
REPORT_LOCATION_ARGS(p))); \
} \
} STMT_END
+#define CLEAR_POSIX_WARNINGS() \
+ STMT_START { \
+ if (posix_warnings && RExC_warn_text) \
+ av_clear(RExC_warn_text); \
+ } STMT_END
+
+#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
+ STMT_START { \
+ CLEAR_POSIX_WARNINGS(); \
+ return ret; \
+ } STMT_END
STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
*
* The syntax for a legal posix class is:
*
- * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
+ * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
*
* What this routine considers syntactically to be an intended posix class
* is this (the comments indicate some restrictions that the pattern
* # for it to be considered to be
* # an intended posix class.
* \h*
- * [:punct:]? # The closing class character,
+ * [[:punct:]]? # The closing class character,
* # possibly omitted. If not a colon
* # nor semi colon, the class name
* # must be even closer to a valid
* decide that no posix class was intended. Should be at least
* sizeof("alphanumeric") */
UV input_text[15];
+ STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
- if (posix_warnings && RExC_warn_text)
- av_clear(RExC_warn_text);
+ CLEAR_POSIX_WARNINGS();
if (p >= e) {
return NOT_MEANT_TO_BE_A_POSIX_CLASS;
*updated_parse_ptr = (char *) temp_ptr;
}
- return OOB_NAMEDCLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
}
}
/* We consider something like [^:^alnum:]] to not have been intended to
* be a posix class, but XXX maybe we should */
if (complement) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
complement = 1;
* this leaves this construct looking like [:] or [:^], which almost
* certainly weren't intended to be posix classes */
if (has_opening_bracket) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* But this function can be called when we parse the colon for
/* XXX We are currently very restrictive here, so this code doesn't
* consider the possibility that, say, /[alpha.]]/ was intended to
* be a posix class. */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Here we have something like 'foo:]'. There was no initial colon,
}
/* Otherwise, it can't have meant to have been a class */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* If we ran off the end, and the final character was a punctuation
* class name. (We can do this on the first pass, as any second pass
* will yield an even shorter name) */
if (name_len < 3) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Find which class it is. Initially switch on the length of the name.
* */
switch (name_len) {
case 4:
- if (memEQ(name_start, "word", 4)) {
+ if (memEQs(name_start, 4, "word")) {
/* this is not POSIX, this is the Perl \w */
class_number = ANYOF_WORDCHAR;
}
* Offset 4 gives the best switch position. */
switch (name_start[4]) {
case 'a':
- if (memEQ(name_start, "alph", 4)) /* alpha */
+ if (memBEGINs(name_start, 5, "alph")) /* alpha */
class_number = ANYOF_ALPHA;
break;
case 'e':
- if (memEQ(name_start, "spac", 4)) /* space */
+ if (memBEGINs(name_start, 5, "spac")) /* space */
class_number = ANYOF_SPACE;
break;
case 'h':
- if (memEQ(name_start, "grap", 4)) /* graph */
+ if (memBEGINs(name_start, 5, "grap")) /* graph */
class_number = ANYOF_GRAPH;
break;
case 'i':
- if (memEQ(name_start, "asci", 4)) /* ascii */
+ if (memBEGINs(name_start, 5, "asci")) /* ascii */
class_number = ANYOF_ASCII;
break;
case 'k':
- if (memEQ(name_start, "blan", 4)) /* blank */
+ if (memBEGINs(name_start, 5, "blan")) /* blank */
class_number = ANYOF_BLANK;
break;
case 'l':
- if (memEQ(name_start, "cntr", 4)) /* cntrl */
+ if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
class_number = ANYOF_CNTRL;
break;
case 'm':
- if (memEQ(name_start, "alnu", 4)) /* alnum */
+ if (memBEGINs(name_start, 5, "alnu")) /* alnum */
class_number = ANYOF_ALPHANUMERIC;
break;
case 'r':
- if (memEQ(name_start, "lowe", 4)) /* lower */
+ if (memBEGINs(name_start, 5, "lowe")) /* lower */
class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
- else if (memEQ(name_start, "uppe", 4)) /* upper */
+ else if (memBEGINs(name_start, 5, "uppe")) /* upper */
class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
break;
case 't':
- if (memEQ(name_start, "digi", 4)) /* digit */
+ if (memBEGINs(name_start, 5, "digi")) /* digit */
class_number = ANYOF_DIGIT;
- else if (memEQ(name_start, "prin", 4)) /* print */
+ else if (memBEGINs(name_start, 5, "prin")) /* print */
class_number = ANYOF_PRINT;
- else if (memEQ(name_start, "punc", 4)) /* punct */
+ else if (memBEGINs(name_start, 5, "punc")) /* punct */
class_number = ANYOF_PUNCT;
break;
}
break;
case 6:
- if (memEQ(name_start, "xdigit", 6))
+ if (memEQs(name_start, 6, "xdigit"))
class_number = ANYOF_XDIGIT;
break;
}
}
/* Here neither pass found a close-enough class name */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
probably_meant_to_be:
/* If it is a known class, return the class. The class number
* #defines are structured so each complement is +1 to the normal
* one */
- return class_number + complement;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
}
else if (! check_only) {
'stack' of where the undealt-with left
parens would be if they were actually
put there */
- /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
+ /* The 'volatile' is a workaround for an optimiser bug
* in Solaris Studio 12.3. See RT #127455 */
- VOL IV fence = 0; /* Position of where most recent undealt-
+ volatile IV fence = 0; /* Position of where most recent undealt-
with left paren in stack is; -1 if none.
*/
STRLEN len; /* Temporary */
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+ DEBUG_PARSE("xcls");
+
if (in_locale) {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
- UV depth = 0; /* how many nested (?[...]) constructs */
+ UV nest_depth = 0; /* how many nested (?[...]) constructs */
while (RExC_parse < RExC_end) {
SV* current = NULL;
TRUE /* Force /x */ );
switch (*RExC_parse) {
- case '?':
- if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ case '(':
+ if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
+ nest_depth++, RExC_parse+=2;
/* FALLTHROUGH */
default:
break;
}
case ']':
- if (depth--) break;
- RExC_parse++;
- if (*RExC_parse == ')') {
+ if (RExC_parse[1] == ')') {
+ RExC_parse++;
+ if (nest_depth--) break;
node = reganode(pRExC_state, ANYOF, 0);
RExC_size += ANYOF_SKIP;
nextchar(pRExC_state);
return node;
}
- goto no_close;
+ /* We output the messages even if warnings are off, because we'll fail
+ * the very next thing, and these give a likely diagnosis for that */
+ if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
+ }
+ RExC_parse++;
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
}
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
- no_close:
/* We output the messages even if warnings are off, because we'll fail
* the very next thing, and these give a likely diagnosis for that */
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
}
- FAIL("Syntax error in (?[...])");
+ vFAIL("Syntax error in (?[...])");
}
/* Pass 2 only after this. */
* inversion list, and RExC_parse points to the trailing
* ']'; the next character should be the ')' */
RExC_parse++;
- assert(UCHARAT(RExC_parse) == ')');
+ if (UCHARAT(RExC_parse) != ')')
+ vFAIL("Expecting close paren for nested extended charclass");
/* Then the ')' matching the original '(' handled by this
* case: statement */
RExC_parse++;
- assert(UCHARAT(RExC_parse) == ')');
+ if (UCHARAT(RExC_parse) != ')')
+ vFAIL("Expecting close paren for wrapper for nested extended charclass");
RExC_parse++;
RExC_flags = save_flags;
do_posix_warnings ? &posix_warnings : NULL,
TRUE /* checking only */);
}
+ else if ( strict && ! skip_white
+ && ( _generic_isCC(value, _CC_VERTSPACE)
+ || is_VERTWS_cp_high(value)))
+ {
+ vFAIL("Literal vertical space in [] is illegal except under /x");
+ }
else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
vFAIL2("Empty \\%c", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
- e = strchr(RExC_parse, '}');
+ e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (!e) {
RExC_parse++;
vFAIL2("Missing right brace on \\%c{}", c);
* referred to outside it. [perl #121777] */
if (! has_pkg && curpkg) {
char* pkgname = HvNAME(curpkg);
- if (strNE(pkgname, "main")) {
+ if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
char* full_name = Perl_form(aTHX_
"%s::%s",
pkgname,
{
const char* error_msg;
bool valid = grok_bslash_o(&RExC_parse,
+ RExC_end,
&value,
&error_msg,
PASS2, /* warnings only in
{
const char* error_msg;
bool valid = grok_bslash_x(&RExC_parse,
+ RExC_end,
&value,
&error_msg,
PASS2, /* Output warnings */
{
/* Here <value> is indeed a multi-char fold. Get what it is */
- U8 foldbuf[UTF8_MAXBYTES_CASE];
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
UV folded = _to_uni_fold_flags(
" be some subset of \"0-9\","
" \"A-Z\", or \"a-z\"");
}
- else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+ else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
SSize_t index_start;
SSize_t index_final;
* can't do the same checks for above-ASCII ranges,
* except in the case of digit ones. These should
* contain only digits from the same group of 10. The
- * ASCII case is handled just above. 0x660 is the
- * first digit character beyond ASCII. Hence here, the
+ * ASCII case is handled just above. Hence here, the
* range could be a range of digits. First some
* unlikely special cases. Grandfather in that a range
* ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
/* The actual POSIXish node for all the rest depends on the
* charset modifier. The ones in the first set depend only on
* ASCII or, if available on this platform, also locale */
+
case ANYOF_ASCII:
case ANYOF_NASCII:
+
#ifdef HAS_ISASCII
- op = (LOC) ? POSIXL : POSIXA;
-#else
- op = POSIXA;
+ if (LOC) {
+ op = POSIXL;
+ goto join_posix;
+ }
#endif
- goto join_posix;
+ /* (named_class - ANYOF_ASCII) is 0 or 1. xor'ing with
+ * invert converts that to 1 or 0 */
+ op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
+ break;
/* The following don't have any matches in the upper Latin1
* range, hence /d is equivalent to /u for them. Making it /u
TRUE /* downgradable to EXACT */
);
}
+ else {
+ *flagp |= HASWIDTH|SIMPLE;
+ }
RExC_parse = (char *) cur_parse;
if (_invlist_len(only_non_utf8_list) != 0) {
ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
}
+ SvREFCNT_dec_NN(only_non_utf8_list);
}
else {
/* Here there were no complemented posix classes. That means
* certain common classes that are easy to test. Getting to this point in
* the code means that the class didn't get optimized there. Since this
* code is only executed in Pass 2, it is too late to save space--it has
- * been allocated in Pass 1, and currently isn't given back. But turning
- * things into an EXACTish node can allow the optimizer to join it to any
- * adjacent such nodes. And if the class is equivalent to things like /./,
- * expensive run-time swashes can be avoided. Now that we have more
- * complete information, we can find things necessarily missed by the
- * earlier code. Another possible "optimization" that isn't done is that
- * something like [Ee] could be changed into an EXACTFU. khw tried this
- * and found that the ANYOF is faster, including for code points not in the
- * bitmap. This still might make sense to do, provided it got joined with
- * an adjacent node(s) to create a longer EXACTFU one. This could be
- * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
- * routine would know is joinable. If that didn't happen, the node type
- * could then be made a straight ANYOF */
+ * been allocated in Pass 1, and currently isn't given back. XXX Why not?
+ * But turning things into an EXACTish node can allow the optimizer to join
+ * it to any adjacent such nodes. And if the class is equivalent to things
+ * like /./, expensive run-time swashes can be avoided. Now that we have
+ * more complete information, we can find things necessarily missed by the
+ * earlier code. */
if (optimizable && cp_list && ! invert) {
UV start, end;
U8 op = END; /* The optimzation node-type */
int posix_class = -1; /* Illegal value */
const char * cur_parse= RExC_parse;
+ U8 ANYOFM_mask = 0xFF;
+ U32 anode_arg = 0;
invlist_iterinit(cp_list);
if (! invlist_iternext(cp_list, &start, &end)) {
invlist_iterfinish(cp_list);
if (op == END) {
- const UV cp_list_len = _invlist_len(cp_list);
- const UV* cp_list_array = invlist_array(cp_list);
/* Here, didn't find an optimization. See if this matches any of
- * the POSIX classes. These run slightly faster for above-Unicode
- * code points, so don't bother with POSIXA ones nor the 2 that
- * have no above-Unicode matches. We can avoid these checks unless
- * the ANYOF matches at least as high as the lowest POSIX one
- * (which was manually found to be \v. The actual code point may
- * increase in later Unicode releases, if a higher code point is
- * assigned to be \v, but this code will never break. It would
- * just mean we could execute the checks for posix optimizations
- * unnecessarily) */
-
- if (cp_list_array[cp_list_len-1] > 0x2029) {
+ * the POSIX classes. First try ASCII */
+
+ if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
+ op = ASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
+ op = NASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (invlist_highest(cp_list) >= 0x2029) {
+
+ /* Then try the other POSIX classes. The POSIXA ones are about
+ * the same speed as ANYOF ops, but the ones that have
+ * above-Latin1 code point matches are somewhat faster than
+ * ANYOF. So optimize those, but don't bother with the POSIXA
+ * ones nor [:cntrl:] which has no above-Latin1 matches. If
+ * this ANYOF node has a lower highest possible matching code
+ * point than any of the XPosix ones, we know that it can't
+ * possibly be the same as any of them, so we can avoid
+ * executing this code. The 0x2029 above for the lowest max
+ * was determined by manual inspection of the classes, and
+ * comes from \v. Suppose Unicode in a later version adds a
+ * higher code point to \v. All that means is that this code
+ * can be executed unnecessarily. It will still give the
+ * correct answer. */
+
for (posix_class = 0;
posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
posix_class++)
{
int try_inverted;
- if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+
+ if (posix_class == _CC_CNTRL) {
continue;
}
+
for (try_inverted = 0; try_inverted < 2; try_inverted++) {
/* Check if matches normal or inverted */
}
found_posix: ;
}
+
+ /* If it didn't match a POSIX class, it might be able to be turned
+ * into an ANYOFM node. Compare two different bytes, bit-by-bit.
+ * In some positions, the bits in each will be 1; and in other
+ * positions both will be 0; and in some positions the bit will be
+ * 1 in one byte, and 0 in the other. Let 'n' be the number of
+ * positions where the bits differ. We create a mask which has
+ * exactly 'n' 0 bits, each in a position where the two bytes
+ * differ. Now take the set of all bytes that when ANDed with the
+ * mask yield the same result. That set has 2**n elements, and is
+ * representable by just two 8 bit numbers: the result and the
+ * mask. Importantly, matching the set can be vectorized by
+ * creating a word full of the result bytes, and a word full of the
+ * mask bytes, yielding a significant speed up. Here, see if this
+ * node matches such a set. As a concrete example consider [01],
+ * and the byte representing '0' which is 0x30 on ASCII machines.
+ * It has the bits 0011 0000. Take the mask 1111 1110. If we AND
+ * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed
+ * yield something else. So [01], which is a common usage, is
+ * optimizable into ANYOFM, and can benefit from the speed up. We
+ * can only do this on UTF-8 invariant bytes, because the variance
+ * would throw this off. */
+ if ( op == END
+ && invlist_highest(cp_list) <=
+#ifdef EBCDIC
+ 0xFF
+#else
+ 0x7F
+#endif
+ ) {
+ Size_t cp_count = 0;
+ bool first_time = TRUE;
+ unsigned int lowest_cp = 0xFF;
+ U8 bits_differing = 0;
+
+ /* Only needed on EBCDIC, as there, variants and non- are mixed
+ * together. Could #ifdef it out on ASCII, but probably the
+ * compiler will optimize it out */
+ bool has_variant = FALSE;
+
+ /* Go through the bytes and find the bit positions that differ */
+ invlist_iterinit(cp_list);
+ while (invlist_iternext(cp_list, &start, &end)) {
+ unsigned int i = start;
+
+ cp_count += end - start + 1;
+
+ if (first_time) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ first_time = FALSE;
+ lowest_cp = start;
+
+ i++;
+ }
+
+ /* Find the bit positions that differ from the lowest code
+ * point in the node. Keep track of all such positions by
+ * OR'ing */
+ for (; i <= end; i++) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ bits_differing |= i ^ lowest_cp;
+ }
+ }
+ invlist_iterfinish(cp_list);
+
+ /* At the end of the loop, we count how many bits differ from
+ * the bits in lowest code point, call the count 'd'. If the
+ * set we found contains 2**d elements, it is the closure of
+ * all code points that differ only in those bit positions. To
+ * convince yourself of that, first note that the number in the
+ * closure must be a power of 2, which we test for. The only
+ * way we could have that count and it be some differing set,
+ * is if we got some code points that don't differ from the
+ * lowest code point in any position, but do differ from each
+ * other in some other position. That means one code point has
+ * a 1 in that position, and another has a 0. But that would
+ * mean that one of them differs from the lowest code point in
+ * that position, which possibility we've already excluded. */
+ if ( ! has_variant
+ && cp_count == 1U << PL_bitcount[bits_differing])
+ {
+ assert(cp_count > 1);
+ op = ANYOFM;
+
+ /* We need to make the bits that differ be 0's */
+ ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
+
+ /* The argument is the lowest code point */
+ anode_arg = lowest_cp;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ }
}
if (op != END) {
RExC_emit = (regnode *)orig_emit;
if (regarglen[op]) {
- ret = reganode(pRExC_state, op, 0);
+ ret = reganode(pRExC_state, op, anode_arg);
} else {
ret = reg_node(pRExC_state, op);
}
else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
FLAGS(ret) = posix_class;
}
+ else if (PL_regkind[op] == ANYOFM) {
+ FLAGS(ret) = ANYOFM_mask;
+ }
SvREFCNT_dec_NN(cp_list);
return ret;
* if (PASS2)
* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
+* ALSO NOTE - operand->flags will be set to 0 as well.
*/
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
#endif
}
-
place = operand; /* Op node, where operand used to be. */
#ifdef RE_TRACK_PATTERN_OFFSETS
if (RExC_offsets) { /* MJD */
}
#endif
src = NEXTOPER(place);
+ place->flags = 0;
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
}
}
#endif
+STATIC SV*
+S_get_ANYOFM_contents(pTHX_ const regnode * n) {
+
+ /* Returns an inversion list of all the code points matched by the ANYOFM
+ * node 'n' */
+
+ SV * cp_list = _new_invlist(-1);
+ const U8 lowest = ARG(n);
+ unsigned int i;
+ U8 count = 0;
+ U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
+
+ PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
+
+ /* Starting with the lowest code point, any code point that ANDed with the
+ * mask yields the lowest code point is in the set */
+ for (i = lowest; i <= 0xFF; i++) {
+ if ((i & FLAGS(n)) == ARG(n)) {
+ cp_list = add_cp_to_invlist(cp_list, i);
+ count++;
+
+ /* We know how many code points (a power of two) that are in the
+ * set. No use looking once we've got that number */
+ if (count >= needed) break;
+ }
+ }
+
+ return cp_list;
+}
+
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
+ int i;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
/* Header fields of interest. */
- if (r->anchored_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
- RE_SV_DUMPLEN(r->anchored_substr), 30);
- Perl_re_printf( aTHX_
- "anchored %s%s at %" IVdf " ",
- s, RE_SV_TAIL(r->anchored_substr),
- (IV)r->anchored_offset);
- } else if (r->anchored_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
- RE_SV_DUMPLEN(r->anchored_utf8), 30);
- Perl_re_printf( aTHX_
- "anchored utf8 %s%s at %" IVdf " ",
- s, RE_SV_TAIL(r->anchored_utf8),
- (IV)r->anchored_offset);
- }
- if (r->float_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
- RE_SV_DUMPLEN(r->float_substr), 30);
- Perl_re_printf( aTHX_
- "floating %s%s at %" IVdf "..%" UVuf " ",
- s, RE_SV_TAIL(r->float_substr),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
- } else if (r->float_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
- RE_SV_DUMPLEN(r->float_utf8), 30);
- Perl_re_printf( aTHX_
- "floating utf8 %s%s at %" IVdf "..%" UVuf " ",
- s, RE_SV_TAIL(r->float_utf8),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
+ for (i = 0; i < 2; i++) {
+ if (r->substrs->data[i].substr) {
+ RE_PV_QUOTED_DECL(s, 0, dsv,
+ SvPVX_const(r->substrs->data[i].substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].substr),
+ PL_dump_re_max_len);
+ Perl_re_printf( aTHX_
+ "%s %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ else if (r->substrs->data[i].utf8_substr) {
+ RE_PV_QUOTED_DECL(s, 1, dsv,
+ SvPVX_const(r->substrs->data[i].utf8_substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].utf8_substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
}
+
if (r->check_substr || r->check_utf8)
Perl_re_printf( aTHX_
(const char *)
- (r->check_substr == r->float_substr
- && r->check_utf8 == r->float_utf8
+ ( r->check_substr == r->substrs->data[1].substr
+ && r->check_utf8 == r->substrs->data[1].utf8_substr
? "(checking floating" : "(checking anchored"));
if (r->intflags & PREGf_NOSCAN)
Perl_re_printf( aTHX_ " noscan");
* is a crude hack but it may be the best for now since
* we have no flag "this EXACTish node was UTF-8"
* --jhi */
- pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
+ pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
PERL_PV_ESCAPE_UNI_DETECT |
PERL_PV_ESCAPE_NONASCII |
PERL_PV_PRETTY_ELLIPSES |
if ( k == REF && reginfo) {
U32 n = ARG(o); /* which paren pair */
I32 ln = prog->offs[n].start;
- if (prog->lastparen < n || ln == -1)
+ if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
Perl_sv_catpvf(aTHX_ sv, ": FAIL");
else if (ln == prog->offs[n].end)
Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
SV* contents;
/* See if truncation size is overridden */
- const STRLEN dump_len = (PL_dump_re_max_len)
+ const STRLEN dump_len = (PL_dump_re_max_len > 256)
? PL_dump_re_max_len
: 256;
SvREFCNT_dec(unresolved);
}
+ else if (k == ANYOFM) {
+ SV * cp_list = get_ANYOFM_contents(o);
+
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
if (index < C_ARRAY_LENGTH(anyofs)) {
/* add on the verb argument if there is one */
if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
- Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
+ if ( ARG(o) )
+ Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ else
+ sv_catpvs(sv, ":NULL");
}
#else
PERL_UNUSED_CONTEXT;
PL_colors[5],PL_colors[0],
s,
PL_colors[1],
- (strlen(s) > 60 ? "..." : ""));
+ (strlen(s) > PL_dump_re_max_len ? "..." : ""));
} );
/* use UTF8 check substring if regexp pattern itself is in UTF8 */
} else {
CALLREGFREE_PVT(rx); /* free the private data */
SvREFCNT_dec(RXp_PAREN_NAMES(r));
- Safefree(r->xpv_len_u.xpvlenu_pv);
}
if (r->substrs) {
- SvREFCNT_dec(r->anchored_substr);
- SvREFCNT_dec(r->anchored_utf8);
- SvREFCNT_dec(r->float_substr);
- SvREFCNT_dec(r->float_utf8);
+ int i;
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_dec(r->substrs->data[i].substr);
+ SvREFCNT_dec(r->substrs->data[i].utf8_substr);
+ }
Safefree(r->substrs);
}
RX_MATCH_COPY_FREE(rx);
SvREFCNT_dec(r->qr_anoncv);
if (r->recurse_locinput)
Safefree(r->recurse_locinput);
- rx->sv_u.svu_rx = 0;
}
+
/* reg_temp_copy()
- This is a hacky workaround to the structural issue of match results
+ Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
+ except that dsv will be created if NULL.
+
+ This function is used in two main ways. First to implement
+ $r = qr/....; $s = $$r;
+
+ Secondly, it is used as a hacky workaround to the structural issue of
+ match results
being stored in the regexp structure which is in turn stored in
PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
could be PL_curpm in multiple contexts, and could require multiple
REGEXP *
-Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
+Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
{
- struct regexp *ret;
- struct regexp *const r = ReANY(rx);
- const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
+ struct regexp *drx;
+ struct regexp *const srx = ReANY(ssv);
+ const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
- if (!ret_x)
- ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ if (!dsv)
+ dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
- SvOK_off((SV *)ret_x);
+ SvOK_off((SV *)dsv);
if (islv) {
- /* For PVLVs, SvANY points to the xpvlv body while sv_u points
- to the regexp. (For SVt_REGEXPs, sv_upgrade has already
- made both spots point to the same regexp body.) */
+ /* For PVLVs, the head (sv_any) points to an XPVLV, while
+ * the LV's xpvlenu_rx will point to a regexp body, which
+ * we allocate here */
REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
- assert(!SvPVX(ret_x));
- ret_x->sv_u.svu_rx = temp->sv_any;
+ assert(!SvPVX(dsv));
+ ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
temp->sv_any = NULL;
SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
SvREFCNT_dec_NN(temp);
/* SvCUR still resides in the xpvlv struct, so the regexp copy-
ing below will not set it. */
- SvCUR_set(ret_x, SvCUR(rx));
+ SvCUR_set(dsv, SvCUR(ssv));
}
}
/* This ensures that SvTHINKFIRST(sv) is true, and hence that
sv_force_normal(sv) is called. */
- SvFAKE_on(ret_x);
- ret = ReANY(ret_x);
+ SvFAKE_on(dsv);
+ drx = ReANY(dsv);
- SvFLAGS(ret_x) |= SvUTF8(rx);
+ SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
+ SvPV_set(dsv, RX_WRAPPED(ssv));
/* We share the same string buffer as the original regexp, on which we
hold a reference count, incremented when mother_re is set below.
The string pointer is copied here, being part of the regexp struct.
*/
- memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
- if (r->offs) {
- const I32 npar = r->nparens+1;
- Newx(ret->offs, npar, regexp_paren_pair);
- Copy(r->offs, ret->offs, npar, regexp_paren_pair);
- }
- if (r->substrs) {
- Newx(ret->substrs, 1, struct reg_substr_data);
- StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
+ if (!islv)
+ SvLEN_set(dsv, 0);
+ if (srx->offs) {
+ const I32 npar = srx->nparens+1;
+ Newx(drx->offs, npar, regexp_paren_pair);
+ Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
+ }
+ if (srx->substrs) {
+ int i;
+ Newx(drx->substrs, 1, struct reg_substr_data);
+ StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
- SvREFCNT_inc_void(ret->anchored_substr);
- SvREFCNT_inc_void(ret->anchored_utf8);
- SvREFCNT_inc_void(ret->float_substr);
- SvREFCNT_inc_void(ret->float_utf8);
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_inc_void(drx->substrs->data[i].substr);
+ SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
}
- RX_MATCH_COPIED_off(ret_x);
+ RX_MATCH_COPIED_off(dsv);
#ifdef PERL_ANY_COW
- ret->saved_copy = NULL;
+ drx->saved_copy = NULL;
#endif
- ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
- SvREFCNT_inc_void(ret->qr_anoncv);
- if (r->recurse_locinput)
- Newxz(ret->recurse_locinput,r->nparens + 1,char *);
+ drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
+ SvREFCNT_inc_void(drx->qr_anoncv);
+ if (srx->recurse_locinput)
+ Newx(drx->recurse_locinput,srx->nparens + 1,char *);
- return ret_x;
+ return dsv;
}
#endif
+
/* regfree_internal()
Free the private data in a regexp. This is overloadable by
{
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
- dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
+ dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
PL_colors[4],PL_colors[5],s);
}
/* Do it this way to avoid reading from *r after the StructCopy().
That way, if any of the sv_dup_inc()s dislodge *r from the L1
cache, it doesn't matter. */
+ int i;
const bool anchored = r->check_substr
- ? r->check_substr == r->anchored_substr
- : r->check_utf8 == r->anchored_utf8;
+ ? r->check_substr == r->substrs->data[0].substr
+ : r->check_utf8 == r->substrs->data[0].utf8_substr;
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
- ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
- ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
- ret->float_substr = sv_dup_inc(ret->float_substr, param);
- ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
+ for (i = 0; i < 2; i++) {
+ ret->substrs->data[i].substr =
+ sv_dup_inc(ret->substrs->data[i].substr, param);
+ ret->substrs->data[i].utf8_substr =
+ sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
if (ret->check_substr) {
if (anchored) {
- assert(r->check_utf8 == r->anchored_utf8);
- ret->check_substr = ret->anchored_substr;
- ret->check_utf8 = ret->anchored_utf8;
+ assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[0].substr;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- assert(r->check_substr == r->float_substr);
- assert(r->check_utf8 == r->float_utf8);
- ret->check_substr = ret->float_substr;
- ret->check_utf8 = ret->float_utf8;
+ assert(r->check_substr == r->substrs->data[1].substr);
+ assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[1].substr;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
} else if (ret->check_utf8) {
if (anchored) {
- ret->check_utf8 = ret->anchored_utf8;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- ret->check_utf8 = ret->float_utf8;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
}
}
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
if (r->recurse_locinput)
- Newxz(ret->recurse_locinput,r->nparens + 1,char *);
+ Newx(ret->recurse_locinput,r->nparens + 1,char *);
if (ret->pprivate)
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
1: a buffer in a different thread
2: something we no longer hold a reference on
so we need to copy it locally. */
- RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
+ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
/* see also regcomp.h and regfree_internal() */
- case 'a': /* actually an AV, but the dup function is identical. */
- case 'r':
- case 's':
- case 'S':
- case 'u': /* actually an HV, but the dup function is identical. */
+ case 'a': /* actually an AV, but the dup function is identical.
+ values seem to be "plain sv's" generally. */
+ case 'r': /* a compiled regex (but still just another SV) */
+ case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
+ this use case should go away, the code could have used
+ 'a' instead - see S_set_ANYOF_arg() for array contents. */
+ case 'S': /* actually an SV, but the dup function is identical. */
+ case 'u': /* actually an HV, but the dup function is identical.
+ values are "plain sv's" */
d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
break;
case 'f':
+ /* Synthetic Start Class - "Fake" charclass we generate to optimize
+ * patterns which could start with several different things. Pre-TRIE
+ * this was more important than it is now, however this still helps
+ * in some places, for instance /x?a+/ might produce a SSC equivalent
+ * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
+ * in regexec.c
+ */
/* This is cheating. */
Newx(d->data[i], 1, regnode_ssc);
StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
reti->regstclass = (regnode*)d->data[i];
break;
case 'T':
- /* Trie stclasses are readonly and can thus be shared
+ /* AHO-CORASICK fail table */
+ /* Trie stclasses are readonly and can thus be shared
* without duplication. We free the stclass in pregfree
* when the corresponding reg_ac_data struct is freed.
*/
reti->regstclass= ri->regstclass;
/* FALLTHROUGH */
case 't':
+ /* TRIE transition table */
OP_REFCNT_LOCK;
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
/* FALLTHROUGH */
- case 'l':
- case 'L':
+ case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
+ case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
+ is not from another regexp */
d->data[i] = ri->data->data[i];
break;
default:
#else
format = "\\x%02" UVXf "-\\x%02" UVXf;
#endif
- GCC_DIAG_IGNORE(-Wformat-nonliteral);
+ GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
- GCC_DIAG_RESTORE;
+ GCC_DIAG_RESTORE_STMT;
break;
}
}
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class defined by the other arguments:
- * 'bitmap' points to the bitmap.
+ * 'bitmap' points to the bitmap, or NULL if to ignore that.
* 'nonbitmap_invlist' is an inversion list of the code points that are in
* the bitmap range, but for some reason aren't in the bitmap; NULL if
* none. The reasons for this could be that they require some
* was not resolved at the time of the regex compilation (under /u)
* 'only_utf8_locale_invlist' is an inversion list of the code points that
* are valid only if the runtime locale is a UTF-8 one; NULL if none
- * 'node' is the regex pattern node. It is needed only when the above two
- * parameters are not null, and is passed so that this routine can
- * tease apart the various reasons for them.
+ * 'node' is the regex pattern ANYOF node. It is needed only when the
+ * above two parameters are not null, and is passed so that this
+ * routine can tease apart the various reasons for them.
* 'force_as_is_display' is TRUE if this routine should definitely NOT try
* to invert things to see if that leads to a cleaner display. If
* FALSE, this routine is free to use its judgment about doing this.
}
/* Accumulate the bit map into the unconditional match list */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (BITMAP_TEST(bitmap, i)) {
- int start = i++;
- for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
- /* empty */
+ if (bitmap) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ int start = i++;
+ for (;
+ i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
+ i++)
+ { /* empty */ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
}
- invlist = _add_range_to_invlist(invlist, start, i-1);
}
}
/* While that wasn't END last time... */
NODE_ALIGN(node);
op = OP(node);
- if (op == CLOSE || op == WHILEM)
+ if (op == CLOSE || op == SRCLOSE || op == WHILEM)
indent--;
next = regnext((regnode *)node);
indent+3,
elem_ptr
? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
- SvCUR(*elem_ptr), 60,
+ SvCUR(*elem_ptr), PL_dump_re_max_len,
PL_colors[0], PL_colors[1],
(SvUTF8(*elem_ptr)
? PERL_PV_ESCAPE_UNI
node = NEXTOPER(node);
node += regarglen[(U8)op];
}
- if (op == CURLYX || op == OPEN)
+ if (op == CURLYX || op == OPEN || op == SROPEN)
indent++;
}
CLEAR_OPTSTART;