regnode *next_regnode; /* next node to process when last is reached */
U32 prev_recursed_depth;
I32 stopparen; /* what stopparen do we use */
- U32 is_top_frame; /* what flags do we use? */
struct scan_frame *this_prev_frame; /* this previous frame */
struct scan_frame *prev_frame; /* previous frame */
/* Certain characters are output as a sequence with the first being a
* backslash. */
-#define isBACKSLASHED_PUNCT(c) \
- ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
struct RExC_state_t {
I32 sawback; /* Did we see \1, ...? */
U32 seen;
SSize_t size; /* Code size. */
- I32 npar; /* Capture buffer count, (OPEN) plus
+ I32 npar; /* Capture buffer count, (OPEN) plus
one. ("par" 0 is the whole
pattern)*/
I32 nestroot; /* root parens we are in - used by
bool seen_unfolded_sharp_s;
bool strict;
bool study_started;
+ bool in_script_run;
};
#define RExC_flags (pRExC_state->flags)
#define RExC_strict (pRExC_state->strict)
#define RExC_study_started (pRExC_state->study_started)
#define RExC_warn_text (pRExC_state->warn_text)
+#define RExC_in_script_run (pRExC_state->in_script_run)
/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
* a flag to disable back-off on the fixed/floating substrings - if it's
/* Change from /d into /u rules, and restart the parse if we've already seen
* something whose size would increase as a result, by setting *flagp and
* returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
- * we've change to /u during the parse. */
+ * we've changed to /u during the parse. */
#define REQUIRE_UNI_RULES(flagp, restart_retval) \
STMT_START { \
if (DEPENDS_SEMANTICS) { \
For each string some basic information is maintained:
- - offset or min_offset
+ - min_offset
This is the position the string must appear at, or not before.
It also implicitly (when combined with minlenp) tells us how many
characters must match before the string we are searching for.
Only used for floating strings. This is the rightmost point that
the string can appear at. If set to SSize_t_MAX it indicates that the
string can occur infinitely far to the right.
+ For fixed strings, it is equal to min_offset.
- minlenp
A pointer to the minimum number of characters of the pattern that the
*/
+struct scan_data_substrs {
+ SV *str; /* longest substring found in pattern */
+ SSize_t min_offset; /* earliest point in string it can appear */
+ SSize_t max_offset; /* latest point in string it can appear */
+ SSize_t *minlenp; /* pointer to the minlen relevant to the string */
+ SSize_t lookbehind; /* is the pos of the string modified by LB */
+ I32 flags; /* per substring SF_* and SCF_* flags */
+};
+
typedef struct scan_data_t {
/*I32 len_min; unused */
/*I32 len_delta; unused */
SSize_t last_end; /* min value, <0 unless valid. */
SSize_t last_start_min;
SSize_t last_start_max;
- SV **longest; /* Either &l_fixed, or &l_float. */
- SV *longest_fixed; /* longest fixed string found in pattern */
- SSize_t offset_fixed; /* offset where it starts */
- SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
- I32 lookbehind_fixed; /* is the position of the string modfied by LB */
- SV *longest_float; /* longest floating string found in pattern */
- SSize_t offset_float_min; /* earliest point in string it can appear */
- SSize_t offset_float_max; /* latest point in string it can appear */
- SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
- SSize_t lookbehind_float; /* is the pos of the string modified by LB */
- I32 flags;
+ U8 cur_is_floating; /* whether the last_* values should be set as
+ * the next fixed (0) or floating (1)
+ * substring */
+
+ /* [0] is longest fixed substring so far, [1] is longest float so far */
+ struct scan_data_substrs substrs[2];
+
+ I32 flags; /* common SF_* and SCF_* flags */
I32 whilem_c;
SSize_t *last_closep;
regnode_ssc *start_class;
* Forward declarations for pregcomp()'s friends.
*/
-static const scan_data_t zero_scan_data =
- { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
+static const scan_data_t zero_scan_data = {
+ 0, 0, NULL, 0, 0, 0, 0,
+ {
+ { NULL, 0, 0, 0, 0, 0 },
+ { NULL, 0, 0, 0, 0, 0 },
+ },
+ 0, 0, NULL, NULL
+};
+
+/* study flags */
-#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
#define SF_BEFORE_SEOL 0x0001
#define SF_BEFORE_MEOL 0x0002
-#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
-#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
-
-#define SF_FIX_SHIFT_EOL (+2)
-#define SF_FL_SHIFT_EOL (+4)
-
-#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
-#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
+#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
-#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
-#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
#define SF_IS_INF 0x0040
#define SF_HAS_PAR 0x0080
#define SF_IN_PAR 0x0100
UTF8fARG(UTF, \
(xI(xC) > eC) /* Don't run off end */ \
? eC - sC /* Length before the <--HERE */ \
- : xI_offset(xC), \
+ : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ), \
sC), /* The input pattern printed up to the <--HERE */ \
UTF8fARG(UTF, \
(xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
-#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
- if ( ( flags ) ) { \
- Perl_re_printf( aTHX_ "%s", open_str); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
- DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
- Perl_re_printf( aTHX_ "%s", close_str); \
- }
-
-
-#define DEBUG_STUDYDATA(str,data,depth) \
-DEBUG_OPTIMISE_MORE_r(if(data){ \
- Perl_re_indentf( aTHX_ "" str "Pos:%" IVdf "/%" IVdf \
- " Flags: 0x%" UVXf, \
- depth, \
- (IV)((data)->pos_min), \
- (IV)((data)->pos_delta), \
- (UV)((data)->flags) \
- ); \
- DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
- Perl_re_printf( aTHX_ \
- " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \
- (IV)((data)->whilem_c), \
- (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
- is_inf ? "INF " : "" \
- ); \
- if ((data)->last_found) \
- Perl_re_printf( aTHX_ \
- "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \
- " %sFixed:'%s' @ %" IVdf \
- " %sFloat: '%s' @ %" IVdf "/%" IVdf, \
- SvPVX_const((data)->last_found), \
- (IV)((data)->last_end), \
- (IV)((data)->last_start_min), \
- (IV)((data)->last_start_max), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
- SvPVX_const((data)->longest_fixed), \
- (IV)((data)->offset_fixed), \
- ((data)->longest && \
- (data)->longest==&((data)->longest_float)) ? "*" : "", \
- SvPVX_const((data)->longest_float), \
- (IV)((data)->offset_float_min), \
- (IV)((data)->offset_float_max) \
- ); \
- Perl_re_printf( aTHX_ "\n"); \
-});
+
+#ifdef DEBUGGING
+static void
+S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
+ const char *close_str)
+{
+ if (!flags)
+ return;
+
+ Perl_re_printf( aTHX_ "%s", open_str);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
+ DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
+ Perl_re_printf( aTHX_ "%s", close_str);
+}
+
+
+static void
+S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
+ U32 depth, int is_inf)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_MORE_r({
+ if (!data)
+ return;
+ Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
+ depth,
+ where,
+ (IV)data->pos_min,
+ (IV)data->pos_delta,
+ (UV)data->flags
+ );
+
+ S_debug_show_study_flags(aTHX_ data->flags," [","]");
+
+ Perl_re_printf( aTHX_
+ " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
+ (IV)data->whilem_c,
+ (IV)(data->last_closep ? *((data)->last_closep) : -1),
+ is_inf ? "INF " : ""
+ );
+
+ if (data->last_found) {
+ int i;
+ Perl_re_printf(aTHX_
+ "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
+ SvPVX_const(data->last_found),
+ (IV)data->last_end,
+ (IV)data->last_start_min,
+ (IV)data->last_start_max
+ );
+
+ for (i = 0; i < 2; i++) {
+ Perl_re_printf(aTHX_
+ " %s%s: '%s' @ %" IVdf "/%" IVdf,
+ data->cur_is_floating == i ? "*" : "",
+ i ? "Float" : "Fixed",
+ SvPVX_const(data->substrs[i].str),
+ (IV)data->substrs[i].min_offset,
+ (IV)data->substrs[i].max_offset
+ );
+ S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
+ }
+ }
+
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+static void
+S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
+ regnode *scan, U32 depth, U32 flags)
+{
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ DEBUG_OPTIMISE_r({
+ regnode *Next;
+
+ if (!scan)
+ return;
+ Next = regnext(scan);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
+ Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
+ depth,
+ str,
+ REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
+ Next ? (REG_NODE_NUM(Next)) : 0 );
+ S_debug_show_study_flags(aTHX_ flags," [ ","]");
+ Perl_re_printf( aTHX_ "\n");
+ });
+}
+
+
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
+ S_debug_studydata(aTHX_ where, data, depth, is_inf)
+
+# define DEBUG_PEEP(str, scan, depth, flags) \
+ S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
+
+#else
+# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
+# define DEBUG_PEEP(str, scan, depth, flags) NOOP
+#endif
/* =========================================================
push(UV key,item* curr)
{
item* head;
- Newxz(head, 1, item);
+ Newx(head, 1, item);
head->key = key;
head->value = 0;
head->next = curr;
PERL_ARGS_ASSERT_EDIT_DISTANCE;
/* intialize matrix start values */
- Newxz(scores, ( (x + 2) * (y + 2)), UV);
+ Newx(scores, ( (x + 2) * (y + 2)), UV);
scores[0] = score_ceil;
scores[1 * (y + 2) + 0] = score_ceil;
scores[0 * (y + 2) + 1] = score_ceil;
}
/* Mark that we cannot extend a found fixed substring at this point.
- Update the longest found anchored substring and the longest found
+ Update the longest found anchored substring or the longest found
floating substrings if needed. */
STATIC void
SSize_t *minlenp, int is_inf)
{
const STRLEN l = CHR_SVLEN(data->last_found);
- const STRLEN old_l = CHR_SVLEN(*data->longest);
+ SV * const longest_sv = data->substrs[data->cur_is_floating].str;
+ const STRLEN old_l = CHR_SVLEN(longest_sv);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_SCAN_COMMIT;
if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
- SvSetMagicSV(*data->longest, data->last_found);
- if (*data->longest == data->longest_fixed) {
- data->offset_fixed = l ? data->last_start_min : data->pos_min;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
- else
- data->flags &= ~SF_FIX_BEFORE_EOL;
- data->minlen_fixed=minlenp;
- data->lookbehind_fixed=0;
- }
- else { /* *data->longest == data->longest_float */
- data->offset_float_min = l ? data->last_start_min : data->pos_min;
- data->offset_float_max = (l
+ const U8 i = data->cur_is_floating;
+ SvSetMagicSV(longest_sv, data->last_found);
+ data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
+
+ if (!i) /* fixed */
+ data->substrs[0].max_offset = data->substrs[0].min_offset;
+ else { /* float */
+ data->substrs[1].max_offset = (l
? data->last_start_max
: (data->pos_delta > SSize_t_MAX - data->pos_min
? SSize_t_MAX
: data->pos_min + data->pos_delta));
if (is_inf
- || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
- data->offset_float_max = SSize_t_MAX;
- if (data->flags & SF_BEFORE_EOL)
- data->flags
- |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
- else
- data->flags &= ~SF_FL_BEFORE_EOL;
- data->minlen_float=minlenp;
- data->lookbehind_float=0;
- }
+ || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
+ data->substrs[1].max_offset = SSize_t_MAX;
+ }
+
+ if (data->flags & SF_BEFORE_EOL)
+ data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
+ else
+ data->substrs[i].flags &= ~SF_BEFORE_EOL;
+ data->substrs[i].minlenp = minlenp;
+ data->substrs[i].lookbehind = 0;
}
+
SvCUR_set(data->last_found, 0);
{
SV * const sv = data->last_found;
}
data->last_end = -1;
data->flags &= ~SF_BEFORE_EOL;
- DEBUG_STUDYDATA("commit: ",data,0);
+ DEBUG_STUDYDATA("commit", data, 0, is_inf);
}
/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
regnode_charclass_posixl temp;
int add = 1; /* To calculate the index of the complement */
+ Zero(&temp, 1, regnode_charclass_posixl);
ANYOF_POSIXL_ZERO(&temp);
for (i = 0; i < ANYOF_MAX; i++) {
assert(i % 2 != 0
} STMT_END
#define TRIE_LIST_NEW(state) STMT_START { \
- Newxz( trie->states[ state ].trans.list, \
+ Newx( trie->states[ state ].trans.list, \
4, reg_trie_trans_le ); \
TRIE_LIST_CUR( state ) = 1; \
TRIE_LIST_LEN( state ) = 4; \
/* we just use folder as a flag in utf8 */
const U8 * folder = NULL;
+ /* in the below add_data call we are storing either 'tu' or 'tuaa'
+ * which stands for one trie structure, one hash, optionally followed
+ * by two arrays */
#ifdef DEBUGGING
- const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
+ const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
AV *trie_words = NULL;
/* along with revcharmap, this only used during construction but both are
* useful during debugging so we store them in the struct when debugging.
aho->trie=trie_offset;
aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
Copy( trie->states, aho->states, numstates, reg_trie_state );
- Newxz( q, numstates, U32);
+ Newx( q, numstates, U32);
aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
aho->refcount = 1;
fail = aho->fail;
}
-#define DEBUG_PEEP(str,scan,depth) \
- DEBUG_OPTIMISE_r({if (scan){ \
- regnode *Next = regnext(scan); \
- regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\
- Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \
- depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
- Next ? (REG_NODE_NUM(Next)) : 0 );\
- DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
- Perl_re_printf( aTHX_ "\n"); \
- }});
-
/* The below joins as many adjacent EXACTish nodes as possible into a single
* one. The regop may be changed if the node(s) contain certain sequences that
* require special handling. The joining is only done if:
* input nodes.
*
* And *unfolded_multi_char is set to indicate whether or not the node contains
- * an unfolded multi-char fold. This happens when whether the fold is valid or
- * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
- * SMALL LETTER SHARP S, as only if the target string being matched against
- * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
- * folding rules depend on the locale in force at runtime. (Multi-char folds
- * whose components are all above the Latin1 range are not run-time locale
- * dependent, and have already been folded by the time this function is
- * called.)
+ * an unfolded multi-char fold. This happens when it won't be known until
+ * runtime whether the fold is valid or not; namely
+ * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
+ * target string being matched against turns out to be UTF-8 is that fold
+ * valid; or
+ * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
+ * runtime.
+ * (Multi-char folds whose components are all above the Latin1 range are not
+ * run-time locale dependent, and have already been folded by the time this
+ * function is called.)
*
* This is as good a place as any to discuss the design of handling these
* multi-character fold sequences. It's been wrong in Perl for a very long
PERL_UNUSED_ARG(flags);
PERL_UNUSED_ARG(val);
#endif
- DEBUG_PEEP("join",scan,depth);
+ DEBUG_PEEP("join", scan, depth, 0);
/* Look through the subsequent nodes in the chain. Skip NOTHING, merge
* EXACT ones that are mergeable to the current one. */
if (OP(n) == TAIL || n > next)
stringok = 0;
if (PL_regkind[OP(n)] == NOTHING) {
- DEBUG_PEEP("skip:",n,depth);
+ DEBUG_PEEP("skip:", n, depth, 0);
NEXT_OFF(scan) += NEXT_OFF(n);
next = n + NODE_STEP_REGNODE;
#ifdef DEBUGGING
if (oldl + STR_LEN(n) > U8_MAX)
break;
- DEBUG_PEEP("merg",n,depth);
+ DEBUG_PEEP("merg", n, depth, 0);
merged++;
NEXT_OFF(scan) += NEXT_OFF(n);
#ifdef EXPERIMENTAL_INPLACESCAN
if (flags && !NEXT_OFF(n)) {
- DEBUG_PEEP("atch", val, depth);
+ DEBUG_PEEP("atch", val, depth, 0);
if (reg_off_by_arg[OP(n)]) {
ARG_SET(n, val - n);
}
n++;
}
#endif
- DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
+ DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
return stopnow;
}
} while (f);
}
-
+/* the return from this sub is the minimum length that could possibly match */
STATIC SSize_t
S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
SSize_t *minlenp, SSize_t *deltap,
PERL_ARGS_ASSERT_STUDY_CHUNK;
RExC_study_started= 1;
+ Zero(&data_fake, 1, scan_data_t);
if ( depth == 0 ) {
while (first_non_open && OP(first_non_open) == OPEN)
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
/* Peephole optimizer: */
- DEBUG_STUDYDATA("Peep:", data, depth);
- DEBUG_PEEP("Peep", scan, depth);
+ DEBUG_STUDYDATA("Peep", data, depth, is_inf);
+ DEBUG_PEEP("Peep", scan, depth, flags);
/* The reason we do this here is that we need to deal with things like
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
scan = regnext(scan);
assert( OP(scan) == IFTHEN );
- DEBUG_PEEP("expect IFTHEN", scan, depth);
+ DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
data_fake.last_closep= &fake_last_close;
minlen = *minlenp;
next = regnext(scan);
scan = NEXTOPER(NEXTOPER(scan));
- DEBUG_PEEP("scan", scan, depth);
- DEBUG_PEEP("next", next, depth);
+ DEBUG_PEEP("scan", scan, depth, flags);
+ DEBUG_PEEP("next", next, depth, flags);
/* we suppose the run is continuous, last=next...
* NOTE we dont use the return here! */
+ /* DEFINEP study_chunk() recursion */
(void)study_chunk(pRExC_state, &scan, &minlen,
&deltanext, next, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
I32 f = 0;
regnode_ssc this_class;
- DEBUG_PEEP("Branch", scan, depth);
+ DEBUG_PEEP("Branch", scan, depth, flags);
num++;
StructCopy(&zero_scan_data, &data_fake, scan_data_t);
f |= SCF_WHILEM_VISITED_POS;
/* we suppose the run is continuous, last=next...*/
+ /* recurse study_chunk() for each BRANCH in an alternation */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
recursed_depth, NULL, f,depth+1);
else
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
min += min1;
if (delta == SSize_t_MAX
RExC_study_chunk_recursed_bytes, U8);
}
/* we havent recursed into this paren yet, so recurse into it */
- DEBUG_STUDYDATA("gosub-set:", data,depth);
+ DEBUG_STUDYDATA("gosub-set", data, depth, is_inf);
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
my_recursed_depth= recursed_depth + 1;
} else {
- DEBUG_STUDYDATA("gosub-inf:", data,depth);
+ DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf);
/* some form of infinite recursion, assume infinite length
* */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1;
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
newframe->prev_recursed_depth = recursed_depth;
newframe->this_prev_frame= frame;
- DEBUG_STUDYDATA("frame-new:",data,depth);
- DEBUG_PEEP("fnew", scan, depth);
+ DEBUG_STUDYDATA("frame-new", data, depth, is_inf);
+ DEBUG_PEEP("fnew", scan, depth, flags);
frame = newframe;
scan = start;
else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
SSize_t l = STR_LEN(scan);
UV uc;
+ assert(l);
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
}
data->pos_delta += min_subtract;
if (min_subtract) {
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
/* Cannot extend fixed substrings */
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
scan = regnext(scan);
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
+ /* recurse study_chunk() on loop bodies */
minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
last, data, stopparen, recursed_depth, NULL,
(mincount == 0
}
if (!scan) /* It was not CURLYX, but CURLY. */
scan = next;
- if (!(flags & SCF_TRIE_DOING_RESTUDY)
+ if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
/* ? quantifier ok, except for (?{ ... }) */
&& (next_is_eval || !(mincount == 0 && maxcount == 1))
&& (minnext == 0) && (deltanext == 0)
}
#endif
/* Optimize again: */
+ /* recurse study_chunk() on optimised CURLYX => CURLYM */
study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
NULL, stopparen, recursed_depth, NULL, 0,depth+1);
}
? SSize_t_MAX
: data->pos_min + data->pos_delta - last_chrs;
}
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
SvREFCNT_dec(last_str);
}
if (flags & SCF_DO_SUBSTR) {
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
data->pos_min += 1;
data->pos_delta += 1;
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
}
else if (REGNODE_SIMPLE(OP(scan))) {
(regnode_charclass *) scan);
break;
+ case ANYOFM:
+ {
+ SV* cp_list = get_ANYOFM_contents(scan);
+
+ if (flags & SCF_DO_STCLASS_OR) {
+ ssc_union(data->start_class,
+ cp_list,
+ FALSE /* don't invert */
+ );
+ }
+ else if (flags & SCF_DO_STCLASS_AND) {
+ ssc_intersection(data->start_class,
+ cp_list,
+ FALSE /* don't invert */
+ );
+ }
+
+ SvREFCNT_dec_NN(cp_list);
+ break;
+ }
+
case NPOSIXL:
invert = 1;
/* FALLTHROUGH */
}
break;
+ case NASCII:
+ invert = 1;
+ /* FALLTHROUGH */
+ case ASCII:
+ my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
+
+ /* This can be handled as a Posix class */
+ goto join_posix_and_ascii;
+
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
/* FALLTHROUGH */
case POSIXA:
- if (FLAGS(scan) == _CC_ASCII) {
- my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
- }
- else {
- _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
- PL_XPosix_ptrs[_CC_ASCII],
- &my_invlist);
- }
- goto join_posix;
+ assert(FLAGS(scan) != _CC_ASCII);
+ _invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
+ PL_XPosix_ptrs[_CC_ASCII],
+ &my_invlist);
+ goto join_posix_and_ascii;
case NPOSIXD:
case NPOSIXU:
&my_invlist);
}
- join_posix:
+ join_posix_and_ascii:
if (flags & SCF_DO_STCLASS_AND) {
ssc_intersection(data->start_class, my_invlist, invert);
f |= SCF_WHILEM_VISITED_POS;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
+
+ /* recurse study_chunk() for lookahead body */
minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
last, &data_fake, stopparen,
recursed_depth, NULL, f, depth+1);
else
data_fake.last_closep = &fake;
data_fake.flags = 0;
+ data_fake.substrs[0].flags = 0;
+ data_fake.substrs[1].flags = 0;
data_fake.pos_delta = delta;
if (is_inf)
data_fake.flags |= SF_IS_INF;
next = regnext(scan);
nscan = NEXTOPER(NEXTOPER(scan));
+ /* positive lookahead study_chunk() recursion */
*minnextp = study_chunk(pRExC_state, &nscan, minnextp,
&deltanext, last, &data_fake,
stopparen, recursed_depth, NULL,
data->flags |= SF_HAS_EVAL;
data->whilem_c = data_fake.whilem_c;
if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
+ int i;
if (RExC_rx->minlen<*minnextp)
RExC_rx->minlen=*minnextp;
scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
SvREFCNT_dec_NN(data_fake.last_found);
- if ( data_fake.minlen_fixed != minlenp )
- {
- data->offset_fixed= data_fake.offset_fixed;
- data->minlen_fixed= data_fake.minlen_fixed;
- data->lookbehind_fixed+= scan->flags;
- }
- if ( data_fake.minlen_float != minlenp )
- {
- data->minlen_float= data_fake.minlen_float;
- data->offset_float_min=data_fake.offset_float_min;
- data->offset_float_max=data_fake.offset_float_max;
- data->lookbehind_float+= scan->flags;
+ for (i = 0; i < 2; i++) {
+ if (data_fake.substrs[i].minlenp != minlenp) {
+ data->substrs[i].min_offset =
+ data_fake.substrs[i].min_offset;
+ data->substrs[i].max_offset =
+ data_fake.substrs[i].max_offset;
+ data->substrs[i].minlenp =
+ data_fake.substrs[i].minlenp;
+ data->substrs[i].lookbehind += scan->flags;
+ }
}
}
}
}
#endif
}
+
else if (OP(scan) == OPEN) {
if (stopparen != (I32)ARG(scan))
pars++;
{
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
is_inf = is_inf_internal = 1;
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
/* We go from the jump point to the branch that follows
it. Note this means we need the vestigal unused
branches even though they arent otherwise used. */
+ /* optimise study_chunk() for TRIE */
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, (regnode *)nextbranch, &data_fake,
stopparen, recursed_depth, NULL, f,depth+1);
data->pos_min += min1;
data->pos_delta += max1 - min1;
if (max1 != min1 || is_inf)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
min += min1;
- if (delta != SSize_t_MAX)
- delta += max1 - min1;
+ if (delta != SSize_t_MAX) {
+ if (SSize_t_MAX - (max1 - min1) >= delta)
+ delta += max1 - min1;
+ else
+ delta = SSize_t_MAX;
+ }
if (flags & SCF_DO_STCLASS_OR) {
ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
if (min1) {
data->pos_min += trie->minlen;
data->pos_delta += (trie->maxlen - trie->minlen);
if (trie->maxlen != trie->minlen)
- data->longest = &(data->longest_float);
+ data->cur_is_floating = 1; /* float */
}
if (trie->jump) /* no more substrings -- for now /grr*/
flags &= ~SCF_DO_SUBSTR;
/* we need to unwind recursion. */
depth = depth - 1;
- DEBUG_STUDYDATA("frame-end:",data,depth);
- DEBUG_PEEP("fend", scan, depth);
+ DEBUG_STUDYDATA("frame-end", data, depth, is_inf);
+ DEBUG_PEEP("fend", scan, depth, flags);
/* restore previous context */
last = frame->last_regnode;
}
assert(!frame);
- DEBUG_STUDYDATA("pre-fin:",data,depth);
+ DEBUG_STUDYDATA("pre-fin", data, depth, is_inf);
*scanp = scan;
*deltap = is_inf_internal ? SSize_t_MAX : delta;
if (flags & SCF_TRIE_RESTUDY)
data->flags |= SCF_TRIE_RESTUDY;
- DEBUG_STUDYDATA("post-fin:",data,depth);
+ DEBUG_STUDYDATA("post-fin", data, depth, is_inf);
{
SSize_t final_minlen= min < stopmin ? min : stopmin;
STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest,
- SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
- SSize_t lookbehind, SSize_t offset, SSize_t *minlen,
- STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state,
+ struct reg_substr_datum *rsd,
+ struct scan_data_substrs *sub,
+ STRLEN longest_length)
{
/* This is the common code for setting up the floating and fixed length
* string data extracted from Perl_re_op_compile() below. Returns a boolean
I32 t;
SSize_t ml;
+ bool eol = cBOOL(sub->flags & SF_BEFORE_EOL);
+ bool meol = cBOOL(sub->flags & SF_BEFORE_MEOL);
if (! (longest_length
|| (eol /* Can't have SEOL and MULTI */
/* copy the information about the longest from the reg_scan_data
over to the program. */
- if (SvUTF8(sv_longest)) {
- *rx_utf8 = sv_longest;
- *rx_substr = NULL;
+ if (SvUTF8(sub->str)) {
+ rsd->substr = NULL;
+ rsd->utf8_substr = sub->str;
} else {
- *rx_substr = sv_longest;
- *rx_utf8 = NULL;
+ rsd->substr = sub->str;
+ rsd->utf8_substr = NULL;
}
/* end_shift is how many chars that must be matched that
follow this item. We calculate it ahead of time as once the
lookbehind offset is added in we lose the ability to correctly
calculate it.*/
- ml = minlen ? *(minlen) : (SSize_t)longest_length;
- *rx_end_shift = ml - offset
+ ml = sub->minlenp ? *(sub->minlenp) : (SSize_t)longest_length;
+ rsd->end_shift = ml - sub->min_offset
- longest_length
/* XXX SvTAIL is always false here - did you mean FBMcf_TAIL
* intead? - DAPM
- + (SvTAIL(sv_longest) != 0)
+ + (SvTAIL(sub->str) != 0)
*/
- + lookbehind;
+ + sub->lookbehind;
t = (eol/* Can't have SEOL and MULTI */
&& (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+ fbm_compile(sub->str, t ? FBMcf_TAIL : 0);
return TRUE;
}
if ( ! dump_len_string
|| ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL))
{
- PL_dump_re_max_len = 0;
+ PL_dump_re_max_len = 60; /* A reasonable default */
}
#endif
}
RExC_seen_unfolded_sharp_s = 0;
RExC_contains_locale = 0;
RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
+ RExC_in_script_run = 0;
RExC_study_started = 0;
pRExC_state->runtime_code_qr = NULL;
RExC_frame_head= NULL;
});
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
+ RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n",
PL_colors[4],PL_colors[5],s);
});
/* make sure PL_bitcount bounds not exceeded */
assert(sizeof(STD_PAT_MODS) <= 8);
- Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
- r->xpv_len_u.xpvlenu_pv = p;
+ p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
+ SvPOK_on(rx);
if (RExC_utf8)
SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
3-units-long substrs field. */
Newx(r->substrs, 1, struct reg_substr_data);
if (RExC_recurse_count) {
- Newxz(RExC_recurse,RExC_recurse_count,regnode *);
+ Newx(RExC_recurse,RExC_recurse_count,regnode *);
SAVEFREEPV(RExC_recurse);
}
if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN)) { /* Only one top-level choice.
*/
SSize_t fake;
- STRLEN longest_float_length, longest_fixed_length;
+ STRLEN longest_length[2];
regnode_ssc ch_class; /* pointed to by data */
int stclass_flag;
SSize_t last_close = 0; /* pointed to by data */
regnode *first= scan;
regnode *first_next= regnext(first);
+ int i;
+
/*
* Skip introductions and multiplicators >= 1
* so that we can extract the 'meat' of the pattern that must
/* Starting-point info. */
again:
- DEBUG_PEEP("first:",first,0);
+ DEBUG_PEEP("first:", first, 0, 0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT || OP(first) == EXACTL)
* earlier string may buy us something the later one won't.]
*/
- data.longest_fixed = newSVpvs("");
- data.longest_float = newSVpvs("");
+ data.substrs[0].str = newSVpvs("");
+ data.substrs[1].str = newSVpvs("");
data.last_found = newSVpvs("");
- data.longest = &(data.longest_fixed);
+ data.cur_is_floating = 0; /* initially any found substring is fixed */
ENTER_with_name("study_chunk");
- SAVEFREESV(data.longest_fixed);
- SAVEFREESV(data.longest_float);
+ SAVEFREESV(data.substrs[0].str);
+ SAVEFREESV(data.substrs[1].str);
SAVEFREESV(data.last_found);
first = scan;
if (!ri->regstclass) {
data.last_closep = &last_close;
DEBUG_RExC_seen();
+ /*
+ * MAIN ENTRY FOR study_chunk() FOR m/PATTERN/
+ * (NO top level branches)
+ */
minlen = study_chunk(pRExC_state, &first, &minlen, &fake,
scan + RExC_size, /* Up to end */
&data, -1, 0, NULL,
CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk"));
- if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
+ if ( RExC_npar == 1 && !data.cur_is_floating
&& data.last_start_min == 0 && data.last_end > 0
&& !RExC_seen_zerolen
&& !(RExC_seen & REG_VERBARG_SEEN)
}
scan_commit(pRExC_state, &data,&minlen,0);
- longest_float_length = CHR_SVLEN(data.longest_float);
-
- if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- && S_setup_longest (aTHX_ pRExC_state,
- data.longest_float,
- &(r->float_utf8),
- &(r->float_substr),
- &(r->float_end_shift),
- data.lookbehind_float,
- data.offset_float_min,
- data.minlen_float,
- longest_float_length,
- cBOOL(data.flags & SF_FL_BEFORE_EOL),
- cBOOL(data.flags & SF_FL_BEFORE_MEOL)))
- {
- r->float_min_offset = data.offset_float_min - data.lookbehind_float;
- r->float_max_offset = data.offset_float_max;
- if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
- r->float_max_offset -= data.lookbehind_float;
- SvREFCNT_inc_simple_void_NN(data.longest_float);
- }
- else {
- r->float_substr = r->float_utf8 = NULL;
- longest_float_length = 0;
- }
- longest_fixed_length = CHR_SVLEN(data.longest_fixed);
-
- if (S_setup_longest (aTHX_ pRExC_state,
- data.longest_fixed,
- &(r->anchored_utf8),
- &(r->anchored_substr),
- &(r->anchored_end_shift),
- data.lookbehind_fixed,
- data.offset_fixed,
- data.minlen_fixed,
- longest_fixed_length,
- cBOOL(data.flags & SF_FIX_BEFORE_EOL),
- cBOOL(data.flags & SF_FIX_BEFORE_MEOL)))
- {
- r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
- SvREFCNT_inc_simple_void_NN(data.longest_fixed);
- }
- else {
- r->anchored_substr = r->anchored_utf8 = NULL;
- longest_fixed_length = 0;
- }
+ /* XXX this is done in reverse order because that's the way the
+ * code was before it was parameterised. Don't know whether it
+ * actually needs doing in reverse order. DAPM */
+ for (i = 1; i >= 0; i--) {
+ longest_length[i] = CHR_SVLEN(data.substrs[i].str);
+
+ if ( !( i
+ && SvCUR(data.substrs[0].str) /* ok to leave SvCUR */
+ && data.substrs[0].min_offset
+ == data.substrs[1].min_offset
+ && SvCUR(data.substrs[0].str)
+ == SvCUR(data.substrs[1].str)
+ )
+ && S_setup_longest (aTHX_ pRExC_state,
+ &(r->substrs->data[i]),
+ &(data.substrs[i]),
+ longest_length[i]))
+ {
+ r->substrs->data[i].min_offset =
+ data.substrs[i].min_offset - data.substrs[i].lookbehind;
+
+ r->substrs->data[i].max_offset = data.substrs[i].max_offset;
+ /* Don't offset infinity */
+ if (data.substrs[i].max_offset < SSize_t_MAX)
+ r->substrs->data[i].max_offset -= data.substrs[i].lookbehind;
+ SvREFCNT_inc_simple_void_NN(data.substrs[i].str);
+ }
+ else {
+ r->substrs->data[i].substr = NULL;
+ r->substrs->data[i].utf8_substr = NULL;
+ longest_length[i] = 0;
+ }
+ }
+
LEAVE_with_name("study_chunk");
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
- if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
+ if ((!(r->substrs->data[0].substr || r->substrs->data[0].utf8_substr)
+ || r->substrs->data[0].min_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
data.start_class = NULL;
}
- /* A temporary algorithm prefers floated substr to fixed one to dig
- * more info. */
- if (longest_fixed_length > longest_float_length) {
- r->substrs->check_ix = 0;
- r->check_end_shift = r->anchored_end_shift;
- r->check_substr = r->anchored_substr;
- r->check_utf8 = r->anchored_utf8;
- r->check_offset_min = r->check_offset_max = r->anchored_offset;
- if (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS))
- r->intflags |= PREGf_NOSCAN;
- }
- else {
- r->substrs->check_ix = 1;
- r->check_end_shift = r->float_end_shift;
- r->check_substr = r->float_substr;
- r->check_utf8 = r->float_utf8;
- r->check_offset_min = r->float_min_offset;
- r->check_offset_max = r->float_max_offset;
- }
+ /* A temporary algorithm prefers floated substr to fixed one of
+ * same length to dig more info. */
+ i = (longest_length[0] <= longest_length[1]);
+ r->substrs->check_ix = i;
+ r->check_end_shift = r->substrs->data[i].end_shift;
+ r->check_substr = r->substrs->data[i].substr;
+ r->check_utf8 = r->substrs->data[i].utf8_substr;
+ r->check_offset_min = r->substrs->data[i].min_offset;
+ r->check_offset_max = r->substrs->data[i].max_offset;
+ if (!i && (r->intflags & (PREGf_ANCH_SBOL|PREGf_ANCH_GPOS)))
+ r->intflags |= PREGf_NOSCAN;
+
if ((r->check_substr || r->check_utf8) ) {
r->extflags |= RXf_USE_INTUIT;
if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
r->extflags |= RXf_INTUIT_TAIL;
}
- r->substrs->data[0].max_offset = r->substrs->data[0].min_offset;
/* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
- if ( (STRLEN)minlen < longest_float_length )
- minlen= longest_float_length;
- if ( (STRLEN)minlen < longest_fixed_length )
- minlen= longest_fixed_length;
+ if ( (STRLEN)minlen < longest_length[1] )
+ minlen= longest_length[1];
+ if ( (STRLEN)minlen < longest_length[0] )
+ minlen= longest_length[0];
*/
}
else {
data.last_closep = &last_close;
DEBUG_RExC_seen();
+ /*
+ * MAIN ENTRY FOR study_chunk() FOR m/P1|P2|.../
+ * (patterns WITH top level branches)
+ */
minlen = study_chunk(pRExC_state,
&scan, &minlen, &fake, scan + RExC_size, &data, -1, 0, NULL,
SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS|(restudied
CHECK_RESTUDY_GOTO_butfirst(NOOP);
- r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
- = r->float_substr = r->float_utf8 = NULL;
+ r->check_substr = NULL;
+ r->check_utf8 = NULL;
+ r->substrs->data[0].substr = NULL;
+ r->substrs->data[0].utf8_substr = NULL;
+ r->substrs->data[1].substr = NULL;
+ r->substrs->data[1].utf8_substr = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
&& is_ssc_worth_it(pRExC_state, data.start_class))
if (RExC_seen & REG_RECURSE_SEEN ) {
r->intflags |= PREGf_RECURSE_SEEN;
- Newxz(r->recurse_locinput, r->nparens + 1, char *);
+ Newx(r->recurse_locinput, r->nparens + 1, char *);
}
if (RExC_seen & REG_GPOS_SEEN)
r->intflags |= PREGf_GPOS_SEEN;
* here (if paren ==2). The forms '(*VERB' and '(?...' disallow such
* intervening space, as the sequence is a token, and a token should be
* indivisible */
- bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '(';
+ bool has_intervening_patws = (paren == 2 || paren == 's')
+ && *(RExC_parse - 1) != '(';
if (RExC_parse >= RExC_end) {
vFAIL("Unmatched (");
}
- if ( *RExC_parse == '*') { /* (*VERB:ARG) */
+ if (paren == 's') {
+
+ /* A nested script run is a no-op besides clustering */
+ if (RExC_in_script_run) {
+ paren = ':';
+ nextchar(pRExC_state);
+ ret = NULL;
+ goto parse_rest;
+ }
+ RExC_in_script_run = 1;
+
+ ret = reg_node(pRExC_state, SROPEN);
+ is_open = 1;
+ }
+ else if ( *RExC_parse == '*') { /* (*VERB:ARG) */
char *start_verb = RExC_parse + 1;
STRLEN verb_len;
char *start_arg = NULL;
if (RExC_parse >= RExC_end) {
goto unterminated_verb_pattern;
}
+
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
while ( RExC_parse < RExC_end && *RExC_parse != ')' )
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
nextchar(pRExC_state);
return ret;
}
+ else if (*RExC_parse == '+') { /* (+...) */
+ RExC_parse++;
+
+ if (has_intervening_patws) {
+ /* XXX Note that a potential gotcha is that outside of /x '( +
+ * ...)' means to match a space at least once ... This is a
+ * problem elsewhere too */
+ vFAIL("In '(+...)', the '(' and '+' must be adjacent");
+ }
+
+ if (! memBEGINPs(RExC_parse, (STRLEN) (RExC_end - RExC_parse),
+ "script_run:"))
+ {
+ RExC_parse += strcspn(RExC_parse, ":)");
+ vFAIL("Unknown (+ pattern");
+ }
+ else {
+
+ /* This indicates Unicode rules. */
+ REQUIRE_UNI_RULES(flagp, NULL);
+
+ RExC_parse += sizeof("script_run:") - 1;
+
+ if (PASS2) {
+ Perl_ck_warner_d(aTHX_
+ packWARN(WARN_EXPERIMENTAL__SCRIPT_RUN),
+ "The script_run feature is experimental"
+ REPORT_LOCATION, REPORT_LOCATION_ARGS(RExC_parse));
+ }
+
+ ret = reg(pRExC_state, 's', &flags, depth+1);
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return NULL;
+ }
+
+ return ret;
+ }
+ }
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
ret = reganode(pRExC_state,NGROUPP,num);
goto insert_if_check_paren;
}
- else if (RExC_end - RExC_parse >= DEFINE_len
- && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
+ else if (memBEGINs(RExC_parse,
+ (STRLEN) (RExC_end - RExC_parse),
+ "DEFINE"))
{
ret = reganode(pRExC_state,DEFINEP,0);
RExC_parse += DEFINE_len;
vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
- return handle_regex_sets(pRExC_state, NULL, flagp, depth,
+ return handle_regex_sets(pRExC_state, NULL, flagp, depth+1,
oregcomp_parse);
case 0: /* A NUL */
RExC_parse--; /* for vFAIL to print correctly */
goto parse_rest;
} /* end switch */
}
- else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
+ else {
+ if (*RExC_parse == '{' && PASS2) {
+ ckWARNregdep(RExC_parse + 1,
+ "Unescaped left brace in regex is "
+ "deprecated here (and will be fatal "
+ "in Perl 5.32), passed through");
+ }
+ /* Not bothering to indent here, as the above 'else' is temporary
+ * */
+ if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
capturing_parens:
parno = RExC_npar;
RExC_npar++;
paren = ':';
ret = NULL;
}
+ }
}
else /* ! paren */
ret = NULL;
Set_Node_Offset(ender,RExC_parse+1); /* MJD */
Set_Node_Length(ender,1); /* MJD */
break;
+ case 's':
+ ender = reg_node(pRExC_state, SRCLOSE);
+ RExC_in_script_run = 0;
+ break;
case '<':
case ',':
case '=':
if ((flags&SIMPLE)) {
if (min == 0 && max == REG_INFTY) {
reginsert(pRExC_state, STAR, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
}
if (min == 1 && max == REG_INFTY) {
reginsert(pRExC_state, PLUS, ret, depth+1);
- ret->flags = 0;
MARK_NAUGHTY(3);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
goto nest_check;
ender = reg_node(pRExC_state, SUCCEED);
REGTAIL(pRExC_state, ret, ender);
reginsert(pRExC_state, SUSPEND, ret, depth+1);
- ret->flags = 0;
ender = reg_node(pRExC_state, TAIL);
REGTAIL(pRExC_state, ret, ender);
}
RExC_parse++; /* Skip past the '{' */
- if (! (endbrace = strchr(RExC_parse, '}'))) { /* no trailing brace */
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
+ if (! endbrace) { /* no trailing brace */
vFAIL2("Missing right brace on \\%c{}", 'N');
}
- else if(!(endbrace == RExC_parse /* nothing between the {} */
- || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
- && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
- error msg) */
+ else if (!( endbrace == RExC_parse /* nothing between the {} */
+ || memBEGINs(RExC_parse, /* U+ (bad hex is checked below
+ for a better error msg) */
+ (STRLEN) (RExC_end - RExC_parse),
+ "U+")))
{
RExC_parse = endbrace; /* position msg's '<--HERE' */
vFAIL("\\N{NAME} must be resolved by the lexer");
/* Convert code point from hex */
length_of_hex = (STRLEN)(endchar - RExC_parse);
grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
+ | PERL_SCAN_DISALLOW_PREFIX
- /* No errors in the first pass (See [perl
- * #122671].) We let the code below find the
- * errors when there are multiple chars. */
- | ((SIZE_ONLY)
- ? PERL_SCAN_SILENT_ILLDIGIT
- : 0);
+ /* No errors in the first pass (See [perl
+ * #122671].) We let the code below find the
+ * errors when there are multiple chars. */
+ | ((SIZE_ONLY)
+ ? PERL_SCAN_SILENT_ILLDIGIT
+ : 0);
/* This routine is the one place where both single- and double-quotish
* \N{U+xxxx} are evaluated. The value is a Unicode code point which
* must be converted to native. */
*code_point_p = UNI_TO_NATIVE(grok_hex(RExC_parse,
- &length_of_hex,
- &grok_hex_flags,
- NULL));
+ &length_of_hex,
+ &grok_hex_flags,
+ NULL));
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check. Don't do the check
}
/* Fail if caller doesn't want to handle a multi-code-point sequence.
- * But don't backup up the pointer if the caller want to know how many
+ * But don't backup up the pointer if the caller wants to know how many
* code points there are (they can then handle things) */
if (! node_p) {
if (! cp_count) {
}
sv_catpv(substitute_parse, ")");
- RExC_parse = RExC_start = RExC_adjusted_start = SvPV(substitute_parse,
- len);
+ len = SvCUR(substitute_parse);
/* Don't allow empty number */
if (len < (STRLEN) 8) {
RExC_parse = endbrace;
vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
+
+ RExC_parse = RExC_start = RExC_adjusted_start
+ = SvPV_nolen(substitute_parse);
RExC_end = RExC_parse + len;
/* The values are Unicode, and therefore not subject to recoding, but
RExC_recode_x_to_native = 1;
#endif
- if (node_p) {
- if (!(*node_p = reg(pRExC_state, 1, &flags, depth+1))) {
- if (flags & (RESTART_PASS1|NEED_UTF8)) {
- *flagp = flags & (RESTART_PASS1|NEED_UTF8);
- return FALSE;
- }
- FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
- (UV) flags);
- }
- *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
- }
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
/* Restore the saved values */
RExC_start = RExC_adjusted_start = save_start;
#ifdef EBCDIC
RExC_recode_x_to_native = 0;
#endif
-
SvREFCNT_dec_NN(substitute_parse);
+
+ if (! *node_p) {
+ if (flags & (RESTART_PASS1|NEED_UTF8)) {
+ *flagp = flags & (RESTART_PASS1|NEED_UTF8);
+ return FALSE;
+ }
+ FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf,
+ (UV) flags);
+ }
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
+
nextchar(pRExC_state);
return TRUE;
}
}
+STATIC bool
+S_new_regcurly(const char *s, const char *e)
+{
+ /* This is a temporary function designed to match the most lenient form of
+ * a {m,n} quantifier we ever envision, with either number omitted, and
+ * spaces anywhere between/before/after them.
+ *
+ * If this function fails, then the string it matches is very unlikely to
+ * ever be considered a valid quantifier, so we can allow the '{' that
+ * begins it to be considered as a literal */
+
+ bool has_min = FALSE;
+ bool has_max = FALSE;
+
+ PERL_ARGS_ASSERT_NEW_REGCURLY;
+
+ if (s >= e || *s++ != '{')
+ return FALSE;
+
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ while (s < e && isDIGIT(*s)) {
+ has_min = TRUE;
+ s++;
+ }
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+
+ if (*s == ',') {
+ s++;
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ while (s < e && isDIGIT(*s)) {
+ has_max = TRUE;
+ s++;
+ }
+ while (s < e && isSPACE(*s)) {
+ s++;
+ }
+ }
+
+ return s < e && *s == '}' && (has_min || has_max);
+}
/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
else {
STRLEN length;
char name = *RExC_parse;
- char * endbrace;
+ char * endbrace = NULL;
RExC_parse += 2;
- endbrace = strchr(RExC_parse, '}');
+ if (RExC_parse < RExC_end) {
+ endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
+ }
if (! endbrace) {
vFAIL2("Missing right brace on \\%c{}", name);
}*/
switch (*RExC_parse) {
case 'g':
- if (length != 1
- && (length != 3 || strnNE(RExC_parse + 1, "cb", 2)))
+ if ( length != 1
+ && (memNEs(RExC_parse + 1, length - 1, "cb")))
{
goto bad_bound_type;
}
/* FALLTHROUGH */
finish_meta_pat:
+ if ( UCHARAT(RExC_parse + 1) == '{'
+ && UNLIKELY(! new_regcurly(RExC_parse + 1, RExC_end)))
+ {
+ RExC_parse += 2;
+ vFAIL("Unescaped left brace in regex is illegal here");
+ }
nextchar(pRExC_state);
Set_Node_Length(ret, 2); /* MJD */
break;
UV ender = 0;
char *p;
char *s;
-#define MAX_NODE_STRING_SIZE 127
- char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+
+/* This allows us to fill a node with just enough spare so that if the final
+ * character folds, its expansion is guaranteed to fit */
+#define MAX_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE+1];
+
char *s0;
U8 upper_parse = MAX_NODE_STRING_SIZE;
U8 node_type = compute_EXACTish(pRExC_state);
/* If a folding node contains only code points that don't
* participate in folds, it can be changed into an EXACT node,
- * which allows the optimizer more things to look for */
+ * which allows the optimizer more things to look for, and is
+ * faster to match */
bool maybe_exact;
+ /* The node_type may change below, but since the size of the node
+ * doesn't change, it works */
ret = reg_node(pRExC_state, node_type);
/* In pass1, folded, we use a temporary buffer instead of the
* use a pseudo regnode like 'EXACT_ORIG_FOLD' */
maybe_exact = FOLD && PASS2;
- /* XXX The node can hold up to 255 bytes, yet this only goes to
- * 127. I (khw) do not know why. Keeping it somewhat less than
- * 255 allows us to not have to worry about overflow due to
- * converting to utf8 and fold expansion, but that value is
- * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
- * split up by this limit into a single one using the real max of
- * 255. Even at 127, this breaks under rare circumstances. If
- * folding, we do not want to split a node at a character that is a
- * non-final in a multi-char fold, as an input string could just
- * happen to want to match across the node boundary. The join
- * would solve that problem if the join actually happens. But a
- * series of more than two nodes in a row each of 127 would cause
- * the first join to succeed to get to 254, but then there wouldn't
- * be room for the next one, which could at be one of those split
- * multi-char folds. I don't know of any fool-proof solution. One
- * could back off to end with only a code point that isn't such a
- * non-final, but it is possible for there not to be any in the
- * entire node. */
-
- assert( ! UTF /* Is at the beginning of a character */
+ /* This breaks under rare circumstances. If folding, we do not
+ * want to split a node at a character that is a non-final in a
+ * multi-char fold, as an input string could just happen to want to
+ * match across the node boundary. The code at the end of the loop
+ * looks for this, and backs off until it finds not such a
+ * character, but it is possible (though extremely, extremely
+ * unlikely) for all characters in the node to be non-final fold
+ * ones, in which case we just leave the node fully filled, and
+ * hope that it doesn't match the string in just the wrong place */
+
+ assert( ! UTF /* Is at the beginning of a character */
|| UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
|| UTF8_IS_START(UCHARAT(RExC_parse)));
/* Here, we have a literal character. Find the maximal string of
* them in the input that we can fit into a single EXACTish node.
* We quit at the first non-literal or when the node gets full */
- for (p = RExC_parse;
- len < upper_parse && p < RExC_end;
- len++)
- {
+ for (p = RExC_parse; len < upper_parse && p < RExC_end; ) {
+
+ /* In most cases each iteration adds one byte to the output.
+ * The exceptions override this */
+ Size_t added_len = 1;
+
oldp = p;
/* White space has already been ignored */
goto loopdone;
}
p = RExC_parse;
+ RExC_parse = parse_start;
if (ender > 0xff) {
REQUIRE_UTF8(flagp);
}
const char* error_msg;
bool valid = grok_bslash_o(&p,
+ RExC_end,
&result,
&error_msg,
PASS2, /* out warnings */
const char* error_msg;
bool valid = grok_bslash_x(&p,
+ RExC_end,
&result,
&error_msg,
PASS2, /* out warnings */
} /* End of switch on '\' */
break;
case '{':
- /* Currently we don't care if the lbrace is at the start
- * of a construct. This catches it in the middle of a
- * literal string, or when it's the first thing after
- * something like "\b" */
- if (len || (p > RExC_start && isALPHA_A(*(p -1)))) {
-
- /* GNU Autoconf is depended on by a lot of code, and
- * can't seem to release a new version that avoids the
- * deprecation now made fatal. (A commit to do so has
- * been in its repository since early 2013; only one
- * pattern is affected.) As a work-around, don't
- * fatalize this if the pattern being compiled is the
- * precise one that trips up Autoconf. See [perl
- * #130497] for more details. */
- if (memNEs(RExC_start, RExC_end - RExC_start,
- "\\${[^\\}]*}"))
+ /* Currently we allow an lbrace at the start of a construct
+ * without raising a warning. This is because we think we
+ * will never want such a brace to be meant to be other
+ * than taken literally. */
+ if (len || (p > RExC_start && isALPHA_A(*(p - 1)))) {
+
+ /* But, we raise a fatal warning otherwise, as the
+ * deprecation cycle has come and gone. Except that it
+ * turns out that some heavily-relied on upstream
+ * software, notably GNU Autoconf, have failed to fix
+ * their uses. For these, don't make it fatal unless
+ * we anticipate using the '{' for something else.
+ * This happens after any alpha, and for a looser {m,n}
+ * quantifier specification */
+ if ( RExC_strict
+ || ( p > parse_start + 1
+ && isALPHA_A(*(p - 1))
+ && *(p - 2) == '\\')
+ || new_regcurly(p, RExC_end))
{
RExC_parse = p + 1;
vFAIL("Unescaped left brace in regex is "
break;
} /* End of switch on the literal */
- /* Here, have looked at the literal character and <ender>
- * contains its ordinal, <p> points to the character after it.
+ /* Here, have looked at the literal character, and <ender>
+ * contains its ordinal; <p> points to the character after it.
* We need to check if the next non-ignored thing is a
* quantifier. Move <p> to after anything that should be
* ignored, which, as a side effect, positions <p> for the next
* this character again next time through, when it will be the
* only thing in its new node */
- if ((next_is_quantifier = ( LIKELY(p < RExC_end)
- && UNLIKELY(ISMULT2(p))))
- && LIKELY(len))
- {
+ next_is_quantifier = LIKELY(p < RExC_end)
+ && UNLIKELY(ISMULT2(p));
+
+ if (next_is_quantifier && LIKELY(len)) {
p = oldp;
goto loopdone;
}
* character we are appending, hence we can delay getting
* its representation until PASS2. */
if (SIZE_ONLY) {
- if (UTF) {
+ if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
const STRLEN unilen = UVCHR_SKIP(ender);
s += unilen;
-
- /* We have to subtract 1 just below (and again in
- * the corresponding PASS2 code) because the loop
- * increments <len> each time, as all but this path
- * (and one other) through it add a single byte to
- * the EXACTish node. But these paths would change
- * len to be the correct final value, so cancel out
- * the increment that follows */
- len += unilen - 1;
+ added_len = unilen;
}
else {
s++;
}
} else { /* PASS2 */
not_fold_common:
- if (UTF) {
+ if (UTF && ! UVCHR_IS_INVARIANT(ender)) {
U8 * new_s = uvchr_to_utf8((U8*)s, ender);
- len += (char *) new_s - s - 1;
+ added_len = (char *) new_s - s;
s = (char *) new_s;
}
else {
/* Here are folding under /l, and the code point is
* problematic. First, we know we can't simplify things */
maybe_exact = FALSE;
+
+ /* This code point means we can't simplify things */
maybe_exactfu = FALSE;
/* A problematic code point in this context means that its
#endif
)) {
/* Here, are folding and are not UTF-8 encoded; therefore
- * the character must be in the range 0-255, and is not /l
+ * the character must be in the range 0-255, and is not /l.
* (Not /l because we already handled these under /l in
* is_PROBLEMATIC_LOCALE_FOLD_cp) */
if (IS_IN_SOME_FOLD_L1(ender)) {
}
}
- /* Even when folding, we store just the input character, as
- * we have an array that finds its fold quickly */
- *(s++) = (char) ender;
+ /* Even when folding, we store just the input
+ * character, as we have an array that finds its fold
+ * quickly */
+ *(s++) = (char) ender;
}
else { /* FOLD, and UTF (or sharp s) */
/* Unlike the non-fold case, we do actually have to
- * calculate the results here in pass 1. This is for two
- * reasons, the folded length may be longer than the
- * unfolded, and we have to calculate how many EXACTish
- * nodes it will take; and we may run out of room in a node
- * in the middle of a potential multi-char fold, and have
- * to back off accordingly. */
+ * calculate the fold in pass 1. This is for two reasons,
+ * the folded length may be longer than the unfolded, and
+ * we have to calculate how many EXACTish nodes it will
+ * take; and we may run out of room in a node in the middle
+ * of a potential multi-char fold, and have to back off
+ * accordingly. */
UV folded;
if (isASCII_uni(ender)) {
folded = toFOLD(ender);
*(s)++ = (U8) folded;
}
- else {
+ else { /* Not ASCII */
STRLEN foldlen;
folded = _to_uni_fold_flags(
? FOLD_FLAGS_NOMIX_ASCII
: 0));
s += foldlen;
-
- /* The loop increments <len> each time, as all but this
- * path (and one other) through it add a single byte to
- * the EXACTish node. But this one has changed len to
- * be the correct final value, so subtract one to
- * cancel out the increment that follows */
- len += foldlen - 1;
+ added_len = foldlen;
}
/* If this node only contains non-folding code points so
* far, see if this new one is also non-folding */
ender = folded;
}
+ len += added_len;
+
if (next_is_quantifier) {
/* Here, the next input is a quantifier, and to get here,
- * the current character is the only one in the node.
- * Also, here <len> doesn't include the final byte for this
- * character */
- len++;
+ * the current character is the only one in the node. */
goto loopdone;
}
s = (char *) utf8_hop((U8 *) s, -1);
while (s >= s0) { /* Search backwards until find
- non-problematic char */
+ a non-problematic char */
if (UTF8_IS_INVARIANT(*s)) {
/* There are no ascii characters that participate
: EXACTFU;
}
}
+
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
FALSE /* Don't look to see if could
be turned into an EXACT
REPORT_LOCATION_ARGS(p))); \
} \
} STMT_END
+#define CLEAR_POSIX_WARNINGS() \
+ STMT_START { \
+ if (posix_warnings && RExC_warn_text) \
+ av_clear(RExC_warn_text); \
+ } STMT_END
+
+#define CLEAR_POSIX_WARNINGS_AND_RETURN(ret) \
+ STMT_START { \
+ CLEAR_POSIX_WARNINGS(); \
+ return ret; \
+ } STMT_END
STATIC int
S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state,
*
* The syntax for a legal posix class is:
*
- * qr/(?xa: \[ : \^? [:lower:]{4,6} : \] )/
+ * qr/(?xa: \[ : \^? [[:lower:]]{4,6} : \] )/
*
* What this routine considers syntactically to be an intended posix class
* is this (the comments indicate some restrictions that the pattern
* # for it to be considered to be
* # an intended posix class.
* \h*
- * [:punct:]? # The closing class character,
+ * [[:punct:]]? # The closing class character,
* # possibly omitted. If not a colon
* # nor semi colon, the class name
* # must be even closer to a valid
* decide that no posix class was intended. Should be at least
* sizeof("alphanumeric") */
UV input_text[15];
+ STATIC_ASSERT_DECL(C_ARRAY_LENGTH(input_text) >= sizeof "alphanumeric");
PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX;
- if (posix_warnings && RExC_warn_text)
- av_clear(RExC_warn_text);
+ CLEAR_POSIX_WARNINGS();
if (p >= e) {
return NOT_MEANT_TO_BE_A_POSIX_CLASS;
*updated_parse_ptr = (char *) temp_ptr;
}
- return OOB_NAMEDCLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(OOB_NAMEDCLASS);
}
}
/* We consider something like [^:^alnum:]] to not have been intended to
* be a posix class, but XXX maybe we should */
if (complement) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
complement = 1;
* this leaves this construct looking like [:] or [:^], which almost
* certainly weren't intended to be posix classes */
if (has_opening_bracket) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* But this function can be called when we parse the colon for
/* XXX We are currently very restrictive here, so this code doesn't
* consider the possibility that, say, /[alpha.]]/ was intended to
* be a posix class. */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Here we have something like 'foo:]'. There was no initial colon,
}
/* Otherwise, it can't have meant to have been a class */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* If we ran off the end, and the final character was a punctuation
* class name. (We can do this on the first pass, as any second pass
* will yield an even shorter name) */
if (name_len < 3) {
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
/* Find which class it is. Initially switch on the length of the name.
* */
switch (name_len) {
case 4:
- if (memEQ(name_start, "word", 4)) {
+ if (memEQs(name_start, 4, "word")) {
/* this is not POSIX, this is the Perl \w */
class_number = ANYOF_WORDCHAR;
}
* Offset 4 gives the best switch position. */
switch (name_start[4]) {
case 'a':
- if (memEQ(name_start, "alph", 4)) /* alpha */
+ if (memBEGINs(name_start, 5, "alph")) /* alpha */
class_number = ANYOF_ALPHA;
break;
case 'e':
- if (memEQ(name_start, "spac", 4)) /* space */
+ if (memBEGINs(name_start, 5, "spac")) /* space */
class_number = ANYOF_SPACE;
break;
case 'h':
- if (memEQ(name_start, "grap", 4)) /* graph */
+ if (memBEGINs(name_start, 5, "grap")) /* graph */
class_number = ANYOF_GRAPH;
break;
case 'i':
- if (memEQ(name_start, "asci", 4)) /* ascii */
+ if (memBEGINs(name_start, 5, "asci")) /* ascii */
class_number = ANYOF_ASCII;
break;
case 'k':
- if (memEQ(name_start, "blan", 4)) /* blank */
+ if (memBEGINs(name_start, 5, "blan")) /* blank */
class_number = ANYOF_BLANK;
break;
case 'l':
- if (memEQ(name_start, "cntr", 4)) /* cntrl */
+ if (memBEGINs(name_start, 5, "cntr")) /* cntrl */
class_number = ANYOF_CNTRL;
break;
case 'm':
- if (memEQ(name_start, "alnu", 4)) /* alnum */
+ if (memBEGINs(name_start, 5, "alnu")) /* alnum */
class_number = ANYOF_ALPHANUMERIC;
break;
case 'r':
- if (memEQ(name_start, "lowe", 4)) /* lower */
+ if (memBEGINs(name_start, 5, "lowe")) /* lower */
class_number = (FOLD) ? ANYOF_CASED : ANYOF_LOWER;
- else if (memEQ(name_start, "uppe", 4)) /* upper */
+ else if (memBEGINs(name_start, 5, "uppe")) /* upper */
class_number = (FOLD) ? ANYOF_CASED : ANYOF_UPPER;
break;
case 't':
- if (memEQ(name_start, "digi", 4)) /* digit */
+ if (memBEGINs(name_start, 5, "digi")) /* digit */
class_number = ANYOF_DIGIT;
- else if (memEQ(name_start, "prin", 4)) /* print */
+ else if (memBEGINs(name_start, 5, "prin")) /* print */
class_number = ANYOF_PRINT;
- else if (memEQ(name_start, "punc", 4)) /* punct */
+ else if (memBEGINs(name_start, 5, "punc")) /* punct */
class_number = ANYOF_PUNCT;
break;
}
break;
case 6:
- if (memEQ(name_start, "xdigit", 6))
+ if (memEQs(name_start, 6, "xdigit"))
class_number = ANYOF_XDIGIT;
break;
}
}
/* Here neither pass found a close-enough class name */
- return NOT_MEANT_TO_BE_A_POSIX_CLASS;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(NOT_MEANT_TO_BE_A_POSIX_CLASS);
}
probably_meant_to_be:
/* If it is a known class, return the class. The class number
* #defines are structured so each complement is +1 to the normal
* one */
- return class_number + complement;
+ CLEAR_POSIX_WARNINGS_AND_RETURN(class_number + complement);
}
else if (! check_only) {
'stack' of where the undealt-with left
parens would be if they were actually
put there */
- /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug
+ /* The 'volatile' is a workaround for an optimiser bug
* in Solaris Studio 12.3. See RT #127455 */
- VOL IV fence = 0; /* Position of where most recent undealt-
+ volatile IV fence = 0; /* Position of where most recent undealt-
with left paren in stack is; -1 if none.
*/
STRLEN len; /* Temporary */
PERL_ARGS_ASSERT_HANDLE_REGEX_SETS;
+ DEBUG_PARSE("xcls");
+
if (in_locale) {
set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
}
* these things, we need to realize that something preceded by a backslash
* is escaped, so we have to keep track of backslashes */
if (SIZE_ONLY) {
- UV depth = 0; /* how many nested (?[...]) constructs */
+ UV nest_depth = 0; /* how many nested (?[...]) constructs */
while (RExC_parse < RExC_end) {
SV* current = NULL;
TRUE /* Force /x */ );
switch (*RExC_parse) {
- case '?':
- if (RExC_parse[1] == '[') depth++, RExC_parse++;
+ case '(':
+ if (RExC_parse[1] == '?' && RExC_parse[2] == '[')
+ nest_depth++, RExC_parse+=2;
/* FALLTHROUGH */
default:
break;
}
case ']':
- if (depth--) break;
- RExC_parse++;
- if (*RExC_parse == ')') {
+ if (RExC_parse[1] == ')') {
+ RExC_parse++;
+ if (nest_depth--) break;
node = reganode(pRExC_state, ANYOF, 0);
RExC_size += ANYOF_SKIP;
nextchar(pRExC_state);
return node;
}
- goto no_close;
+ /* We output the messages even if warnings are off, because we'll fail
+ * the very next thing, and these give a likely diagnosis for that */
+ if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
+ output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
+ }
+ RExC_parse++;
+ vFAIL("Unexpected ']' with no following ')' in (?[...");
}
RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
}
- no_close:
/* We output the messages even if warnings are off, because we'll fail
* the very next thing, and these give a likely diagnosis for that */
if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) {
output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL);
}
- FAIL("Syntax error in (?[...])");
+ vFAIL("Syntax error in (?[...])");
}
/* Pass 2 only after this. */
* inversion list, and RExC_parse points to the trailing
* ']'; the next character should be the ')' */
RExC_parse++;
- assert(UCHARAT(RExC_parse) == ')');
+ if (UCHARAT(RExC_parse) != ')')
+ vFAIL("Expecting close paren for nested extended charclass");
/* Then the ')' matching the original '(' handled by this
* case: statement */
RExC_parse++;
- assert(UCHARAT(RExC_parse) == ')');
+ if (UCHARAT(RExC_parse) != ')')
+ vFAIL("Expecting close paren for wrapper for nested extended charclass");
RExC_parse++;
RExC_flags = save_flags;
do_posix_warnings ? &posix_warnings : NULL,
TRUE /* checking only */);
}
+ else if ( strict && ! skip_white
+ && ( _generic_isCC(value, _CC_VERTSPACE)
+ || is_VERTWS_cp_high(value)))
+ {
+ vFAIL("Literal vertical space in [] is illegal except under /x");
+ }
else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
vFAIL2("Empty \\%c", (U8)value);
if (*RExC_parse == '{') {
const U8 c = (U8)value;
- e = strchr(RExC_parse, '}');
+ e = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse);
if (!e) {
RExC_parse++;
vFAIL2("Missing right brace on \\%c{}", c);
* referred to outside it. [perl #121777] */
if (! has_pkg && curpkg) {
char* pkgname = HvNAME(curpkg);
- if (strNE(pkgname, "main")) {
+ if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) {
char* full_name = Perl_form(aTHX_
"%s::%s",
pkgname,
{
const char* error_msg;
bool valid = grok_bslash_o(&RExC_parse,
+ RExC_end,
&value,
&error_msg,
PASS2, /* warnings only in
{
const char* error_msg;
bool valid = grok_bslash_x(&RExC_parse,
+ RExC_end,
&value,
&error_msg,
PASS2, /* Output warnings */
{
/* Here <value> is indeed a multi-char fold. Get what it is */
- U8 foldbuf[UTF8_MAXBYTES_CASE];
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
UV folded = _to_uni_fold_flags(
" be some subset of \"0-9\","
" \"A-Z\", or \"a-z\"");
}
- else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+ else if (prevvalue >= FIRST_NON_ASCII_DECIMAL_DIGIT) {
SSize_t index_start;
SSize_t index_final;
* can't do the same checks for above-ASCII ranges,
* except in the case of digit ones. These should
* contain only digits from the same group of 10. The
- * ASCII case is handled just above. 0x660 is the
- * first digit character beyond ASCII. Hence here, the
+ * ASCII case is handled just above. Hence here, the
* range could be a range of digits. First some
* unlikely special cases. Grandfather in that a range
* ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad
/* The actual POSIXish node for all the rest depends on the
* charset modifier. The ones in the first set depend only on
* ASCII or, if available on this platform, also locale */
+
case ANYOF_ASCII:
case ANYOF_NASCII:
+
#ifdef HAS_ISASCII
- op = (LOC) ? POSIXL : POSIXA;
-#else
- op = POSIXA;
+ if (LOC) {
+ op = POSIXL;
+ goto join_posix;
+ }
#endif
- goto join_posix;
+ /* (named_class - ANYOF_ASCII) is 0 or 1. xor'ing with
+ * invert converts that to 1 or 0 */
+ op = ASCII + ((namedclass - ANYOF_ASCII) ^ invert);
+ break;
/* The following don't have any matches in the upper Latin1
* range, hence /d is equivalent to /u for them. Making it /u
TRUE /* downgradable to EXACT */
);
}
+ else {
+ *flagp |= HASWIDTH|SIMPLE;
+ }
RExC_parse = (char *) cur_parse;
if (_invlist_len(only_non_utf8_list) != 0) {
ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
}
+ SvREFCNT_dec_NN(only_non_utf8_list);
}
else {
/* Here there were no complemented posix classes. That means
* certain common classes that are easy to test. Getting to this point in
* the code means that the class didn't get optimized there. Since this
* code is only executed in Pass 2, it is too late to save space--it has
- * been allocated in Pass 1, and currently isn't given back. But turning
- * things into an EXACTish node can allow the optimizer to join it to any
- * adjacent such nodes. And if the class is equivalent to things like /./,
- * expensive run-time swashes can be avoided. Now that we have more
- * complete information, we can find things necessarily missed by the
- * earlier code. Another possible "optimization" that isn't done is that
- * something like [Ee] could be changed into an EXACTFU. khw tried this
- * and found that the ANYOF is faster, including for code points not in the
- * bitmap. This still might make sense to do, provided it got joined with
- * an adjacent node(s) to create a longer EXACTFU one. This could be
- * accomplished by creating a pseudo ANYOF_EXACTFU node type that the join
- * routine would know is joinable. If that didn't happen, the node type
- * could then be made a straight ANYOF */
+ * been allocated in Pass 1, and currently isn't given back. XXX Why not?
+ * But turning things into an EXACTish node can allow the optimizer to join
+ * it to any adjacent such nodes. And if the class is equivalent to things
+ * like /./, expensive run-time swashes can be avoided. Now that we have
+ * more complete information, we can find things necessarily missed by the
+ * earlier code. */
if (optimizable && cp_list && ! invert) {
UV start, end;
U8 op = END; /* The optimzation node-type */
int posix_class = -1; /* Illegal value */
const char * cur_parse= RExC_parse;
+ U8 ANYOFM_mask = 0xFF;
+ U32 anode_arg = 0;
invlist_iterinit(cp_list);
if (! invlist_iternext(cp_list, &start, &end)) {
invlist_iterfinish(cp_list);
if (op == END) {
- const UV cp_list_len = _invlist_len(cp_list);
- const UV* cp_list_array = invlist_array(cp_list);
/* Here, didn't find an optimization. See if this matches any of
- * the POSIX classes. These run slightly faster for above-Unicode
- * code points, so don't bother with POSIXA ones nor the 2 that
- * have no above-Unicode matches. We can avoid these checks unless
- * the ANYOF matches at least as high as the lowest POSIX one
- * (which was manually found to be \v. The actual code point may
- * increase in later Unicode releases, if a higher code point is
- * assigned to be \v, but this code will never break. It would
- * just mean we could execute the checks for posix optimizations
- * unnecessarily) */
-
- if (cp_list_array[cp_list_len-1] > 0x2029) {
+ * the POSIX classes. First try ASCII */
+
+ if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) {
+ op = ASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) {
+ op = NASCII;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (invlist_highest(cp_list) >= 0x2029) {
+
+ /* Then try the other POSIX classes. The POSIXA ones are about
+ * the same speed as ANYOF ops, but the ones that have
+ * above-Latin1 code point matches are somewhat faster than
+ * ANYOF. So optimize those, but don't bother with the POSIXA
+ * ones nor [:cntrl:] which has no above-Latin1 matches. If
+ * this ANYOF node has a lower highest possible matching code
+ * point than any of the XPosix ones, we know that it can't
+ * possibly be the same as any of them, so we can avoid
+ * executing this code. The 0x2029 above for the lowest max
+ * was determined by manual inspection of the classes, and
+ * comes from \v. Suppose Unicode in a later version adds a
+ * higher code point to \v. All that means is that this code
+ * can be executed unnecessarily. It will still give the
+ * correct answer. */
+
for (posix_class = 0;
posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
posix_class++)
{
int try_inverted;
- if (posix_class == _CC_ASCII || posix_class == _CC_CNTRL) {
+
+ if (posix_class == _CC_CNTRL) {
continue;
}
+
for (try_inverted = 0; try_inverted < 2; try_inverted++) {
/* Check if matches normal or inverted */
}
found_posix: ;
}
+
+ /* If it didn't match a POSIX class, it might be able to be turned
+ * into an ANYOFM node. Compare two different bytes, bit-by-bit.
+ * In some positions, the bits in each will be 1; and in other
+ * positions both will be 0; and in some positions the bit will be
+ * 1 in one byte, and 0 in the other. Let 'n' be the number of
+ * positions where the bits differ. We create a mask which has
+ * exactly 'n' 0 bits, each in a position where the two bytes
+ * differ. Now take the set of all bytes that when ANDed with the
+ * mask yield the same result. That set has 2**n elements, and is
+ * representable by just two 8 bit numbers: the result and the
+ * mask. Importantly, matching the set can be vectorized by
+ * creating a word full of the result bytes, and a word full of the
+ * mask bytes, yielding a significant speed up. Here, see if this
+ * node matches such a set. As a concrete example consider [01],
+ * and the byte representing '0' which is 0x30 on ASCII machines.
+ * It has the bits 0011 0000. Take the mask 1111 1110. If we AND
+ * 0x31 and 0x30 with that mask we get 0x30. Any other bytes ANDed
+ * yield something else. So [01], which is a common usage, is
+ * optimizable into ANYOFM, and can benefit from the speed up. We
+ * can only do this on UTF-8 invariant bytes, because the variance
+ * would throw this off. */
+ if ( op == END
+ && invlist_highest(cp_list) <=
+#ifdef EBCDIC
+ 0xFF
+#else
+ 0x7F
+#endif
+ ) {
+ Size_t cp_count = 0;
+ bool first_time = TRUE;
+ unsigned int lowest_cp = 0xFF;
+ U8 bits_differing = 0;
+
+ /* Only needed on EBCDIC, as there, variants and non- are mixed
+ * together. Could #ifdef it out on ASCII, but probably the
+ * compiler will optimize it out */
+ bool has_variant = FALSE;
+
+ /* Go through the bytes and find the bit positions that differ */
+ invlist_iterinit(cp_list);
+ while (invlist_iternext(cp_list, &start, &end)) {
+ unsigned int i = start;
+
+ cp_count += end - start + 1;
+
+ if (first_time) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ first_time = FALSE;
+ lowest_cp = start;
+
+ i++;
+ }
+
+ /* Find the bit positions that differ from the lowest code
+ * point in the node. Keep track of all such positions by
+ * OR'ing */
+ for (; i <= end; i++) {
+ if (! UVCHR_IS_INVARIANT(i)) {
+ has_variant = TRUE;
+ continue;
+ }
+
+ bits_differing |= i ^ lowest_cp;
+ }
+ }
+ invlist_iterfinish(cp_list);
+
+ /* At the end of the loop, we count how many bits differ from
+ * the bits in lowest code point, call the count 'd'. If the
+ * set we found contains 2**d elements, it is the closure of
+ * all code points that differ only in those bit positions. To
+ * convince yourself of that, first note that the number in the
+ * closure must be a power of 2, which we test for. The only
+ * way we could have that count and it be some differing set,
+ * is if we got some code points that don't differ from the
+ * lowest code point in any position, but do differ from each
+ * other in some other position. That means one code point has
+ * a 1 in that position, and another has a 0. But that would
+ * mean that one of them differs from the lowest code point in
+ * that position, which possibility we've already excluded. */
+ if ( ! has_variant
+ && cp_count == 1U << PL_bitcount[bits_differing])
+ {
+ assert(cp_count > 1);
+ op = ANYOFM;
+
+ /* We need to make the bits that differ be 0's */
+ ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */
+
+ /* The argument is the lowest code point */
+ anode_arg = lowest_cp;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ }
}
if (op != END) {
RExC_emit = (regnode *)orig_emit;
if (regarglen[op]) {
- ret = reganode(pRExC_state, op, 0);
+ ret = reganode(pRExC_state, op, anode_arg);
} else {
ret = reg_node(pRExC_state, op);
}
else if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) {
FLAGS(ret) = posix_class;
}
+ else if (PL_regkind[op] == ANYOFM) {
+ FLAGS(ret) = ANYOFM_mask;
+ }
SvREFCNT_dec_NN(cp_list);
return ret;
* if (PASS2)
* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE;
*
+* ALSO NOTE - operand->flags will be set to 0 as well.
*/
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth)
#endif
}
-
place = operand; /* Op node, where operand used to be. */
#ifdef RE_TRACK_PATTERN_OFFSETS
if (RExC_offsets) { /* MJD */
}
#endif
src = NEXTOPER(place);
+ place->flags = 0;
FILL_ADVANCE_NODE(place, op);
Zero(src, offset, regnode);
}
}
#endif
+STATIC SV*
+S_get_ANYOFM_contents(pTHX_ const regnode * n) {
+
+ /* Returns an inversion list of all the code points matched by the ANYOFM
+ * node 'n' */
+
+ SV * cp_list = _new_invlist(-1);
+ const U8 lowest = ARG(n);
+ unsigned int i;
+ U8 count = 0;
+ U8 needed = 1U << PL_bitcount[ (U8) ~ FLAGS(n)];
+
+ PERL_ARGS_ASSERT_GET_ANYOFM_CONTENTS;
+
+ /* Starting with the lowest code point, any code point that ANDed with the
+ * mask yields the lowest code point is in the set */
+ for (i = lowest; i <= 0xFF; i++) {
+ if ((i & FLAGS(n)) == ARG(n)) {
+ cp_list = add_cp_to_invlist(cp_list, i);
+ count++;
+
+ /* We know how many code points (a power of two) that are in the
+ * set. No use looking once we've got that number */
+ if (count >= needed) break;
+ }
+ }
+
+ return cp_list;
+}
+
/*
- regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
*/
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
+ int i;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
(void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
/* Header fields of interest. */
- if (r->anchored_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
- RE_SV_DUMPLEN(r->anchored_substr), 30);
- Perl_re_printf( aTHX_
- "anchored %s%s at %" IVdf " ",
- s, RE_SV_TAIL(r->anchored_substr),
- (IV)r->anchored_offset);
- } else if (r->anchored_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
- RE_SV_DUMPLEN(r->anchored_utf8), 30);
- Perl_re_printf( aTHX_
- "anchored utf8 %s%s at %" IVdf " ",
- s, RE_SV_TAIL(r->anchored_utf8),
- (IV)r->anchored_offset);
- }
- if (r->float_substr) {
- RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
- RE_SV_DUMPLEN(r->float_substr), 30);
- Perl_re_printf( aTHX_
- "floating %s%s at %" IVdf "..%" UVuf " ",
- s, RE_SV_TAIL(r->float_substr),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
- } else if (r->float_utf8) {
- RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
- RE_SV_DUMPLEN(r->float_utf8), 30);
- Perl_re_printf( aTHX_
- "floating utf8 %s%s at %" IVdf "..%" UVuf " ",
- s, RE_SV_TAIL(r->float_utf8),
- (IV)r->float_min_offset, (UV)r->float_max_offset);
+ for (i = 0; i < 2; i++) {
+ if (r->substrs->data[i].substr) {
+ RE_PV_QUOTED_DECL(s, 0, dsv,
+ SvPVX_const(r->substrs->data[i].substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].substr),
+ PL_dump_re_max_len);
+ Perl_re_printf( aTHX_
+ "%s %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
+ else if (r->substrs->data[i].utf8_substr) {
+ RE_PV_QUOTED_DECL(s, 1, dsv,
+ SvPVX_const(r->substrs->data[i].utf8_substr),
+ RE_SV_DUMPLEN(r->substrs->data[i].utf8_substr),
+ 30);
+ Perl_re_printf( aTHX_
+ "%s utf8 %s%s at %" IVdf "..%" UVuf " ",
+ i ? "floating" : "anchored",
+ s,
+ RE_SV_TAIL(r->substrs->data[i].utf8_substr),
+ (IV)r->substrs->data[i].min_offset,
+ (UV)r->substrs->data[i].max_offset);
+ }
}
+
if (r->check_substr || r->check_utf8)
Perl_re_printf( aTHX_
(const char *)
- (r->check_substr == r->float_substr
- && r->check_utf8 == r->float_utf8
+ ( r->check_substr == r->substrs->data[1].substr
+ && r->check_utf8 == r->substrs->data[1].utf8_substr
? "(checking floating" : "(checking anchored"));
if (r->intflags & PREGf_NOSCAN)
Perl_re_printf( aTHX_ " noscan");
* is a crude hack but it may be the best for now since
* we have no flag "this EXACTish node was UTF-8"
* --jhi */
- pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
+ pv_pretty(sv, STRING(o), STR_LEN(o), PL_dump_re_max_len,
+ PL_colors[0], PL_colors[1],
PERL_PV_ESCAPE_UNI_DETECT |
PERL_PV_ESCAPE_NONASCII |
PERL_PV_PRETTY_ELLIPSES |
if ( k == REF && reginfo) {
U32 n = ARG(o); /* which paren pair */
I32 ln = prog->offs[n].start;
- if (prog->lastparen < n || ln == -1)
+ if (prog->lastparen < n || ln == -1 || prog->offs[n].end == -1)
Perl_sv_catpvf(aTHX_ sv, ": FAIL");
else if (ln == prog->offs[n].end)
Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
SV* contents;
/* See if truncation size is overridden */
- const STRLEN dump_len = (PL_dump_re_max_len)
+ const STRLEN dump_len = (PL_dump_re_max_len > 256)
? PL_dump_re_max_len
: 256;
SvREFCNT_dec(unresolved);
}
+ else if (k == ANYOFM) {
+ SV * cp_list = get_ANYOFM_contents(o);
+
+ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
+ put_charclass_bitmap_innards(sv, NULL, cp_list, NULL, NULL, TRUE);
+ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
+
+ SvREFCNT_dec(cp_list);
+ }
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
if (index < C_ARRAY_LENGTH(anyofs)) {
/* add on the verb argument if there is one */
if ( ( k == VERB || OP(o) == ACCEPT || OP(o) == OPFAIL ) && o->flags) {
- Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
+ if ( ARG(o) )
+ Perl_sv_catpvf(aTHX_ sv, ":%" SVf,
SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
+ else
+ sv_catpvs(sv, ":NULL");
}
#else
PERL_UNUSED_CONTEXT;
PL_colors[5],PL_colors[0],
s,
PL_colors[1],
- (strlen(s) > 60 ? "..." : ""));
+ (strlen(s) > PL_dump_re_max_len ? "..." : ""));
} );
/* use UTF8 check substring if regexp pattern itself is in UTF8 */
} else {
CALLREGFREE_PVT(rx); /* free the private data */
SvREFCNT_dec(RXp_PAREN_NAMES(r));
- Safefree(r->xpv_len_u.xpvlenu_pv);
}
if (r->substrs) {
- SvREFCNT_dec(r->anchored_substr);
- SvREFCNT_dec(r->anchored_utf8);
- SvREFCNT_dec(r->float_substr);
- SvREFCNT_dec(r->float_utf8);
+ int i;
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_dec(r->substrs->data[i].substr);
+ SvREFCNT_dec(r->substrs->data[i].utf8_substr);
+ }
Safefree(r->substrs);
}
RX_MATCH_COPY_FREE(rx);
SvREFCNT_dec(r->qr_anoncv);
if (r->recurse_locinput)
Safefree(r->recurse_locinput);
- rx->sv_u.svu_rx = 0;
}
+
/* reg_temp_copy()
- This is a hacky workaround to the structural issue of match results
+ Copy ssv to dsv, both of which should of type SVt_REGEXP or SVt_PVLV,
+ except that dsv will be created if NULL.
+
+ This function is used in two main ways. First to implement
+ $r = qr/....; $s = $$r;
+
+ Secondly, it is used as a hacky workaround to the structural issue of
+ match results
being stored in the regexp structure which is in turn stored in
PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
could be PL_curpm in multiple contexts, and could require multiple
REGEXP *
-Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
+Perl_reg_temp_copy(pTHX_ REGEXP *dsv, REGEXP *ssv)
{
- struct regexp *ret;
- struct regexp *const r = ReANY(rx);
- const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
+ struct regexp *drx;
+ struct regexp *const srx = ReANY(ssv);
+ const bool islv = dsv && SvTYPE(dsv) == SVt_PVLV;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
- if (!ret_x)
- ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ if (!dsv)
+ dsv = (REGEXP*) newSV_type(SVt_REGEXP);
else {
- SvOK_off((SV *)ret_x);
+ SvOK_off((SV *)dsv);
if (islv) {
- /* For PVLVs, SvANY points to the xpvlv body while sv_u points
- to the regexp. (For SVt_REGEXPs, sv_upgrade has already
- made both spots point to the same regexp body.) */
+ /* For PVLVs, the head (sv_any) points to an XPVLV, while
+ * the LV's xpvlenu_rx will point to a regexp body, which
+ * we allocate here */
REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
- assert(!SvPVX(ret_x));
- ret_x->sv_u.svu_rx = temp->sv_any;
+ assert(!SvPVX(dsv));
+ ((XPV*)SvANY(dsv))->xpv_len_u.xpvlenu_rx = temp->sv_any;
temp->sv_any = NULL;
SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
SvREFCNT_dec_NN(temp);
/* SvCUR still resides in the xpvlv struct, so the regexp copy-
ing below will not set it. */
- SvCUR_set(ret_x, SvCUR(rx));
+ SvCUR_set(dsv, SvCUR(ssv));
}
}
/* This ensures that SvTHINKFIRST(sv) is true, and hence that
sv_force_normal(sv) is called. */
- SvFAKE_on(ret_x);
- ret = ReANY(ret_x);
+ SvFAKE_on(dsv);
+ drx = ReANY(dsv);
- SvFLAGS(ret_x) |= SvUTF8(rx);
+ SvFLAGS(dsv) |= SvFLAGS(ssv) & (SVf_POK|SVp_POK|SVf_UTF8);
+ SvPV_set(dsv, RX_WRAPPED(ssv));
/* We share the same string buffer as the original regexp, on which we
hold a reference count, incremented when mother_re is set below.
The string pointer is copied here, being part of the regexp struct.
*/
- memcpy(&(ret->xpv_cur), &(r->xpv_cur),
+ memcpy(&(drx->xpv_cur), &(srx->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
- if (r->offs) {
- const I32 npar = r->nparens+1;
- Newx(ret->offs, npar, regexp_paren_pair);
- Copy(r->offs, ret->offs, npar, regexp_paren_pair);
- }
- if (r->substrs) {
- Newx(ret->substrs, 1, struct reg_substr_data);
- StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
+ if (!islv)
+ SvLEN_set(dsv, 0);
+ if (srx->offs) {
+ const I32 npar = srx->nparens+1;
+ Newx(drx->offs, npar, regexp_paren_pair);
+ Copy(srx->offs, drx->offs, npar, regexp_paren_pair);
+ }
+ if (srx->substrs) {
+ int i;
+ Newx(drx->substrs, 1, struct reg_substr_data);
+ StructCopy(srx->substrs, drx->substrs, struct reg_substr_data);
- SvREFCNT_inc_void(ret->anchored_substr);
- SvREFCNT_inc_void(ret->anchored_utf8);
- SvREFCNT_inc_void(ret->float_substr);
- SvREFCNT_inc_void(ret->float_utf8);
+ for (i = 0; i < 2; i++) {
+ SvREFCNT_inc_void(drx->substrs->data[i].substr);
+ SvREFCNT_inc_void(drx->substrs->data[i].utf8_substr);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
}
- RX_MATCH_COPIED_off(ret_x);
+ RX_MATCH_COPIED_off(dsv);
#ifdef PERL_ANY_COW
- ret->saved_copy = NULL;
+ drx->saved_copy = NULL;
#endif
- ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
- SvREFCNT_inc_void(ret->qr_anoncv);
- if (r->recurse_locinput)
- Newxz(ret->recurse_locinput,r->nparens + 1,char *);
+ drx->mother_re = ReREFCNT_inc(srx->mother_re ? srx->mother_re : ssv);
+ SvREFCNT_inc_void(drx->qr_anoncv);
+ if (srx->recurse_locinput)
+ Newx(drx->recurse_locinput,srx->nparens + 1,char *);
- return ret_x;
+ return dsv;
}
#endif
+
/* regfree_internal()
Free the private data in a regexp. This is overloadable by
{
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
- dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
+ dsv, RX_PRECOMP(rx), RX_PRELEN(rx), PL_dump_re_max_len);
Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n",
PL_colors[4],PL_colors[5],s);
}
/* Do it this way to avoid reading from *r after the StructCopy().
That way, if any of the sv_dup_inc()s dislodge *r from the L1
cache, it doesn't matter. */
+ int i;
const bool anchored = r->check_substr
- ? r->check_substr == r->anchored_substr
- : r->check_utf8 == r->anchored_utf8;
+ ? r->check_substr == r->substrs->data[0].substr
+ : r->check_utf8 == r->substrs->data[0].utf8_substr;
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
- ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
- ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
- ret->float_substr = sv_dup_inc(ret->float_substr, param);
- ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
+ for (i = 0; i < 2; i++) {
+ ret->substrs->data[i].substr =
+ sv_dup_inc(ret->substrs->data[i].substr, param);
+ ret->substrs->data[i].utf8_substr =
+ sv_dup_inc(ret->substrs->data[i].utf8_substr, param);
+ }
/* check_substr and check_utf8, if non-NULL, point to either their
anchored or float namesakes, and don't hold a second reference. */
if (ret->check_substr) {
if (anchored) {
- assert(r->check_utf8 == r->anchored_utf8);
- ret->check_substr = ret->anchored_substr;
- ret->check_utf8 = ret->anchored_utf8;
+ assert(r->check_utf8 == r->substrs->data[0].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[0].substr;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- assert(r->check_substr == r->float_substr);
- assert(r->check_utf8 == r->float_utf8);
- ret->check_substr = ret->float_substr;
- ret->check_utf8 = ret->float_utf8;
+ assert(r->check_substr == r->substrs->data[1].substr);
+ assert(r->check_utf8 == r->substrs->data[1].utf8_substr);
+
+ ret->check_substr = ret->substrs->data[1].substr;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
} else if (ret->check_utf8) {
if (anchored) {
- ret->check_utf8 = ret->anchored_utf8;
+ ret->check_utf8 = ret->substrs->data[0].utf8_substr;
} else {
- ret->check_utf8 = ret->float_utf8;
+ ret->check_utf8 = ret->substrs->data[1].utf8_substr;
}
}
}
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
if (r->recurse_locinput)
- Newxz(ret->recurse_locinput,r->nparens + 1,char *);
+ Newx(ret->recurse_locinput,r->nparens + 1,char *);
if (ret->pprivate)
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
1: a buffer in a different thread
2: something we no longer hold a reference on
so we need to copy it locally. */
- RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
+ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED_const(sstr), SvCUR(sstr)+1);
ret->mother_re = NULL;
}
#endif /* PERL_IN_XSUB_RE */
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
/* see also regcomp.h and regfree_internal() */
- case 'a': /* actually an AV, but the dup function is identical. */
- case 'r':
- case 's':
- case 'S':
- case 'u': /* actually an HV, but the dup function is identical. */
+ case 'a': /* actually an AV, but the dup function is identical.
+ values seem to be "plain sv's" generally. */
+ case 'r': /* a compiled regex (but still just another SV) */
+ case 's': /* an RV (currently only used for an RV to an AV by the ANYOF code)
+ this use case should go away, the code could have used
+ 'a' instead - see S_set_ANYOF_arg() for array contents. */
+ case 'S': /* actually an SV, but the dup function is identical. */
+ case 'u': /* actually an HV, but the dup function is identical.
+ values are "plain sv's" */
d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
break;
case 'f':
+ /* Synthetic Start Class - "Fake" charclass we generate to optimize
+ * patterns which could start with several different things. Pre-TRIE
+ * this was more important than it is now, however this still helps
+ * in some places, for instance /x?a+/ might produce a SSC equivalent
+ * to [xa]. This is used by Perl_re_intuit_start() and S_find_byclass()
+ * in regexec.c
+ */
/* This is cheating. */
Newx(d->data[i], 1, regnode_ssc);
StructCopy(ri->data->data[i], d->data[i], regnode_ssc);
reti->regstclass = (regnode*)d->data[i];
break;
case 'T':
- /* Trie stclasses are readonly and can thus be shared
+ /* AHO-CORASICK fail table */
+ /* Trie stclasses are readonly and can thus be shared
* without duplication. We free the stclass in pregfree
* when the corresponding reg_ac_data struct is freed.
*/
reti->regstclass= ri->regstclass;
/* FALLTHROUGH */
case 't':
+ /* TRIE transition table */
OP_REFCNT_LOCK;
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
/* FALLTHROUGH */
- case 'l':
- case 'L':
+ case 'l': /* (?{...}) or (??{ ... }) code (cb->block) */
+ case 'L': /* same when RExC_pm_flags & PMf_HAS_CV and code
+ is not from another regexp */
d->data[i] = ri->data->data[i];
break;
default:
#else
format = "\\x%02" UVXf "-\\x%02" UVXf;
#endif
- GCC_DIAG_IGNORE(-Wformat-nonliteral);
+ GCC_DIAG_IGNORE_STMT(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
- GCC_DIAG_RESTORE;
+ GCC_DIAG_RESTORE_STMT;
break;
}
}
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class defined by the other arguments:
- * 'bitmap' points to the bitmap.
+ * 'bitmap' points to the bitmap, or NULL if to ignore that.
* 'nonbitmap_invlist' is an inversion list of the code points that are in
* the bitmap range, but for some reason aren't in the bitmap; NULL if
* none. The reasons for this could be that they require some
* was not resolved at the time of the regex compilation (under /u)
* 'only_utf8_locale_invlist' is an inversion list of the code points that
* are valid only if the runtime locale is a UTF-8 one; NULL if none
- * 'node' is the regex pattern node. It is needed only when the above two
- * parameters are not null, and is passed so that this routine can
- * tease apart the various reasons for them.
+ * 'node' is the regex pattern ANYOF node. It is needed only when the
+ * above two parameters are not null, and is passed so that this
+ * routine can tease apart the various reasons for them.
* 'force_as_is_display' is TRUE if this routine should definitely NOT try
* to invert things to see if that leads to a cleaner display. If
* FALSE, this routine is free to use its judgment about doing this.
}
/* Accumulate the bit map into the unconditional match list */
- for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
- if (BITMAP_TEST(bitmap, i)) {
- int start = i++;
- for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) {
- /* empty */
+ if (bitmap) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST(bitmap, i)) {
+ int start = i++;
+ for (;
+ i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i);
+ i++)
+ { /* empty */ }
+ invlist = _add_range_to_invlist(invlist, start, i-1);
}
- invlist = _add_range_to_invlist(invlist, start, i-1);
}
}
/* While that wasn't END last time... */
NODE_ALIGN(node);
op = OP(node);
- if (op == CLOSE || op == WHILEM)
+ if (op == CLOSE || op == SRCLOSE || op == WHILEM)
indent--;
next = regnext((regnode *)node);
indent+3,
elem_ptr
? pv_pretty(sv, SvPV_nolen_const(*elem_ptr),
- SvCUR(*elem_ptr), 60,
+ SvCUR(*elem_ptr), PL_dump_re_max_len,
PL_colors[0], PL_colors[1],
(SvUTF8(*elem_ptr)
? PERL_PV_ESCAPE_UNI
node = NEXTOPER(node);
node += regarglen[(U8)op];
}
- if (op == CURLYX || op == OPEN)
+ if (op == CURLYX || op == OPEN || op == SROPEN)
indent++;
}
CLEAR_OPTSTART;