#define STATIC static
#endif
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* this is a chain of data about sub patterns we are processing that
+ need to be handled separately/specially in study_chunk. Its so
+ we can simulate recursion without losing state. */
+struct scan_frame;
+typedef struct scan_frame {
+ regnode *last_regnode; /* last node to process in this frame */
+ regnode *next_regnode; /* next node to process when last is reached */
+ U32 prev_recursed_depth;
+ I32 stopparen; /* what stopparen do we use */
+ U32 is_top_frame; /* what flags do we use? */
+
+ struct scan_frame *this_prev_frame; /* this previous frame */
+ struct scan_frame *prev_frame; /* previous frame */
+ struct scan_frame *next_frame; /* next frame */
+} scan_frame;
+
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c) \
+ ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
struct RExC_state_t {
U32 flags; /* RXf_* are we folding, multilining? */
regnode **recurse; /* Recurse regops */
I32 recurse_count; /* Number of recurse regops */
- U8 *study_chunk_recursed; /* bitmap of which parens we have moved
+ U8 *study_chunk_recursed; /* bitmap of which subs we have moved
through */
U32 study_chunk_recursed_bytes; /* bytes in bitmap */
I32 in_lookbehind;
int num_code_blocks; /* size of code_blocks[] */
int code_index; /* next code_blocks[] slot */
SSize_t maxlen; /* mininum possible number of chars in string to match */
+ scan_frame *frame_head;
+ scan_frame *frame_last;
+ U32 frame_count;
+ U32 strict;
#ifdef ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
I32 lastnum;
AV *paren_name_list; /* idx -> name */
U32 study_chunk_recursed_count;
+ SV *mysv1;
+ SV *mysv2;
#define RExC_lastparse (pRExC_state->lastparse)
#define RExC_lastnum (pRExC_state->lastnum)
#define RExC_paren_name_list (pRExC_state->paren_name_list)
#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
+#define RExC_mysv (pRExC_state->mysv1)
+#define RExC_mysv1 (pRExC_state->mysv1)
+#define RExC_mysv2 (pRExC_state->mysv2)
+
#endif
};
#define RExC_emit_dummy (pRExC_state->emit_dummy)
#define RExC_emit_start (pRExC_state->emit_start)
#define RExC_emit_bound (pRExC_state->emit_bound)
-#define RExC_naughty (pRExC_state->naughty)
#define RExC_sawback (pRExC_state->sawback)
#define RExC_seen (pRExC_state->seen)
#define RExC_size (pRExC_state->size)
#define RExC_contains_i (pRExC_state->contains_i)
#define RExC_override_recoding (pRExC_state->override_recoding)
#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
-
+#define RExC_frame_head (pRExC_state->frame_head)
+#define RExC_frame_last (pRExC_state->frame_last)
+#define RExC_frame_count (pRExC_state->frame_count)
+#define RExC_strict (pRExC_state->strict)
+
+/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
+ * a flag to disable back-off on the fixed/floating substrings - if it's
+ * a high complexity pattern we assume the benefit of avoiding a full match
+ * is worth the cost of checking for the substrings even if they rarely help.
+ */
+#define RExC_naughty (pRExC_state->naughty)
+#define TOO_NAUGHTY (10)
+#define MARK_NAUGHTY(add) \
+ if (RExC_naughty < TOO_NAUGHTY) \
+ RExC_naughty += (add)
+#define MARK_NAUGHTY_EXP(exp, add) \
+ if (RExC_naughty < TOO_NAUGHTY) \
+ RExC_naughty += RExC_naughty / (exp) + (add)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
#define SCF_SEEN_ACCEPT 0x8000
#define SCF_TRIE_DOING_RESTUDY 0x10000
+#define SCF_IN_DEFINE 0x20000
+
+
+
#define UTF cBOOL(RExC_utf8)
REPORT_LOCATION_ARGS(offset)); \
} STMT_END
+#define vWARN(loc, m) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
+ REPORT_LOCATION_ARGS(offset)); \
+} STMT_END
+
#define vWARN_dep(loc, m) STMT_START { \
const IV offset = loc - RExC_precomp; \
__ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
} STMT_END
-
-/* Allow for side effects in s */
-#define REGC(c,s) STMT_START { \
- if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
-} STMT_END
-
/* Macros for recording node offsets. 20001227 mjd@plover.com
* Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
* element 2*n-1 of the array. Element #2n holds the byte length node #n.
PerlIO_printf(Perl_debug_log,"\n"); \
});
+#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
+ if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
+
+#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
+ if ( ( flags ) ) { \
+ PerlIO_printf(Perl_debug_log, "%s", open_str); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
+ DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
+ PerlIO_printf(Perl_debug_log, "%s", close_str); \
+ }
+
+
#define DEBUG_STUDYDATA(str,data,depth) \
DEBUG_OPTIMISE_MORE_r(if(data){ \
PerlIO_printf(Perl_debug_log, \
"%*s" str "Pos:%"IVdf"/%"IVdf \
- " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
+ " Flags: 0x%"UVXf, \
(int)(depth)*2, "", \
(IV)((data)->pos_min), \
(IV)((data)->pos_delta), \
- (UV)((data)->flags), \
+ (UV)((data)->flags) \
+ ); \
+ DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
+ PerlIO_printf(Perl_debug_log, \
+ " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
(IV)((data)->whilem_c), \
(IV)((data)->last_closep ? *((data)->last_closep) : -1), \
is_inf ? "INF " : "" \
PerlIO_printf(Perl_debug_log,"\n"); \
});
-#ifdef DEBUGGING
-
/* is c a control character for which we have a mnemonic? */
#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
return NULL;
}
-#endif
-
/* Mark that we cannot extend a found fixed substring at this point.
Update the longest found anchored substring and the longest found
floating substrings if needed. */
else { /* *data->longest == data->longest_float */
data->offset_float_min = l ? data->last_start_min : data->pos_min;
data->offset_float_max = (l
- ? data->last_start_max
- : (data->pos_delta == SSize_t_MAX
+ ? data->last_start_max
+ : (data->pos_delta > SSize_t_MAX - data->pos_min
? SSize_t_MAX
: data->pos_min + data->pos_delta));
if (is_inf
ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
ssc_anything(ssc);
- /* If any portion of the regex is to operate under locale rules,
- * initialization includes it. The reason this isn't done for all regexes
- * is that the optimizer was written under the assumption that locale was
- * all-or-nothing. Given the complexity and lack of documentation in the
- * optimizer, and that there are inadequate test cases for locale, many
- * parts of it may not work properly, it is safest to avoid locale unless
- * necessary. */
+ /* If any portion of the regex is to operate under locale rules that aren't
+ * fully known at compile time, initialization includes it. The reason
+ * this isn't done for all regexes is that the optimizer was written under
+ * the assumption that locale was all-or-nothing. Given the complexity and
+ * lack of documentation in the optimizer, and that there are inadequate
+ * test cases for locale, many parts of it may not work properly, it is
+ * safest to avoid locale unless necessary. */
if (RExC_contains_locale) {
ANYOF_POSIXL_SETALL(ssc);
}
ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
}
+#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+STATIC bool
+S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
+{
+ /* The synthetic start class is used to hopefully quickly winnow down
+ * places where a pattern could start a match in the target string. If it
+ * doesn't really narrow things down that much, there isn't much point to
+ * having the overhead of using it. This function uses some very crude
+ * heuristics to decide if to use the ssc or not.
+ *
+ * It returns TRUE if 'ssc' rules out more than half what it considers to
+ * be the "likely" possible matches, but of course it doesn't know what the
+ * actual things being matched are going to be; these are only guesses
+ *
+ * For /l matches, it assumes that the only likely matches are going to be
+ * in the 0-255 range, uniformly distributed, so half of that is 127
+ * For /a and /d matches, it assumes that the likely matches will be just
+ * the ASCII range, so half of that is 63
+ * For /u and there isn't anything matching above the Latin1 range, it
+ * assumes that that is the only range likely to be matched, and uses
+ * half that as the cut-off: 127. If anything matches above Latin1,
+ * it assumes that all of Unicode could match (uniformly), except for
+ * non-Unicode code points and things in the General Category "Other"
+ * (unassigned, private use, surrogates, controls and formats). This
+ * is a much large number. */
+
+ const U32 max_match = (LOC)
+ ? 127
+ : (! UNI_SEMANTICS)
+ ? 63
+ : (invlist_highest(ssc->invlist) < 256)
+ ? 127
+ : ((NON_OTHER_COUNT + 1) / 2) - 1;
+ U32 count = 0; /* Running total of number of code points matched by
+ 'ssc' */
+ UV start, end; /* Start and end points of current range in inversion
+ list */
+
+ PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
+
+ invlist_iterinit(ssc->invlist);
+ while (invlist_iternext(ssc->invlist, &start, &end)) {
+
+ /* /u is the only thing that we expect to match above 255; so if not /u
+ * and even if there are matches above 255, ignore them. This catches
+ * things like \d under /d which does match the digits above 255, but
+ * since the pattern is /d, it is not likely to be expecting them */
+ if (! UNI_SEMANTICS) {
+ if (start > 255) {
+ break;
+ }
+ end = MIN(end, 255);
+ }
+ count += end - start + 1;
+ if (count > max_match) {
+ invlist_iterfinish(ssc->invlist);
+ return FALSE;
+ }
+ }
+
+ return TRUE;
+}
+
+
STATIC void
S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
{
May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
#endif
switch (flags) {
- case EXACT: break;
+ case EXACT: case EXACTL: break;
case EXACTFA:
case EXACTFU_SS:
- case EXACTFU: folder = PL_fold_latin1; break;
+ case EXACTFU:
+ case EXACTFLU8: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (flags == EXACT)
+ if (flags == EXACT || flags == EXACTL)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
StructCopy(source,op,struct regnode_charclass);
stclass = (regnode *)op;
}
- OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
+ OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
ARG_SET( stclass, data_slot );
aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
#define DEBUG_PEEP(str,scan,depth) \
DEBUG_OPTIMISE_r({if (scan){ \
- SV * const mysv=sv_newmortal(); \
regnode *Next = regnext(scan); \
- regprop(RExC_rx, mysv, scan, NULL); \
- PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
- (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
- Next ? (REG_NODE_NUM(Next)) : 0 ); \
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
+ PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
+ (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
+ Next ? (REG_NODE_NUM(Next)) : 0 ); \
+ DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
+ PerlIO_printf(Perl_debug_log, "\n"); \
}});
-
/* The below joins as many adjacent EXACTish nodes as possible into a single
* one. The regop may be changed if the node(s) contain certain sequences that
* require special handling. The joining is only done if:
* this final joining, sequences could have been split over boundaries, and
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
- if (OP(scan) != EXACT) {
+ if (OP(scan) != EXACT && OP(scan) != EXACTL) {
U8* s0 = (U8*) STRING(scan);
U8* s = s0;
U8* s_end = s0 + STR_LEN(scan);
Newx(and_withp,1, regnode_ssc); \
SAVEFREEPV(and_withp)
-/* this is a chain of data about sub patterns we are processing that
- need to be handled separately/specially in study_chunk. Its so
- we can simulate recursion without losing state. */
-struct scan_frame;
-typedef struct scan_frame {
- regnode *last; /* last node to process in this frame */
- regnode *next; /* next node to process when last is reached */
- struct scan_frame *prev; /*previous frame*/
- U32 prev_recursed_depth;
- I32 stop; /* what stopparen do we use */
-} scan_frame;
+
+static void
+S_unwind_scan_frames(pTHX_ const void *p)
+{
+ scan_frame *f= (scan_frame *)p;
+ do {
+ scan_frame *n= f->next_frame;
+ Safefree(f);
+ f= n;
+ } while (f);
+}
STATIC SSize_t
PERL_ARGS_ASSERT_STUDY_CHUNK;
-#ifdef DEBUGGING
- StructCopy(&zero_scan_data, &data_fake, scan_data_t);
-#endif
+
if ( depth == 0 ) {
while (first_non_open && OP(first_non_open) == OPEN)
first_non_open=regnext(first_non_open);
DEBUG_r(
RExC_study_chunk_recursed_count++;
);
+ DEBUG_OPTIMISE_MORE_r(
+ {
+ PerlIO_printf(Perl_debug_log,
+ "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
+ (int)(depth*2), "", (long)stopparen,
+ (unsigned long)RExC_study_chunk_recursed_count,
+ (unsigned long)depth, (unsigned long)recursed_depth,
+ scan,
+ last);
+ if (recursed_depth) {
+ U32 i;
+ U32 j;
+ for ( j = 0 ; j < recursed_depth ; j++ ) {
+ for ( i = 0 ; i < (U32)RExC_npar ; i++ ) {
+ if (
+ PAREN_TEST(RExC_study_chunk_recursed +
+ ( j * RExC_study_chunk_recursed_bytes), i )
+ && (
+ !j ||
+ !PAREN_TEST(RExC_study_chunk_recursed +
+ (( j - 1 ) * RExC_study_chunk_recursed_bytes), i)
+ )
+ ) {
+ PerlIO_printf(Perl_debug_log," %d",(int)i);
+ break;
+ }
+ }
+ if ( j + 1 < recursed_depth ) {
+ PerlIO_printf(Perl_debug_log, ",");
+ }
+ }
+ }
+ PerlIO_printf(Perl_debug_log,"\n");
+ }
+ );
while ( scan && OP(scan) != END && scan < last ){
UV min_subtract = 0; /* How mmany chars to subtract from the minimum
node length to get a real minimum (because
the folded version may be shorter) */
bool unfolded_multi_char = FALSE;
/* Peephole optimizer: */
- DEBUG_OPTIMISE_MORE_r(
- {
- PerlIO_printf(Perl_debug_log,
- "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu ",
- ((int) depth*2), "", (long)stopparen,
- (unsigned long)RExC_study_chunk_recursed_count,
- (unsigned long)depth, (unsigned long)recursed_depth);
- if (recursed_depth) {
- U32 i;
- U32 j;
- for ( j = 0 ; j < recursed_depth ; j++ ) {
- PerlIO_printf(Perl_debug_log,"[");
- for ( i = 0 ; i < (U32)RExC_npar ; i++ )
- PerlIO_printf(Perl_debug_log,"%d",
- PAREN_TEST(RExC_study_chunk_recursed +
- (j * RExC_study_chunk_recursed_bytes), i)
- ? 1 : 0
- );
- PerlIO_printf(Perl_debug_log,"]");
- }
- }
- PerlIO_printf(Perl_debug_log,"\n");
- }
- );
DEBUG_STUDYDATA("Peep:", data, depth);
DEBUG_PEEP("Peep", scan, depth);
NEXT_OFF(scan) = off;
}
-
-
/* The principal pseudo-switch. Cannot be a switch, since we
look into several different things. */
- if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
- || OP(scan) == IFTHEN) {
+ if ( OP(scan) == DEFINEP ) {
+ SSize_t minlen = 0;
+ SSize_t deltanext = 0;
+ SSize_t fake_last_close = 0;
+ I32 f = SCF_IN_DEFINE;
+
+ StructCopy(&zero_scan_data, &data_fake, scan_data_t);
+ scan = regnext(scan);
+ assert( OP(scan) == IFTHEN );
+ DEBUG_PEEP("expect IFTHEN", scan, depth);
+
+ data_fake.last_closep= &fake_last_close;
+ minlen = *minlenp;
+ next = regnext(scan);
+ scan = NEXTOPER(NEXTOPER(scan));
+ DEBUG_PEEP("scan", scan, depth);
+ DEBUG_PEEP("next", next, depth);
+
+ /* we suppose the run is continuous, last=next...
+ * NOTE we dont use the return here! */
+ (void)study_chunk(pRExC_state, &scan, &minlen,
+ &deltanext, next, &data_fake, stopparen,
+ recursed_depth, NULL, f, depth+1);
+
+ scan = next;
+ } else
+ if (
+ OP(scan) == BRANCH ||
+ OP(scan) == BRANCHJ ||
+ OP(scan) == IFTHEN
+ ) {
next = regnext(scan);
code = OP(scan);
- /* demq: the op(next)==code check is to see if we have
- * "branch-branch" AFAICT */
+ /* The op(next)==code check below is to see if we
+ * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
+ * IFTHEN is special as it might not appear in pairs.
+ * Not sure whether BRANCH-BRANCHJ is possible, regardless
+ * we dont handle it cleanly. */
if (OP(next) == code || code == IFTHEN) {
/* NOTE - There is similar code to this block below for
* handling TRIE nodes on a re-study. If you change stuff here
I32 f = 0;
regnode_ssc this_class;
+ DEBUG_PEEP("Branch", scan, depth);
+
num++;
- data_fake.flags = 0;
+ StructCopy(&zero_scan_data, &data_fake, scan_data_t);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
data_fake.pos_delta = delta;
next = regnext(scan);
- scan = NEXTOPER(scan);
- if (code != BRANCH)
+
+ scan = NEXTOPER(scan); /* everything */
+ if (code != BRANCH) /* everything but BRANCH */
scan = NEXTOPER(scan);
+
if (flags & SCF_DO_STCLASS) {
ssc_init(pRExC_state, &this_class);
data_fake.start_class = &this_class;
minnext = study_chunk(pRExC_state, &scan, minlenp,
&deltanext, next, &data_fake, stopparen,
recursed_depth, NULL, f,depth+1);
+
if (min1 > minnext)
min1 = minnext;
if (deltanext == SSize_t_MAX) {
U8 trietype = 0;
U32 count=0;
-#ifdef DEBUGGING
- SV * const mysv = sv_newmortal(); /* for dumping */
-#endif
/* var tail is used because there may be a TAIL
regop in the way. Ie, the exacts will point to the
thing following the TAIL, but the last branch will
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, tail, NULL);
+ regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
(int)depth * 2 + 2, "",
"Looking for TRIE'able sequences. Tail node is: ",
- SvPV_nolen_const( mysv )
+ SvPV_nolen_const( RExC_mysv )
);
});
EXACTFU | EXACTFU
EXACTFU_SS | EXACTFU
EXACTFA | EXACTFA
+ EXACTL | EXACTL
+ EXACTFLU8 | EXACTFLU8
*/
-#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
- ( EXACT == (X) ) ? EXACT : \
- ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
- ( EXACTFA == (X) ) ? EXACTFA : \
- 0 )
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
+ ? NOTHING \
+ : ( EXACT == (X) ) \
+ ? EXACT \
+ : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \
+ ? EXACTFU \
+ : ( EXACTFA == (X) ) \
+ ? EXACTFA \
+ : ( EXACTL == (X) ) \
+ ? EXACTL \
+ : ( EXACTFLU8 == (X) ) \
+ ? EXACTFLU8 \
+ : 0 )
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
#endif
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur, NULL);
+ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
- (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
+ (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
- regprop(RExC_rx, mysv, noper, NULL);
+ regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log, " -> %s",
- SvPV_nolen_const(mysv));
+ SvPV_nolen_const(RExC_mysv));
if ( noper_next ) {
- regprop(RExC_rx, mysv, noper_next, NULL);
+ regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log,"\t=> %s\t",
- SvPV_nolen_const(mysv));
+ SvPV_nolen_const(RExC_mysv));
}
PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
} /* end handle unmergable node */
} /* loop over branches */
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur, NULL);
+ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <SCAN FINISHED>\n",
(int)depth * 2 + 2,
- "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
+ "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
});
if ( last && trietype ) {
* something like this: (?:|) So we can
* turn it into a plain NOTHING op. */
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur, NULL);
+ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
- "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
+ "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur));
});
OP(startbranch)= NOTHING;
scan = NEXTOPER(scan);
continue;
} else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
- scan_frame *newframe = NULL;
- I32 paren;
- regnode *start;
- regnode *end;
+ I32 paren = 0;
+ regnode *start = NULL;
+ regnode *end = NULL;
U32 my_recursed_depth= recursed_depth;
+
if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */
- /* set the pointer */
+ /* Do setup, note this code has side effects beyond
+ * the rest of this block. Specifically setting
+ * RExC_recurse[] must happen at least once during
+ * study_chunk(). */
if (OP(scan) == GOSUB) {
paren = ARG(scan);
RExC_recurse[ARG2L(scan)] = scan;
start = RExC_open_parens[paren-1];
end = RExC_close_parens[paren-1];
} else {
- paren = 0;
start = RExC_rxi->program + 1;
end = RExC_opend;
}
- /* this code is intended to handle expanding regex "subs" so
- * we can apply various optimizations. For instance with
- * /(?(DEFINE)(?<foo>foo)(?<bar>bar))(?&foo)(?&bar)/ we
- * want to recognize that the mandatory substr is going to be
- * "foobar".
- * However if we are not in SCF_DO_SUBSTR mode then there is
- * no point in doing this, and it can cause a serious slowdown.
- * See RT #122283.
- * Note also that this was a workaround for the core problem
- * which was that during compilation logic the excessive
- * recursion resulted in slowly consuming all the memory on
- * the box. Exactly what causes this is unclear. It does not
- * appear to be directly related to allocating the "visited"
- * bitmaps that is RExC_study_chunk_recursed.
- *
- * In reality study_chunk() does far far too much, and probably
- * this an other issues would go away if we split it into
- * multiple components.
- *
- * - Yves
- * */
- if (flags & SCF_DO_SUBSTR) {
+ /* NOTE we MUST always execute the above code, even
+ * if we do nothing with a GOSUB/GOSTART */
+ if (
+ ( flags & SCF_IN_DEFINE )
+ ||
+ (
+ (is_inf_internal || is_inf || data->flags & SF_IS_INF)
+ &&
+ ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
+ )
+ ) {
+ /* no need to do anything here if we are in a define. */
+ /* or we are after some kind of infinite construct
+ * so we can skip recursing into this item.
+ * Since it is infinite we will not change the maxlen
+ * or delta, and if we miss something that might raise
+ * the minlen it will merely pessimise a little.
+ *
+ * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
+ * might result in a minlen of 1 and not of 4,
+ * but this doesn't make us mismatch, just try a bit
+ * harder than we should.
+ * */
+ scan= regnext(scan);
+ continue;
+ }
+
if (
!recursed_depth
||
DEBUG_STUDYDATA("set:", data,depth);
PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren);
my_recursed_depth= recursed_depth + 1;
- Newx(newframe,1,scan_frame);
} else {
DEBUG_STUDYDATA("inf:", data,depth);
/* some form of infinite recursion, assume infinite length
if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
ssc_anything(data->start_class);
flags &= ~SCF_DO_STCLASS;
+
+ start= NULL; /* reset start so we dont recurse later on. */
}
- }
} else {
- Newx(newframe,1,scan_frame);
paren = stopparen;
- start = scan+2;
+ start = scan + 2;
end = regnext(scan);
}
- if (newframe) {
- assert(start);
+ if (start) {
+ scan_frame *newframe;
assert(end);
- SAVEFREEPV(newframe);
- newframe->next = regnext(scan);
- newframe->last = last;
- newframe->stop = stopparen;
- newframe->prev = frame;
+ if (!RExC_frame_last) {
+ Newxz(newframe, 1, scan_frame);
+ SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
+ RExC_frame_head= newframe;
+ RExC_frame_count++;
+ } else if (!RExC_frame_last->next_frame) {
+ Newxz(newframe,1,scan_frame);
+ RExC_frame_last->next_frame= newframe;
+ newframe->prev_frame= RExC_frame_last;
+ RExC_frame_count++;
+ } else {
+ newframe= RExC_frame_last->next_frame;
+ }
+ RExC_frame_last= newframe;
+
+ newframe->next_regnode = regnext(scan);
+ newframe->last_regnode = last;
+ newframe->stopparen = stopparen;
newframe->prev_recursed_depth = recursed_depth;
+ newframe->this_prev_frame= frame;
DEBUG_STUDYDATA("frame-new:",data,depth);
DEBUG_PEEP("fnew", scan, depth);
continue;
}
}
- else if (OP(scan) == EXACT) {
+ else if (OP(scan) == EXACT || OP(scan) == EXACTL) {
SSize_t l = STR_LEN(scan);
UV uc;
if (UTF) {
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
- EXACTFish */
+ else if (PL_regkind[OP(scan)] == EXACT) {
+ /* But OP != EXACT!, so is EXACTFish */
SSize_t l = STR_LEN(scan);
- UV uc = *((U8*)STRING(scan));
- SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
- separate code points */
const U8 * s = (U8*)STRING(scan);
/* Search for fixed substrings supports EXACT only. */
scan_commit(pRExC_state, data, minlenp, is_inf);
}
if (UTF) {
- uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
if (unfolded_multi_char) {
}
}
- if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
- ssc_clear_locale(data->start_class);
- }
+ if (flags & SCF_DO_STCLASS) {
+ SV* EXACTF_invlist = _make_exactf_invlist(pRExC_state, scan);
- if (! UTF) {
-
- /* We punt and assume can match anything if the node begins
- * with a multi-character fold. Things are complicated. For
- * example, /ffi/i could match any of:
- * "\N{LATIN SMALL LIGATURE FFI}"
- * "\N{LATIN SMALL LIGATURE FF}I"
- * "F\N{LATIN SMALL LIGATURE FI}"
- * plus several other things; and making sure we have all the
- * possibilities is hard. */
- if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
- EXACTF_invlist =
- _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
+ assert(EXACTF_invlist);
+ if (flags & SCF_DO_STCLASS_AND) {
+ if (OP(scan) != EXACTFL)
+ ssc_clear_locale(data->start_class);
+ ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
+ ANYOF_POSIXL_ZERO(data->start_class);
+ ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
}
- else {
-
- /* Any Latin1 range character can potentially match any
- * other depending on the locale */
- if (OP(scan) == EXACTFL) {
- _invlist_union(EXACTF_invlist, PL_Latin1,
- &EXACTF_invlist);
- }
- else {
- /* But otherwise, it matches at least itself. We can
- * quickly tell if it has a distinct fold, and if so,
- * it matches that as well */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (IS_IN_SOME_FOLD_L1(uc)) {
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
- PL_fold_latin1[uc]);
- }
- }
+ else { /* SCF_DO_STCLASS_OR */
+ ssc_union(data->start_class, EXACTF_invlist, FALSE);
+ ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
- /* Some characters match above-Latin1 ones under /i. This
- * is true of EXACTFL ones when the locale is UTF-8 */
- if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
- && (! isASCII(uc) || (OP(scan) != EXACTFA
- && OP(scan) != EXACTFA_NO_TRIE)))
- {
- add_above_Latin1_folds(pRExC_state,
- (U8) uc,
- &EXACTF_invlist);
- }
+ /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
+ ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
}
+ flags &= ~SCF_DO_STCLASS;
+ SvREFCNT_dec(EXACTF_invlist);
}
- else { /* Pattern is UTF-8 */
- U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
- STRLEN foldlen = UTF8SKIP(s);
- const U8* e = s + STR_LEN(scan);
- SV** listp;
-
- /* The only code points that aren't folded in a UTF EXACTFish
- * node are are the problematic ones in EXACTFL nodes */
- if (OP(scan) == EXACTFL
- && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
- {
- /* We need to check for the possibility that this EXACTFL
- * node begins with a multi-char fold. Therefore we fold
- * the first few characters of it so that we can make that
- * check */
- U8 *d = folded;
- int i;
-
- for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
- if (isASCII(*s)) {
- *(d++) = (U8) toFOLD(*s);
- s++;
- }
- else {
- STRLEN len;
- to_utf8_fold(s, d, &len);
- d += len;
- s += UTF8SKIP(s);
- }
- }
-
- /* And set up so the code below that looks in this folded
- * buffer instead of the node's string */
- e = d;
- foldlen = UTF8SKIP(folded);
- s = folded;
- }
-
- /* When we reach here 's' points to the fold of the first
- * character(s) of the node; and 'e' points to far enough along
- * the folded string to be just past any possible multi-char
- * fold. 'foldlen' is the length in bytes of the first
- * character in 's'
- *
- * Unlike the non-UTF-8 case, the macro for determining if a
- * string is a multi-char fold requires all the characters to
- * already be folded. This is because of all the complications
- * if not. Note that they are folded anyway, except in EXACTFL
- * nodes. Like the non-UTF case above, we punt if the node
- * begins with a multi-char fold */
-
- if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
- EXACTF_invlist =
- _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
- }
- else { /* Single char fold */
-
- /* It matches all the things that fold to it, which are
- * found in PL_utf8_foldclosures (including itself) */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (! PL_utf8_foldclosures) {
- _load_PL_utf8_foldclosures();
- }
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) s, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_tindex(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- assert(c_p);
-
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non- */
- if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
- && isASCII(c) != isASCII(uc))
- {
- continue;
- }
-
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
- }
- }
- }
- }
- if (flags & SCF_DO_STCLASS_AND) {
- ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
- ANYOF_POSIXL_ZERO(data->start_class);
- ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
- }
- else if (flags & SCF_DO_STCLASS_OR) {
- ssc_union(data->start_class, EXACTF_invlist, FALSE);
- ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
-
- /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
- ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
- }
- flags &= ~SCF_DO_STCLASS;
- SvREFCNT_dec(EXACTF_invlist);
}
else if (REGNODE_VARIES(OP(scan))) {
SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
case PLUS:
if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
next = NEXTOPER(scan);
- if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
+ if (OP(next) == EXACT
+ || OP(next) == EXACTL
+ || (flags & SCF_DO_STCLASS))
+ {
mincount = 1;
maxcount = REG_INFTY;
next = regnext(scan);
{
/* Fatal warnings may leak the regexp without this: */
SAVEFREESV(RExC_rx_sv);
- ckWARNreg(RExC_parse,
- "Quantifier unexpected on zero-length expression");
+ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
+ "Quantifier unexpected on zero-length expression "
+ "in regex m/%"UTF8f"/",
+ UTF8fARG(UTF, RExC_end - RExC_precomp,
+ RExC_precomp));
(void)ReREFCNT_inc(RExC_rx_sv);
}
} else {
/* start offset must point into the last copy */
data->last_start_min += minnext * (mincount - 1);
- data->last_start_max += is_inf ? SSize_t_MAX
- : (maxcount - 1) * (minnext + data->pos_delta);
+ data->last_start_max =
+ is_inf
+ ? SSize_t_MAX
+ : data->last_start_max +
+ (maxcount - 1) * (minnext + data->pos_delta);
}
}
/* It is counted once already... */
flags &= ~SCF_DO_STCLASS;
}
min++;
- delta++; /* Because of the 2 char string cr-lf */
+ if (delta != SSize_t_MAX)
+ delta++; /* Because of the 2 char string cr-lf */
if (flags & SCF_DO_SUBSTR) {
/* Cannot expect anything... */
scan_commit(pRExC_state, data, minlenp, is_inf);
min++;
if (flags & SCF_DO_STCLASS) {
bool invert = 0;
- SV* my_invlist = sv_2mortal(_new_invlist(0));
+ SV* my_invlist = NULL;
U8 namedclass;
/* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
}
break;
+ case ANYOFL:
case ANYOF:
if (flags & SCF_DO_STCLASS_AND)
ssc_and(pRExC_state, data->start_class,
/* FALLTHROUGH */
case POSIXA:
if (FLAGS(scan) == _CC_ASCII) {
- my_invlist = PL_XPosix_ptrs[_CC_ASCII];
+ my_invlist = invlist_clone(PL_XPosix_ptrs[_CC_ASCII]);
}
else {
_invlist_intersection(PL_XPosix_ptrs[FLAGS(scan)],
assert(flags & SCF_DO_STCLASS_OR);
ssc_union(data->start_class, my_invlist, invert);
}
+ SvREFCNT_dec(my_invlist);
}
if (flags & SCF_DO_STCLASS_OR)
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
{
- if ( OP(scan) == UNLESSM &&
- scan->flags == 0 &&
- OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
- OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
- ) {
- regnode *opt;
- regnode *upto= regnext(scan);
- DEBUG_PARSE_r({
- SV * const mysv_val=sv_newmortal();
- DEBUG_STUDYDATA("OPFAIL",data,depth);
-
- /*DEBUG_PARSE_MSG("opfail");*/
- regprop(RExC_rx, mysv_val, upto, NULL);
- PerlIO_printf(Perl_debug_log,
- "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
- SvPV_nolen_const(mysv_val),
- (IV)REG_NODE_NUM(upto),
- (IV)(upto - scan)
- );
- });
- OP(scan) = OPFAIL;
- NEXT_OFF(scan) = upto - scan;
- for (opt= scan + 1; opt < upto ; opt++)
- OP(opt) = OPTIMIZED;
- scan= upto;
- continue;
- }
if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
|| OP(scan) == UNLESSM )
{
regnode_ssc intrnl;
int f = 0;
- data_fake.flags = 0;
+ StructCopy(&zero_scan_data, &data_fake, scan_data_t);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
SSize_t deltanext=0, minnext=0, f = 0, fake;
regnode_ssc this_class;
- data_fake.flags = 0;
+ StructCopy(&zero_scan_data, &data_fake, scan_data_t);
if (data) {
data_fake.whilem_c = data->whilem_c;
data_fake.last_closep = data->last_closep;
data->longest = &(data->longest_float);
}
min += min1;
- delta += max1 - min1;
+ if (delta != SSize_t_MAX)
+ delta += max1 - min1;
if (flags & SCF_DO_STCLASS_OR) {
ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
if (min1) {
}
*/
if (frame) {
+ depth = depth - 1;
+
DEBUG_STUDYDATA("frame-end:",data,depth);
DEBUG_PEEP("fend", scan, depth);
+
/* restore previous context */
- last = frame->last;
- scan = frame->next;
- stopparen = frame->stop;
+ last = frame->last_regnode;
+ scan = frame->next_regnode;
+ stopparen = frame->stopparen;
recursed_depth = frame->prev_recursed_depth;
- depth = depth - 1;
- frame = frame->prev;
+ RExC_frame_last = frame->prev_frame;
+ frame = frame->this_prev_frame;
goto fake_study_recurse;
}
{
SSize_t final_minlen= min < stopmin ? min : stopmin;
- if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) {
- RExC_maxlen = final_minlen + delta;
+ if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
+ if (final_minlen > SSize_t_MAX - delta)
+ RExC_maxlen = SSize_t_MAX;
+ else if (RExC_maxlen < final_minlen + delta)
+ RExC_maxlen = final_minlen + delta;
}
return final_minlen;
}
- /* not-reached */
+ NOT_REACHED;
}
STATIC U32
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
- oplist = OP_SIBLING(oplist);
+ oplist = OpSIBLING(oplist);
}
if (SvRMAGICAL(av)) {
pRExC_state->code_blocks[n].src_regex = NULL;
n++;
code = 1;
- oplist = OP_SIBLING(oplist); /* skip CONST */
+ oplist = OpSIBLING(oplist); /* skip CONST */
assert(oplist);
}
- oplist = OP_SIBLING(oplist);;
+ oplist = OpSIBLING(oplist);;
}
/* apply magic and QR overloading to arg */
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
+ for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
if (ncode) {
if (expr->op_type == OP_CONST)
n = 1;
else
- for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
if (o->op_type == OP_CONST)
n++;
}
if (expr->op_type == OP_CONST)
new_patternp[n] = cSVOPx_sv(expr);
else
- for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) {
if (o->op_type == OP_CONST)
new_patternp[n++] = cSVOPo_sv;
}
assert( expr->op_type == OP_PUSHMARK
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
|| expr->op_type == OP_PADRANGE);
- expr = OP_SIBLING(expr);
+ expr = OpSIBLING(expr);
}
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
RExC_uni_semantics = 0;
RExC_contains_locale = 0;
RExC_contains_i = 0;
+ RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT);
pRExC_state->runtime_code_qr = NULL;
+ RExC_frame_head= NULL;
+ RExC_frame_last= NULL;
+ RExC_frame_count= 0;
+ DEBUG_r({
+ RExC_mysv1= sv_newmortal();
+ RExC_mysv2= sv_newmortal();
+ });
DEBUG_COMPILE_r({
SV *dsv= sv_newmortal();
RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60);
RExC_recurse_count = 0;
pRExC_state->code_index = 0;
-#if 0 /* REGC() is (currently) a NOP at the first pass.
- * Clever compilers notice this and complain. --jhi */
- REGC((U8)REG_MAGIC, (char*)RExC_emit);
-#endif
DEBUG_PARSE_r(
PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
RExC_lastnum=0;
== REG_RUN_ON_COMMENT_SEEN);
U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
>> RXf_PMf_STD_PMMOD_SHIFT);
- const char *fptr = STD_PAT_MODS; /*"msix"*/
+ const char *fptr = STD_PAT_MODS; /*"msixn"*/
char *p;
/* Allocate for the worst case, which is all the std flags are turned
* on. If more precision is desired, we could do a population count of
RExC_emit_bound = ri->program + RExC_size + 1;
pRExC_state->code_index = 0;
- REGC((U8)REG_MAGIC, (char*) RExC_emit++);
+ *((char*) RExC_emit++) = (char) REG_MAGIC;
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags);
SAVEFREEPV(RExC_recurse);
}
-reStudy:
+ reStudy:
r->minlen = minlen = sawlookahead = sawplus = sawopen = sawminmod = 0;
DEBUG_r(
RExC_study_chunk_recursed_count= 0;
);
Zero(r->substrs, 1, struct reg_substr_data);
- if (RExC_study_chunk_recursed)
+ if (RExC_study_chunk_recursed) {
Zero(RExC_study_chunk_recursed,
RExC_study_chunk_recursed_bytes * RExC_npar, U8);
+ }
+
#ifdef TRIE_STUDY_OPT
if (!restudied) {
if (UTF)
SvUTF8_on(rx); /* Unicode in it? */
ri->regstclass = NULL;
- if (RExC_naughty >= 10) /* Probably an expensive pattern. */
+ if (RExC_naughty >= TOO_NAUGHTY) /* Probably an expensive pattern. */
r->intflags |= PREGf_NAUGHTY;
scan = ri->program + 1; /* First BRANCH. */
DEBUG_PEEP("first:",first,0);
/* Ignore EXACT as we deal with it later. */
if (PL_regkind[OP(first)] == EXACT) {
- if (OP(first) == EXACT)
+ if (OP(first) == EXACT || OP(first) == EXACTL)
NOOP; /* Empty, get anchored substr later. */
else
ri->regstclass = first;
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& ! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && !ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class, NULL);
+ regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
= r->float_substr = r->float_utf8 = NULL;
if (! (ANYOF_FLAGS(data.start_class) & SSC_MATCHES_EMPTY_STRING)
- && ! ssc_is_anything(data.start_class))
+ && is_ssc_worth_it(pRExC_state, data.start_class))
{
const U32 n = add_data(pRExC_state, STR_WITH_LEN("f"));
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class, NULL);
+ regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
&& OP(regnext(first)) == END)
r->extflags |= RXf_WHITE;
else if ( r->extflags & RXf_SPLIT
- && fop == EXACT
+ && (fop == EXACT || fop == EXACTL)
&& STR_LEN(first) == 1
&& *(STRING(first)) == ' '
&& OP(regnext(first)) == END )
/* assume we don't need to swap parens around before we match */
DEBUG_TEST_r({
PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n",
- RExC_study_chunk_recursed_count);
+ (unsigned long)RExC_study_chunk_recursed_count);
});
DEBUG_DUMP_r({
DEBUG_RExC_seen();
Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
(unsigned long) flags);
}
- assert(0); /* NOT REACHED */
+ NOT_REACHED; /* NOT REACHED */
}
return NULL;
}
#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
- int rem=(int)(RExC_end - RExC_parse); \
- int cut; \
int num; \
- int iscut=0; \
- if (rem>10) { \
- rem=10; \
- iscut=1; \
- } \
- cut=10-rem; \
- if (RExC_lastparse!=RExC_parse) \
- PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
- rem, RExC_parse, \
- cut + 4, \
- iscut ? "..." : "<" \
+ if (RExC_lastparse!=RExC_parse) { \
+ PerlIO_printf(Perl_debug_log, "%s", \
+ Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \
+ RExC_end - RExC_parse, 16, \
+ "", "", \
+ PERL_PV_ESCAPE_UNI_DETECT | \
+ PERL_PV_PRETTY_ELLIPSES | \
+ PERL_PV_PRETTY_LTGT | \
+ PERL_PV_ESCAPE_RE | \
+ PERL_PV_PRETTY_EXACTSIZE \
+ ) \
); \
- else \
+ } else \
PerlIO_printf(Perl_debug_log,"%16s",""); \
\
if (SIZE_ONLY) \
assert(SvLEN(invlist) == 0 || SvCUR(invlist) <= SvLEN(invlist));
}
+#ifndef PERL_IN_XSUB_RE
+
PERL_STATIC_INLINE IV*
S_get_invlist_previous_index_addr(SV* invlist)
{
*get_invlist_previous_index_addr(invlist) = index;
}
+PERL_STATIC_INLINE void
+S_invlist_trim(SV* const invlist)
+{
+ PERL_ARGS_ASSERT_INVLIST_TRIM;
+
+ assert(SvTYPE(invlist) == SVt_INVLIST);
+
+ /* Change the length of the inversion list to how many entries it currently
+ * has */
+ SvPV_shrink_to_cur((SV *) invlist);
+}
+
+PERL_STATIC_INLINE bool
+S_invlist_is_iterating(SV* const invlist)
+{
+ PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
+
+ return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
+}
+
+#endif /* ifndef PERL_IN_XSUB_RE */
+
PERL_STATIC_INLINE UV
S_invlist_max(SV* const invlist)
{
SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max + 1));
}
-PERL_STATIC_INLINE void
-S_invlist_trim(SV* const invlist)
-{
- PERL_ARGS_ASSERT_INVLIST_TRIM;
-
- assert(SvTYPE(invlist) == SVt_INVLIST);
-
- /* Change the length of the inversion list to how many entries it currently
- * has */
- SvPV_shrink_to_cur((SV *) invlist);
-}
-
STATIC void
S__append_range_to_invlist(pTHX_ SV* const invlist,
const UV start, const UV end)
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
* a new list, in which case the passed in one has been destroyed. The
- * passed in inversion list can be NULL, in which case a new one is created
+ * passed-in inversion list can be NULL, in which case a new one is created
* with just the one range in it */
SV* range_invlist;
return TRUE;
}
-PERL_STATIC_INLINE bool
-S_invlist_is_iterating(SV* const invlist)
-{
- PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
-
- return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX;
-}
-
PERL_STATIC_INLINE UV
S_invlist_highest(SV* const invlist)
{
}
#endif
+/*
+ * As best we can, determine the characters that can match the start of
+ * the given EXACTF-ish node.
+ *
+ * Returns the invlist as a new SV*; it is the caller's responsibility to
+ * call SvREFCNT_dec() when done with it.
+ */
+STATIC SV*
+S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
+{
+ const U8 * s = (U8*)STRING(node);
+ SSize_t bytelen = STR_LEN(node);
+ UV uc;
+ /* Start out big enough for 2 separate code points */
+ SV* invlist = _new_invlist(4);
+
+ PERL_ARGS_ASSERT__MAKE_EXACTF_INVLIST;
+
+ if (! UTF) {
+ uc = *s;
+
+ /* We punt and assume can match anything if the node begins
+ * with a multi-character fold. Things are complicated. For
+ * example, /ffi/i could match any of:
+ * "\N{LATIN SMALL LIGATURE FFI}"
+ * "\N{LATIN SMALL LIGATURE FF}I"
+ * "F\N{LATIN SMALL LIGATURE FI}"
+ * plus several other things; and making sure we have all the
+ * possibilities is hard. */
+ if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
+ invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
+ }
+ else {
+ /* Any Latin1 range character can potentially match any
+ * other depending on the locale */
+ if (OP(node) == EXACTFL) {
+ _invlist_union(invlist, PL_Latin1, &invlist);
+ }
+ else {
+ /* But otherwise, it matches at least itself. We can
+ * quickly tell if it has a distinct fold, and if so,
+ * it matches that as well */
+ invlist = add_cp_to_invlist(invlist, uc);
+ if (IS_IN_SOME_FOLD_L1(uc))
+ invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
+ }
+
+ /* Some characters match above-Latin1 ones under /i. This
+ * is true of EXACTFL ones when the locale is UTF-8 */
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
+ && (! isASCII(uc) || (OP(node) != EXACTFA
+ && OP(node) != EXACTFA_NO_TRIE)))
+ {
+ add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
+ }
+ }
+ }
+ else { /* Pattern is UTF-8 */
+ U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ STRLEN foldlen = UTF8SKIP(s);
+ const U8* e = s + bytelen;
+ SV** listp;
+
+ uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
+
+ /* The only code points that aren't folded in a UTF EXACTFish
+ * node are are the problematic ones in EXACTFL nodes */
+ if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
+ /* We need to check for the possibility that this EXACTFL
+ * node begins with a multi-char fold. Therefore we fold
+ * the first few characters of it so that we can make that
+ * check */
+ U8 *d = folded;
+ int i;
+
+ for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+ if (isASCII(*s)) {
+ *(d++) = (U8) toFOLD(*s);
+ s++;
+ }
+ else {
+ STRLEN len;
+ to_utf8_fold(s, d, &len);
+ d += len;
+ s += UTF8SKIP(s);
+ }
+ }
+
+ /* And set up so the code below that looks in this folded
+ * buffer instead of the node's string */
+ e = d;
+ foldlen = UTF8SKIP(folded);
+ s = folded;
+ }
+
+ /* When we reach here 's' points to the fold of the first
+ * character(s) of the node; and 'e' points to far enough along
+ * the folded string to be just past any possible multi-char
+ * fold. 'foldlen' is the length in bytes of the first
+ * character in 's'
+ *
+ * Unlike the non-UTF-8 case, the macro for determining if a
+ * string is a multi-char fold requires all the characters to
+ * already be folded. This is because of all the complications
+ * if not. Note that they are folded anyway, except in EXACTFL
+ * nodes. Like the non-UTF case above, we punt if the node
+ * begins with a multi-char fold */
+
+ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+ invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
+ }
+ else { /* Single char fold */
+
+ /* It matches all the things that fold to it, which are
+ * found in PL_utf8_foldclosures (including itself) */
+ invlist = add_cp_to_invlist(invlist, uc);
+ if (! PL_utf8_foldclosures)
+ _load_PL_utf8_foldclosures();
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) s, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_tindex(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ assert(c_p);
+
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ((OP(node) == EXACTFA || OP(node) == EXACTFA_NO_TRIE)
+ && isASCII(c) != isASCII(uc))
+ {
+ continue;
+ }
+
+ invlist = add_cp_to_invlist(invlist, c);
+ }
+ }
+ }
+ }
+
+ return invlist;
+}
+
#undef HEADER_LENGTH
#undef TO_INTERNAL_SIZE
#undef FROM_INTERNAL_SIZE
regex_charset cs;
bool has_use_defaults = FALSE;
const char* const seqstart = RExC_parse - 1; /* Point to the '?' */
+ int x_mod_count = 0;
PERL_ARGS_ASSERT_PARSE_LPAREN_QUESTION_FLAGS;
and must be globally applied -- japhy */
switch (*RExC_parse) {
- /* Code for the imsx flags */
- CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
+ /* Code for the imsxn flags */
+ CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count);
case LOCALE_PAT_MOD:
if (has_charset_modifier) {
else {
vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
}
- /*NOTREACHED*/
+ NOT_REACHED; /*NOTREACHED*/
neg_modifier:
RExC_parse++;
vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"",
*(RExC_parse - 1));
- /*NOTREACHED*/
+ NOT_REACHED; /*NOTREACHED*/
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
if (PASS2 && ckWARN(WARN_REGEXP)) {
if (RExC_flags & RXf_PMf_FOLD) {
RExC_contains_i = 1;
}
+ if (PASS2) {
+ STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+ }
return;
/*NOTREACHED*/
default:
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
- /*NOTREACHED*/
+ NOT_REACHED; /*NOTREACHED*/
}
++RExC_parse;
}
+
+ if (PASS2) {
+ STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count);
+ }
}
/*
/* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */
vFAIL3("Sequence (%.*s...) not recognized",
RExC_parse-seqstart, seqstart);
- /*NOTREACHED*/
+ NOT_REACHED; /*NOTREACHED*/
case '<': /* (?<...) */
if (*RExC_parse == '!')
paren = ',';
break;
case '!': /* (?!...) */
RExC_seen_zerolen++;
+ /* check if we're really just a "FAIL" assertion */
+ --RExC_parse;
+ nextchar(pRExC_state);
if (*RExC_parse == ')') {
ret=reg_node(pRExC_state, OPFAIL);
nextchar(pRExC_state);
if (RExC_parse == RExC_end || *RExC_parse != ')')
vFAIL("Sequence (?&... not terminated");
goto gen_recurse_regop;
- assert(0); /* NOT REACHED */
+ /* NOT REACHED */
case '+':
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
RExC_parse++;
}
RExC_recurse_count++;
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
- "Recurse #%"UVuf" to %"IVdf"\n",
+ "%*s%*s Recurse #%"UVuf" to %"IVdf"\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
(UV)ARG(ret), (IV)ARG2L(ret)));
}
RExC_seen |= REG_RECURSE_SEEN;
nextchar(pRExC_state);
return ret;
- assert(0); /* NOT REACHED */
+ /* NOT REACHED */
case '?': /* (??...) */
is_logical = 1;
vFAIL2utf8f(
"Sequence (%"UTF8f"...) not recognized",
UTF8fARG(UTF, RExC_parse-seqstart, seqstart));
- /*NOTREACHED*/
+ NOT_REACHED; /*NOTREACHED*/
}
*flagp |= POSTPONED;
paren = *RExC_parse++;
if (is_logical) {
regnode *eval;
ret = reg_node(pRExC_state, LOGICAL);
- eval = reganode(pRExC_state, EVAL, n);
+
+ eval = reg2Lanode(pRExC_state, EVAL,
+ n,
+
+ /* for later propagation into (??{})
+ * return value */
+ RExC_flags & RXf_PMf_COMPILETIME
+ );
if (!SIZE_ONLY) {
ret->flags = 2;
- /* for later propagation into (??{}) return value */
- eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
}
REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
return ret;
}
- ret = reganode(pRExC_state, EVAL, n);
+ ret = reg2Lanode(pRExC_state, EVAL, n, 0);
Set_Node_Length(ret, RExC_parse - parse_start + 1);
Set_Node_Offset(ret, parse_start);
return ret;
case '(': /* (?(?{...})...) and (?(?=...)...) */
{
int is_define= 0;
+ const int DEFINE_len = sizeof("DEFINE") - 1;
if (RExC_parse[0] == '?') { /* (?(?...)) */
if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
|| RExC_parse[1] == '<'
ret = reganode(pRExC_state,NGROUPP,num);
goto insert_if_check_paren;
}
- else if (RExC_parse[0] == 'D' &&
- RExC_parse[1] == 'E' &&
- RExC_parse[2] == 'F' &&
- RExC_parse[3] == 'I' &&
- RExC_parse[4] == 'N' &&
- RExC_parse[5] == 'E')
- {
+ else if (RExC_end - RExC_parse >= DEFINE_len
+ && strnEQ(RExC_parse, "DEFINE", DEFINE_len))
+ {
ret = reganode(pRExC_state,DEFINEP,0);
- RExC_parse +=6 ;
+ RExC_parse += DEFINE_len;
is_define = 1;
goto insert_if_check_paren;
}
goto parse_rest;
} /* end switch */
}
- else { /* (...) */
+ else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */
capturing_parens:
parno = RExC_npar;
RExC_npar++;
&& !RExC_open_parens[parno-1])
{
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
- "Setting open paren #%"IVdf" to %d\n",
+ "%*s%*s Setting open paren #%"IVdf" to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "",
(IV)parno, REG_NODE_NUM(ret)));
RExC_open_parens[parno-1]= ret;
}
Set_Node_Length(ret, 1); /* MJD */
Set_Node_Offset(ret, RExC_parse); /* MJD */
is_open = 1;
+ } else {
+ ret = NULL;
}
}
else /* ! paren */
ender = reganode(pRExC_state, CLOSE, parno);
if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) {
DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
- "Setting close paren #%"IVdf" to %d\n",
- (IV)parno, REG_NODE_NUM(ender)));
+ "%*s%*s Setting close paren #%"IVdf" to %d\n",
+ 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender)));
RExC_close_parens[parno-1]= ender;
if (RExC_nestroot == parno)
RExC_nestroot = 0;
break;
}
DEBUG_PARSE_r(if (!SIZE_ONLY) {
- SV * const mysv_val1=sv_newmortal();
- SV * const mysv_val2=sv_newmortal();
DEBUG_PARSE_MSG("lsbr");
- regprop(RExC_rx, mysv_val1, lastbr, NULL);
- regprop(RExC_rx, mysv_val2, ender, NULL);
+ regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state);
+ regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
- SvPV_nolen_const(mysv_val1),
+ SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(lastbr),
- SvPV_nolen_const(mysv_val2),
+ SvPV_nolen_const(RExC_mysv2),
(IV)REG_NODE_NUM(ender),
(IV)(ender - lastbr)
);
if (is_nothing) {
br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
DEBUG_PARSE_r(if (!SIZE_ONLY) {
- SV * const mysv_val1=sv_newmortal();
- SV * const mysv_val2=sv_newmortal();
DEBUG_PARSE_MSG("NADA");
- regprop(RExC_rx, mysv_val1, ret, NULL);
- regprop(RExC_rx, mysv_val2, ender, NULL);
+ regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state);
+ regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
- SvPV_nolen_const(mysv_val1),
+ SvPV_nolen_const(RExC_mysv1),
(IV)REG_NODE_NUM(ret),
- SvPV_nolen_const(mysv_val2),
+ SvPV_nolen_const(RExC_mysv2),
(IV)REG_NODE_NUM(ender),
(IV)(ender - ret)
);
}
else
FAIL("Junk on end of regexp"); /* "Can't happen". */
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
if (RExC_in_lookbehind) {
if (chain == NULL) /* First piece. */
*flagp |= flags&SPSTART;
else {
- RExC_naughty++;
+ /* FIXME adding one for every branch after the first is probably
+ * excessive now we have TRIE support. (hv) */
+ MARK_NAUGHTY(1);
REGTAIL(pRExC_state, chain, latest);
}
chain = latest;
do_curly:
if ((flags&SIMPLE)) {
- RExC_naughty += 2 + RExC_naughty / 2;
+ MARK_NAUGHTY_EXP(2, 2);
reginsert(pRExC_state, CURLY, ret, depth+1);
Set_Node_Offset(ret, parse_start+1); /* MJD */
Set_Node_Cur_Length(ret, parse_start);
REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
if (SIZE_ONLY)
RExC_whilem_seen++, RExC_extralen += 3;
- RExC_naughty += 4 + RExC_naughty; /* compound interest */
+ MARK_NAUGHTY_EXP(1, 4); /* compound interest */
}
ret->flags = 0;
if (op == '*' && (flags&SIMPLE)) {
reginsert(pRExC_state, STAR, ret, depth+1);
ret->flags = 0;
- RExC_naughty += 4;
+ MARK_NAUGHTY(4);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
}
else if (op == '*') {
else if (op == '+' && (flags&SIMPLE)) {
reginsert(pRExC_state, PLUS, ret, depth+1);
ret->flags = 0;
- RExC_naughty += 3;
+ MARK_NAUGHTY(3);
RExC_seen |= REG_UNBOUNDED_QUANTIFIER_SEEN;
}
else if (op == '+') {
<substitute_parse> on success.
If <valuep> is non-null, it means the caller can accept an input sequence
- consisting of a just a single code point; <*valuep> is set to the value
- of the only or first code point in the input.
+ consisting of just a single code point; <*valuep> is set to the value of the
+ only or first code point in the input.
If <substitute_parse> is non-null, it means the caller can accept an input
sequence consisting of one or more code points; <*substitute_parse> is a
nextchar(pRExC_state);
*node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
Set_Node_Length(*node_p, 1); /* MJD */
return 1;
}
RExC_parse++; /* Skip past the '{' */
- if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
+ if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
|| ! (endbrace == RExC_parse /* nothing between the {} */
- || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below
- */
- && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg)
- */
+ || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */
+ && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better
+ error msg) */
{
if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
vFAIL("\\N{NAME} must be resolved by the lexer");
}
+ RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
+
if (endbrace == RExC_parse) { /* empty: \N{} */
if (node_p) {
*node_p = reg_node(pRExC_state,NOTHING);
return 0;
}
- RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
endchar = RExC_parse + strcspn(RExC_parse, ".}");
has_multiple_chars = (endchar < endbrace);
/* We get the first code point if we want it, and either there is only one,
- * or we can accept both cases of one and more than one */
+ * or we can accept both cases of one and there is more than one */
if (valuep && (substitute_parse || ! has_multiple_chars)) {
STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
}
{
-
/* What is done here is to convert this to a sub-pattern of the form
* \x{char1}\x{char2}...
* and then either return it in <*substitute_parse> if non-null; or
PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
if (! FOLD) {
- return EXACT;
+ return (LOC)
+ ? EXACTL
+ : EXACT;
}
op = get_regex_charset(RExC_flags);
if (! len_passed_in) {
if (UTF) {
- if (UNI_IS_INVARIANT(code_point)) {
+ if (UVCHR_IS_INVARIANT(code_point)) {
if (LOC || ! FOLD) { /* /l defers folding until runtime */
*character = (U8) code_point;
}
for those. */
&& ! _invlist_contains_cp(PL_utf8_foldable, code_point))
{
- OP(node) = EXACT;
+ OP(node) = (LOC)
+ ? EXACTL
+ : EXACT;
}
}
else if (code_point <= MAX_UTF8_TWO_BYTE) {
PERL_ARGS_ASSERT_REGATOM;
-tryagain:
+ tryagain:
switch ((U8)*RExC_parse) {
case '^':
RExC_seen_zerolen++;
else
ret = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
Set_Node_Length(ret, 1); /* MJD */
break;
case '[':
FALSE, /* means parse the whole char class */
TRUE, /* allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
+ (bool) RExC_strict,
NULL);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
arg = ANYOF_WORDCHAR;
goto join_posix;
- case 'b':
+ case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = BOUND + get_regex_charset(RExC_flags);
- if (op > BOUNDA) { /* /aa is same as /a */
- op = BOUNDA;
+ op = NBOUND + get_regex_charset(RExC_flags);
+ if (op > NBOUNDA) { /* /aa is same as /a */
+ op = NBOUNDA;
}
- else if (op == BOUNDL) {
+ else if (op == NBOUNDL) {
RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
- FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
+ vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
}
goto finish_meta_pat;
- case 'B':
+
+ case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
- op = NBOUND + get_regex_charset(RExC_flags);
- if (op > NBOUNDA) { /* /aa is same as /a */
- op = NBOUNDA;
+ op = BOUND + get_regex_charset(RExC_flags);
+ if (op > BOUNDA) { /* /aa is same as /a */
+ op = BOUNDA;
}
- else if (op == NBOUNDL) {
+ else if (op == BOUNDL) {
RExC_contains_locale = 1;
}
ret = reg_node(pRExC_state, op);
- FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if ((U8) *(RExC_parse + 1) == '{') {
/* diag_listed_as: Use "%s" instead of "%s" */
- vFAIL("Use \"\\B\\{\" instead of \"\\B{\"");
+ vFAIL("Use \"\\b\\{\" instead of \"\\b{\"");
}
goto finish_meta_pat;
FALSE, /* don't silence non-portable warnings.
It would be a bug if these returned
non-portables */
+ (bool) RExC_strict,
NULL);
/* regclass() can only return RESTART_UTF8 if multi-char folds
are allowed. */
&result,
&error_msg,
PASS2, /* out warnings */
- FALSE, /* not strict */
+ (bool) RExC_strict,
TRUE, /* Output warnings
for non-
portables */
vFAIL(error_msg);
}
ender = result;
- if (PL_encoding && ender < 0x100) {
+ if (IN_ENCODING && ender < 0x100) {
goto recode_encoding;
}
if (ender > 0xff) {
&result,
&error_msg,
PASS2, /* out warnings */
- FALSE, /* not strict */
- TRUE, /* Output warnings
+ (bool) RExC_strict,
+ TRUE, /* Silence warnings
for non-
portables */
UTF);
}
ender = result;
- if (PL_encoding && ender < 0x100) {
+ if (IN_ENCODING && ender < 0x100) {
goto recode_encoding;
}
if (ender > 0xff) {
* from \1 - \9 is a backreference, any multi-digit
* escape which does not start with 0 and which when
* evaluated as decimal could refer to an already
- * parsed capture buffer is a backslash. Anything else
- * is octal.
+ * parsed capture buffer is a back reference. Anything
+ * else is octal.
*
* Note this implies that \118 could be interpreted as
* 118 OR as "\11" . "8" depending on whether there
form_short_octal_warning(p, numlen));
}
}
- if (PL_encoding && ender < 0x100)
+ if (IN_ENCODING && ender < 0x100)
goto recode_encoding;
break;
recode_encoding:
if (! RExC_override_recoding) {
- SV* enc = PL_encoding;
+ SV* enc = _get_encoding();
ender = reg_recode((const char)(U8)ender, &enc);
if (!enc && PASS2)
ckWARNreg(p, "Invalid escape in the specified encoding");
goto loopdone;
}
- if (! FOLD /* The simple case, just append the literal */
- || (LOC /* Also don't fold for tricky chars under /l */
- && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)))
- {
- if (UTF) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
+ if (! FOLD) { /* The simple case, just append the literal */
- /* The loop increments <len> each time, as all but this
- * path (and one other) through it add a single byte to
- * the EXACTish node. But this one has changed len to
- * be the correct final value, so subtract one to
- * cancel out the increment that follows */
- len--;
- }
- else {
- REGC((char)ender, s++);
- }
-
- /* Can get here if folding only if is one of the /l
- * characters whose fold depends on the locale. The
- * occurrence of any of these indicate that we can't
- * simplify things */
- if (FOLD) {
- maybe_exact = FALSE;
- maybe_exactfu = FALSE;
+ /* In the sizing pass, we need only the size of the
+ * character we are appending, hence we can delay getting
+ * its representation until PASS2. */
+ if (SIZE_ONLY) {
+ if (UTF) {
+ const STRLEN unilen = UNISKIP(ender);
+ s += unilen;
+
+ /* We have to subtract 1 just below (and again in
+ * the corresponding PASS2 code) because the loop
+ * increments <len> each time, as all but this path
+ * (and one other) through it add a single byte to
+ * the EXACTish node. But these paths would change
+ * len to be the correct final value, so cancel out
+ * the increment that follows */
+ len += unilen - 1;
+ }
+ else {
+ s++;
+ }
+ } else { /* PASS2 */
+ not_fold_common:
+ if (UTF) {
+ U8 * new_s = uvchr_to_utf8((U8*)s, ender);
+ len += (char *) new_s - s - 1;
+ s = (char *) new_s;
+ }
+ else {
+ *(s++) = (char) ender;
+ }
}
}
- else /* FOLD */
- if (! ( UTF
+ else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) {
+
+ /* Here are folding under /l, and the code point is
+ * problematic. First, we know we can't simplify things */
+ maybe_exact = FALSE;
+ maybe_exactfu = FALSE;
+
+ /* A problematic code point in this context means that its
+ * fold isn't known until runtime, so we can't fold it now.
+ * (The non-problematic code points are the above-Latin1
+ * ones that fold to also all above-Latin1. Their folds
+ * don't vary no matter what the locale is.) But here we
+ * have characters whose fold depends on the locale.
+ * Unlike the non-folding case above, we have to keep track
+ * of these in the sizing pass, so that we can make sure we
+ * don't split too-long nodes in the middle of a potential
+ * multi-char fold. And unlike the regular fold case
+ * handled in the else clauses below, we don't actually
+ * fold and don't have special cases to consider. What we
+ * do for both passes is the PASS2 code for non-folding */
+ goto not_fold_common;
+ }
+ else /* A regular FOLD code point */
+ if (! ( UTF
/* See comments for join_exact() as to why we fold this
* non-UTF at compile time */
|| (node_type == EXACTFU
/* Here, are folding and are not UTF-8 encoded; therefore
* the character must be in the range 0-255, and is not /l
* (Not /l because we already handled these under /l in
- * is_PROBLEMATIC_LOCALE_FOLD_cp */
+ * is_PROBLEMATIC_LOCALE_FOLD_cp) */
if (IS_IN_SOME_FOLD_L1(ender)) {
maybe_exact = FALSE;
* unfolded, and we have to calculate how many EXACTish
* nodes it will take; and we may run out of room in a node
* in the middle of a potential multi-char fold, and have
- * to back off accordingly. (Hence we can't use REGC for
- * the simple case just below.) */
+ * to back off accordingly. */
UV folded;
- if (isASCII(ender)) {
+ if (isASCII_uni(ender)) {
folded = toFOLD(ender);
*(s)++ = (U8) folded;
}
* differently depending on UTF8ness of the target string
* (for /u), or depending on locale for /l */
if (maybe_exact) {
- OP(ret) = EXACT;
+ OP(ret) = (LOC)
+ ? EXACTL
+ : EXACT;
}
else if (maybe_exactfu) {
- OP(ret) = EXACTFU;
+ OP(ret) = (LOC)
+ ? EXACTFLU8
+ : EXACTFU;
}
}
alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
posix class */
FALSE, /* don't allow multi-char folds */
TRUE, /* silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
TRUE, /* means parse just the next thing */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
/* regclass() will return with parsing just the \ sequence,
only if not a posix class */
FALSE, /* don't allow multi-char folds */
FALSE, /* don't silence non-portable warnings. */
- ¤t))
+ TRUE, /* strict */
+ ¤t
+ ))
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"",
(UV) *flagp);
/* function call leaves parse pointing to the ']', except if we
TRUE, /* silence non-portable warnings. The above may very
well have generated non-portable code points, but
they're valid on this machine */
- NULL);
+ FALSE, /* similarly, no need for strict */
+ NULL
+ );
if (!node)
FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf,
PTR2UV(flagp));
const bool silence_non_portable, /* Don't output warnings
about too large
characters */
- SV** ret_invlist) /* Return an inversion list, not a node */
+ const bool strict,
+ SV** ret_invlist /* Return an inversion list, not a node */
+ )
{
/* parse a bracketed class specification. Most of these will produce an
* ANYOF node; but something like [a] will produce an EXACT node; [aA], an
separate for a while from the non-complemented
versions because of complications with /d
matching */
+ SV* simple_posixes = NULL; /* But under some conditions, the classes can be
+ treated more simply than the general case,
+ leading to less compilation and execution
+ work */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
AV * multi_char_matches = NULL; /* Code points that fold to more than one
char * stop_ptr = RExC_end; /* where to stop parsing */
const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white
space? */
- const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */
/* Unicode properties are stored in a swash; this holds the current one
* being parsed. If this swash is the only above-latin1 component of the
* runtime locale is UTF-8 */
SV* only_utf8_locale_list = NULL;
-#ifdef EBCDIC
- /* In a range, counts how many 0-2 of the ends of it came from literals,
- * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
- UV literal_endpoint = 0;
-#endif
+ /* In a range, if one of the endpoints is non-character-set portable,
+ * meaning that it hard-codes a code point that may mean a different
+ * charactger in ASCII vs. EBCDIC, as opposed to, say, a literal 'A' or a
+ * mnemonic '\t' which each mean the same character no matter which
+ * character set the platform is on. */
+ unsigned int non_portable_endpoint = 0;
+
+ /* Is the range unicode? which means on a platform that isn't 1-1 native
+ * to Unicode (i.e. non-ASCII), each code point in it should be considered
+ * to be a Unicode value. */
+ bool unicode_range = FALSE;
bool invert = FALSE; /* Is this class to be complemented */
bool warn_super = ALWAYS_WARN_SUPER;
DEBUG_PARSE("clas");
/* Assume we are going to generate an ANYOF node. */
- ret = reganode(pRExC_state, ANYOF, 0);
+ ret = reganode(pRExC_state,
+ (LOC)
+ ? ANYOFL
+ : ANYOF,
+ 0);
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
RExC_parse++;
invert = TRUE;
allow_multi_folds = FALSE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
FALSE /* means don't recognize comments */ );
if (!range) {
rangebegin = RExC_parse;
element_count++;
+ non_portable_endpoint = 0;
}
if (UTF) {
value = utf8n_to_uvchr((U8*)RExC_parse,
{
namedclass = regpposixcc(pRExC_state, value, strict);
}
- else if (value != '\\') {
-#ifdef EBCDIC
- literal_endpoint++;
-#endif
- }
- else {
+ else if (value == '\\') {
/* Is a backslash; get the code point of the char after it */
- if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
+ if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
prevvalue = save_prevvalue;
continue; /* Back to top of loop to get next char */
}
+
/* Here, is a single code point, and <value> contains it */
+ unicode_range = TRUE; /* \N{} are Unicode */
}
break;
case 'p':
vFAIL(error_msg);
}
}
- if (PL_encoding && value < 0x100) {
+ non_portable_endpoint++;
+ if (IN_ENCODING && value < 0x100) {
goto recode_encoding;
}
break;
vFAIL(error_msg);
}
}
- if (PL_encoding && value < 0x100)
+ non_portable_endpoint++;
+ if (IN_ENCODING && value < 0x100)
goto recode_encoding;
break;
case 'c':
value = grok_bslash_c(*RExC_parse++, PASS2);
+ non_portable_endpoint++;
break;
case '0': case '1': case '2': case '3': case '4':
case '5': case '6': case '7':
(void)ReREFCNT_inc(RExC_rx_sv);
}
}
- if (PL_encoding && value < 0x100)
+ non_portable_endpoint++;
+ if (IN_ENCODING && value < 0x100)
goto recode_encoding;
break;
}
recode_encoding:
if (! RExC_override_recoding) {
- SV* enc = PL_encoding;
+ SV* enc = _get_encoding();
value = reg_recode((const char)(U8)value, &enc);
if (!enc) {
if (strict) {
&cp_list);
}
}
- else { /* Garden variety class. If is NASCII, NDIGIT, ...
+ else if (UNI_SEMANTICS
+ || classnum == _CC_ASCII
+ || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT
+ || classnum == _CC_XDIGIT)))
+ {
+ /* We usually have to worry about /d and /a affecting what
+ * POSIX classes match, with special code needed for /d
+ * because we won't know until runtime what all matches.
+ * But there is no extra work needed under /u, and
+ * [:ascii:] is unaffected by /a and /d; and :digit: and
+ * :xdigit: don't have runtime differences under /d. So we
+ * can special case these, and avoid some extra work below,
+ * and at runtime. */
+ _invlist_union_maybe_complement_2nd(
+ simple_posixes,
+ PL_XPosix_ptrs[classnum],
+ namedclass % 2 != 0,
+ &simple_posixes);
+ }
+ else { /* Garden variety class. If is NUPPER, NALPHA, ...
complement and use nposixes */
SV** posixes_ptr = namedclass % 2 == 0
? &posixes
: &nposixes;
- SV** source_ptr = &PL_XPosix_ptrs[classnum];
_invlist_union_maybe_complement_2nd(
*posixes_ptr,
- *source_ptr,
+ PL_XPosix_ptrs[classnum],
namedclass % 2 != 0,
posixes_ptr);
}
* minus sign */
if (range) {
+#ifdef EBCDIC
+ /* For unicode ranges, we have to test that the Unicode as opposed
+ * to the native values are not decreasing. (Above 255, there is
+ * no difference between native and Unicode) */
+ if (unicode_range && prevvalue < 255 && value < 255) {
+ if (NATIVE_TO_LATIN1(prevvalue) > NATIVE_TO_LATIN1(value)) {
+ goto backwards_range;
+ }
+ }
+ else
+#endif
if (prevvalue > value) /* b-a */ {
- const int w = RExC_parse - rangebegin;
+ int w;
+#ifdef EBCDIC
+ backwards_range:
+#endif
+ w = RExC_parse - rangebegin;
vFAIL2utf8f(
"Invalid [] range \"%"UTF8f"\"",
UTF8fARG(UTF, w, rangebegin));
- range = 0; /* not a valid range */
+ NOT_REACHED; /* NOT REACHED */
}
}
else {
continue;
}
- /* Here, we have a single value, and <prevvalue> is the beginning of
- * the range, if any; or <value> if not */
+ /* Here, we have a single value this time through the loop, and
+ * <prevvalue> is the beginning of the range, if any; or <value> if
+ * not. */
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
}
}
+ if (strict && PASS2 && ckWARN(WARN_REGEXP)) {
+ if (range) {
+
+ /* If the range starts above 255, everything is portable and
+ * likely to be so for any forseeable character set, so don't
+ * warn. */
+ if (unicode_range && non_portable_endpoint && prevvalue < 256) {
+ vWARN(RExC_parse, "Both or neither range ends should be Unicode");
+ }
+ else if (prevvalue != value) {
+
+ /* Under strict, ranges that stop and/or end in an ASCII
+ * printable should have each end point be a portable value
+ * for it (preferably like 'A', but we don't warn if it is
+ * a (portable) Unicode name or code point), and the range
+ * must be be all digits or all letters of the same case.
+ * Otherwise, the range is non-portable and unclear as to
+ * what it contains */
+ if ((isPRINT_A(prevvalue) || isPRINT_A(value))
+ && (non_portable_endpoint
+ || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value))
+ || (isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue) && isUPPER_A(value)))))
+ {
+ vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\"");
+ }
+ else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */
+
+ /* But the nature of Unicode and languages mean we
+ * can't do the same checks for above-ASCII ranges,
+ * except in the case of digit ones. These should
+ * contain only digits from the same group of 10. The
+ * ASCII case is handled just above. 0x660 is the
+ * first digit character beyond ASCII. Hence here, the
+ * range could be a range of digits. Find out. */
+ IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+ prevvalue);
+ IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT],
+ value);
+
+ /* If the range start and final points are in the same
+ * inversion list element, it means that either both
+ * are not digits, or both are digits in a consecutive
+ * sequence of digits. (So far, Unicode has kept all
+ * such sequences as distinct groups of 10, but assert
+ * to make sure). If the end points are not in the
+ * same element, neither should be a digit. */
+ if (index_start == index_final) {
+ assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start)
+ || invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1]
+ - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start]
+ == 10);
+ }
+ else if ((index_start >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_start))
+ || (index_final >= 0
+ && ELEMENT_RANGE_MATCHES_INVLIST(index_final)))
+ {
+ vWARN(RExC_parse, "Ranges of digits should be from the same group of 10");
+ }
+ }
+ }
+ }
+ if ((! range || prevvalue == value) && non_portable_endpoint) {
+ if (isPRINT_A(value)) {
+ char literal[3];
+ unsigned d = 0;
+ if (isBACKSLASHED_PUNCT(value)) {
+ literal[d++] = '\\';
+ }
+ literal[d++] = (char) value;
+ literal[d++] = '\0';
+
+ vWARN4(RExC_parse,
+ "\"%.*s\" is more clearly written simply as \"%s\"",
+ (int) (RExC_parse - rangebegin),
+ rangebegin,
+ literal
+ );
+ }
+ else if isMNEMONIC_CNTRL(value) {
+ vWARN4(RExC_parse,
+ "\"%.*s\" is more clearly written simply as \"%s\"",
+ (int) (RExC_parse - rangebegin),
+ rangebegin,
+ cntrl_to_mnemonic((char) value)
+ );
+ }
+ }
+ }
+
/* Deal with this element of the class */
if (! SIZE_ONLY) {
+
#ifndef EBCDIC
cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
prevvalue, value);
#else
- SV* this_range = _new_invlist(1);
- _append_range_to_invlist(this_range, prevvalue, value);
-
- /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
- * If this range was specified using something like 'i-j', we want
- * to include only the 'i' and the 'j', and not anything in
- * between, so exclude non-ASCII, non-alphabetics from it.
- * However, if the range was specified with something like
- * [\x89-\x91] or [\x89-j], all code points within it should be
- * included. literal_endpoint==2 means both ends of the range used
- * a literal character, not \x{foo} */
- if (literal_endpoint == 2
- && ((prevvalue >= 'a' && value <= 'z')
- || (prevvalue >= 'A' && value <= 'Z')))
+ /* On non-ASCII platforms, for ranges that span all of 0..255, and
+ * ones that don't require special handling, we can just add the
+ * range like we do for ASCII platforms */
+ if ((UNLIKELY(prevvalue == 0) && value >= 255)
+ || ! (prevvalue < 256
+ && (unicode_range
+ || (! non_portable_endpoint
+ && ((isLOWER_A(prevvalue) && isLOWER_A(value))
+ || (isUPPER_A(prevvalue)
+ && isUPPER_A(value)))))))
{
- _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
- &this_range);
-
- /* Since this above only contains ascii, the intersection of it
- * with anything will still yield only ascii */
- _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ALPHA],
- &this_range);
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ prevvalue, value);
+ }
+ else {
+ /* Here, requires special handling. This can be because it is
+ * a range whose code points are considered to be Unicode, and
+ * so must be individually translated into native, or because
+ * its a subrange of 'A-Z' or 'a-z' which each aren't
+ * contiguous in EBCDIC, but we have defined them to include
+ * only the "expected" upper or lower case ASCII alphabetics.
+ * Subranges above 255 are the same in native and Unicode, so
+ * can be added as a range */
+ U8 start = NATIVE_TO_LATIN1(prevvalue);
+ unsigned j;
+ U8 end = (value < 256) ? NATIVE_TO_LATIN1(value) : 255;
+ for (j = start; j <= end; j++) {
+ cp_foldable_list = add_cp_to_invlist(cp_foldable_list, LATIN1_TO_NATIVE(j));
+ }
+ if (value > 255) {
+ cp_foldable_list = _add_range_to_invlist(cp_foldable_list,
+ 256, value);
+ }
}
- _invlist_union(cp_foldable_list, this_range, &cp_foldable_list);
- literal_endpoint = 0;
#endif
}
if (! LOC && value == '\n') {
op = REG_ANY; /* Optimize [^\n] */
*flagp |= HASWIDTH|SIMPLE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
}
}
else if (value < 256 || UTF) {
op = POSIXA;
}
}
- else if (prevvalue == 'A') {
- if (value == 'Z'
+ else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) {
+ /* We can optimize A-Z or a-z, but not if they could match
+ * something like the KELVIN SIGN under /i (/a means they
+ * can't) */
+ if (prevvalue == 'A') {
+ if (value == 'Z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && ! non_portable_end_point
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
}
- }
- else if (prevvalue == 'a') {
- if (value == 'z'
+ else if (prevvalue == 'a') {
+ if (value == 'z'
#ifdef EBCDIC
- && literal_endpoint == 2
+ && ! non_portable_end_point
#endif
- ) {
- arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
- op = POSIXA;
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
}
}
}
SvREFCNT_dec(posixes);
SvREFCNT_dec(nposixes);
+ SvREFCNT_dec(simple_posixes);
SvREFCNT_dec(cp_list);
SvREFCNT_dec(cp_foldable_list);
return ret;
* classes. The lists are kept separate up to now because we don't want to
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
+ if (simple_posixes) {
+ _invlist_union(cp_list, simple_posixes, &cp_list);
+ SvREFCNT_dec_NN(simple_posixes);
+ }
if (posixes || nposixes) {
if (posixes && AT_LEAST_ASCII_RESTRICTED) {
/* Under /a and /aa, nothing above ASCII matches these */
value = start;
if (! FOLD) {
- op = EXACT;
+ op = (LOC)
+ ? EXACTL
+ : EXACT;
}
else if (LOC) {
if (end == UV_MAX) {
op = SANY;
*flagp |= HASWIDTH|SIMPLE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
}
else if (end == '\n' - 1
&& invlist_iternext(cp_list, &start, &end)
{
op = REG_ANY;
*flagp |= HASWIDTH|SIMPLE;
- RExC_naughty++;
+ MARK_NAUGHTY(1);
}
}
invlist_iterfinish(cp_list);
PERL_ARGS_ASSERT_REGNODE_GUTS;
+ assert(extra_size >= regarglen[op]);
+
if (SIZE_ONLY) {
SIZE_ALIGN(RExC_size);
RExC_size += 1 + extra_size;
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- regnode * const ret = regnode_guts(pRExC_state, op, 0, "reg_node");
+ regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg_node");
PERL_ARGS_ASSERT_REG_NODE;
+ assert(regarglen[op] == 0);
+
if (PASS2) {
regnode *ptr = ret;
FILL_ADVANCE_NODE(ptr, op);
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- regnode * const ret = regnode_guts(pRExC_state, op, 1, "reganode");
+ regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reganode");
PERL_ARGS_ASSERT_REGANODE;
+ assert(regarglen[op] == 1);
+
if (PASS2) {
regnode *ptr = ret;
FILL_ADVANCE_NODE_ARG(ptr, op, arg);
{
/* emit a node with U32 and I32 arguments */
- regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "rega2Lnode");
+ regnode * const ret = regnode_guts(pRExC_state, op, regarglen[op], "reg2Lanode");
PERL_ARGS_ASSERT_REG2LANODE;
}
/*
-- reguni - emit (if appropriate) a Unicode character
-*/
-PERL_STATIC_INLINE STRLEN
-S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
-{
- PERL_ARGS_ASSERT_REGUNI;
-
- return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
-}
-
-/*
- reginsert - insert an operator in front of already-emitted operand
*
* Means relocating the operand.
for (;;) {
regnode * const temp = regnext(scan);
DEBUG_PARSE_r({
- SV * const mysv=sv_newmortal();
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
- regprop(RExC_rx, mysv, scan, NULL);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
- SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
+ SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan),
(temp == NULL ? "->" : ""),
(temp == NULL ? PL_reg_name[OP(val)] : "")
);
if ( exact ) {
switch (OP(scan)) {
case EXACT:
+ case EXACTL:
case EXACTF:
case EXACTFA_NO_TRIE:
case EXACTFA:
case EXACTFU:
+ case EXACTFLU8:
case EXACTFU_SS:
case EXACTFL:
if( exact == PSEUDO )
}
}
DEBUG_PARSE_r({
- SV * const mysv=sv_newmortal();
DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
- regprop(RExC_rx, mysv, scan, NULL);
+ regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
- SvPV_nolen_const(mysv),
+ SvPV_nolen_const(RExC_mysv),
REG_NODE_NUM(scan),
PL_reg_name[exact]);
});
scan = temp;
}
DEBUG_PARSE_r({
- SV * const mysv_val=sv_newmortal();
DEBUG_PARSE_MSG("");
- regprop(RExC_rx, mysv_val, val, NULL);
+ regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state);
PerlIO_printf(Perl_debug_log,
"~ attach to %s (%"IVdf") offset to %"IVdf"\n",
- SvPV_nolen_const(mysv_val),
+ SvPV_nolen_const(RExC_mysv),
(IV)REG_NODE_NUM(val),
(IV)(val - scan)
);
PerlIO_printf(Perl_debug_log, ") ");
if (ri->regstclass) {
- regprop(r, sv, ri->regstclass, NULL);
+ regprop(r, sv, ri->regstclass, NULL, NULL);
PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
}
if (r->intflags & PREGf_ANCH) {
*/
void
-Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
+Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state)
{
#ifdef DEBUGGING
int k;
PERL_ARGS_ASSERT_REGPROP;
- sv_setpvs(sv, "");
+ sv_setpvn(sv, "", 0);
if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
/* It would be nice to FAIL() here, but this may be called from
else if (k == REF || k == OPEN || k == CLOSE
|| k == GROUPP || OP(o)==ACCEPT)
{
+ AV *name_list= NULL;
Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+ if (name_list) {
if ( k != REF || (OP(o) < NREF)) {
- AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
- SV **name= av_fetch(list, ARG(o), 0 );
+ SV **name= av_fetch(name_list, ARG(o), 0 );
if (name)
Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
}
else {
- AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
I32 *nums=(I32*)SvPVX(sv_dat);
- SV **name= av_fetch(list, nums[0], 0 );
+ SV **name= av_fetch(name_list, nums[0], 0 );
I32 n;
if (name) {
for ( n=0; n<SvIVX(sv_dat); n++ ) {
PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
}
}
- } else if (k == GOSUB)
+ } else if (k == GOSUB) {
+ AV *name_list= NULL;
+ if ( RXp_PAREN_NAMES(prog) ) {
+ name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
+ } else if ( pRExC_state ) {
+ name_list= RExC_paren_name_list;
+ }
+
/* Paren and offset */
Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
+ if (name_list) {
+ SV **name= av_fetch(name_list, ARG(o), 0 );
+ if (name)
+ Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
+ }
+ }
else if (k == VERB) {
if (!o->flags)
Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
SV* bitmap_invlist; /* Will hold what the bit map contains */
- if (flags & ANYOF_LOCALE_FLAGS)
+ if (OP(o) == ANYOFL)
sv_catpvs(sv, "{loc}");
if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
sv_catpvs(sv, "{non-utf8-latin1-all}");
}
- /* output information about the unicode matching */
if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)
sv_catpvs(sv, "{above_bitmap_all}");
- else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
+
+ if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) {
SV *lv; /* Set if there is something outside the bit map. */
- bool byte_output = FALSE; /* If something in the bitmap has
- been output */
+ bool byte_output = FALSE; /* If something has been output */
SV *only_utf8_locale;
/* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
PERL_UNUSED_ARG(o);
PERL_UNUSED_ARG(prog);
PERL_UNUSED_ARG(reginfo);
+ PERL_UNUSED_ARG(pRExC_state);
#endif /* DEBUGGING */
}
DEBUG_COMPILE_r(
{
- const char * const s = SvPV_nolen_const(prog->check_substr
- ? prog->check_substr : prog->check_utf8);
+ const char * const s = SvPV_nolen_const(RX_UTF8(r)
+ ? prog->check_utf8 : prog->check_substr);
if (!PL_colorset) reginitcolors();
PerlIO_printf(Perl_debug_log,
"%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
PL_colors[4],
- prog->check_substr ? "" : "utf8 ",
+ RX_UTF8(r) ? "utf8 " : "",
PL_colors[5],PL_colors[0],
s,
PL_colors[1],
(strlen(s) > 60 ? "..." : ""));
} );
- return prog->check_substr ? prog->check_substr : prog->check_utf8;
+ /* use UTF8 check substring if regexp pattern itself is in UTF8 */
+ return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr;
}
/*
}
#ifdef DEBUGGING
-/* Certain characters are output as a sequence with the first being a
- * backslash. */
-#define isBACKSLASHED_PUNCT(c) \
- ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
STATIC void
S_put_code_point(pTHX_ SV *sv, UV c)
#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
-#ifndef MIN
-#define MIN(a,b) ((a) < (b) ? (a) : (b))
-#endif
-
STATIC void
S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
format = (this_end < 256)
? "\\x{%02"UVXf"}-\\x{%02"UVXf"}"
: "\\x{%04"UVXf"}-\\x{%04"UVXf"}";
+ GCC_DIAG_IGNORE(-Wformat-nonliteral);
Perl_sv_catpvf(aTHX_ sv, format, start, this_end);
+ GCC_DIAG_RESTORE;
break;
}
}
} else
CLEAR_OPTSTART;
- regprop(r, sv, node, NULL);
+ regprop(r, sv, node, NULL, NULL);
PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));