# include "charclass_invlists.h"
#endif
+#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+
#ifdef op
#undef op
#endif /* op */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
+ * character, and if utf8, must be invariant. Note that this is not the same
+ * thing as REGNODE_SIMPLE */
#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define SCF_SEEN_ACCEPT 0x8000
#define UTF cBOOL(RExC_utf8)
+
+/* The enums for all these are ordered so things work out correctly */
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
-#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
*(d++) = uv;
*/
-#define TRIE_STORE_REVCHAR \
+#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if (UTF) { \
- SV *zlopp = newSV(2); \
+ SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+ unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
av_push(revcharmap, zlopp); \
} else { \
- char ooooff = (char)uvc; \
+ char ooooff = (char)val; \
av_push(revcharmap, newSVpvn(&ooooff, 1)); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- if ( folder ) { \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = UTF8SKIP(uc);\
- uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
- foldlen -= UNISKIP( uvc ); \
- scan = foldbuf + UNISKIP( uvc ); \
- } \
- } else { \
- uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
- } \
- } else { \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need folding */ \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* if we use this folder we have to obey unicode rules on latin-1 data */ \
+ if ( foldlen > 0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ scan += len; \
+ len = 0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP(uvc); \
+ foldlen -= skiplen; \
+ scan = foldbuf + skiplen; \
+ } \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
switch (flags) {
case EXACT: break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
- default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+ default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (!(UTF && folder))
+ if (flags == EXACT)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
PerlIO_printf( Perl_debug_log,
"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
(int)depth * 2 + 2, "",
*/
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
STRLEN chars = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
- trie->minlen= 0;
- continue;
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ trie->minlen= STR_LEN(noper);
+ } else {
+ trie->minlen= 0;
+ continue;
+ }
}
- if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+
+ if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
-
+ if (OP( noper ) == EXACTFU_SS) {
+ /* false positives are ok, so just set this */
+ TRIE_BITMAP_SET(trie,0xDF);
+ }
+ }
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
chars++;
if ( uvc < 256 ) {
+ if ( folder ) {
+ U8 folded= folder[ (U8) uvc ];
+ if ( !trie->charmap[ folded ] ) {
+ trie->charmap[ folded ]=( ++trie->uniquecharcount );
+ TRIE_STORE_REVCHAR( folded );
+ }
+ }
if ( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
- if ( folder )
- trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR( uvc );
}
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie,uvc);
+ TRIE_BITMAP_SET(trie, uvc);
/* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
if ( !UTF ) {
/* store first byte of utf8 representation of
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR(uvc);
}
}
}
if( cur == first ) {
- trie->minlen=chars;
- trie->maxlen=chars;
+ trie->minlen = chars;
+ trie->maxlen = chars;
} else if (chars < trie->minlen) {
- trie->minlen=chars;
+ trie->minlen = chars;
} else if (chars > trie->maxlen) {
- trie->maxlen=chars;
+ trie->maxlen = chars;
+ }
+ if (OP( noper ) == EXACTFU_SS) {
+ /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+ if (trie->minlen > 1)
+ trie->minlen= 1;
+ }
+ if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+ /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
+ * - We assume that any such sequence might match a 2 byte string */
+ if (trie->minlen > 2 )
+ trie->minlen= 2;
}
} /* end first pass */
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
U8 *scan = (U8*)NULL; /* sanity init */
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
+
+ if (OP(noper) == NOTHING) {
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ }
+ }
if (OP(noper) != NOTHING) {
for ( ; uc < e ; uc += len ) {
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
+ STRLEN skiplen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ if (OP(noper) == NOTHING) {
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ }
+ }
+
if ( OP(noper) != NOTHING ) {
for ( ; uc < e ; uc += len ) {
* optimizer doesn't reject these possibilities based on size constraints.
* 2) These sequences are not currently correctly handled by the trie code
* either, so it changes the joined node type to ops that are not handled
- * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
* is. (I (khw) think it doesn't matter in regexec.c
* for UTF patterns, but no need to change it */
if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_NO_TRIE;
+ OP(scan) = EXACTFU_TRICKYFOLD;
}
s += 6; /* We already know what this sequence is. Skip
the rest of it */
regnode *first = (regnode *)NULL;
regnode *last = (regnode *)NULL;
regnode *tail = scan;
- U8 optype = 0;
+ U8 trietype = 0;
U32 count=0;
#ifdef DEBUGGING
}
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, tail );
PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
(int)depth * 2 + 2, "",
/*
- step through the branches, cur represents each
- branch, noper is the first thing to be matched
- as part of that branch and noper_next is the
- regnext() of that node. if noper is an EXACT
- and noper_next is the same as scan (our current
- position in the regex) then the EXACT branch is
- a possible optimization target. Once we have
- two or more consecutive such branches we can
- create a trie of the EXACT's contents and stich
- it in place. If the sequence represents all of
- the branches we eliminate the whole thing and
- replace it with a single TRIE. If it is a
- subsequence then we need to stitch it in. This
- means the first branch has to remain, and needs
- to be repointed at the item on the branch chain
- following the last branch optimized. This could
- be either a BRANCH, in which case the
- subsequence is internal, or it could be the
- item following the branch sequence in which
- case the subsequence is at the end.
+ Step through the branches
+ cur represents each branch,
+ noper is the first thing to be matched as part of that branch
+ noper_next is the regnext() of that node.
+
+ We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+ via a "jump trie" but we also support building with NOJUMPTRIE,
+ which restricts the trie logic to structures like /FOO|BAR/.
+
+ If noper is a trieable nodetype then the branch is a possible optimization
+ target. If we are building under NOJUMPTRIE then we require that noper_next
+ is the same as scan (our current position in the regex program).
+
+ Once we have two or more consecutive such branches we can create a
+ trie of the EXACT's contents and stitch it in place into the program.
+
+ If the sequence represents all of the branches in the alternation we
+ replace the entire thing with a single TRIE node.
+
+ Otherwise when it is a subsequence we need to stitch it in place and
+ replace only the relevant branches. This means the first branch has
+ to remain as it is used by the alternation logic, and its next pointer,
+ and needs to be repointed at the item on the branch chain following
+ the last branch we have optimized away.
+
+ This could be either a BRANCH, in which case the subsequence is internal,
+ or it could be the item following the branch sequence in which case the
+ subsequence is at the end (which does not necessarily mean the first node
+ is the start of the alternation).
+
+ TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+ optype | trietype
+ ----------------+-----------
+ NOTHING | NOTHING
+ EXACT | EXACT
+ EXACTFU | EXACTFU
+ EXACTFU_SS | EXACTFU
+ EXACTFU_TRICKYFOLD | EXACTFU
+ EXACTFA | 0
+
*/
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
+ ( EXACT == (X) ) ? EXACT : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ 0 )
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
regnode * const noper = NEXTOPER( cur );
+ U8 noper_type = OP( noper );
+ U8 noper_trietype = TRIE_TYPE( noper_type );
#if defined(DEBUGGING) || defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
+ U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
+ U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
#endif
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
PerlIO_printf( Perl_debug_log,"\t=> %s\t",
SvPV_nolen_const(mysv));
}
- PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
+ PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
+ REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
+ );
});
- if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
- : PL_regkind[ OP( noper ) ] == EXACT )
- || OP(noper) == NOTHING )
+
+ /* Is noper a trieable nodetype that can be merged with the
+ * current trie (if there is one)? */
+ if ( noper_trietype
+ &&
+ (
+ ( noper_trietype == NOTHING)
+ || ( trietype == NOTHING )
+ || ( trietype == noper_trietype )
+ )
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
&& count < U16_MAX)
{
- count++;
- if ( !first || optype == NOTHING ) {
- if (!first) first = cur;
- optype = OP( noper );
+ /* Handle mergable triable node
+ * Either we are the first node in a new trieable sequence,
+ * in which case we do some bookkeeping, otherwise we update
+ * the end pointer. */
+ if ( !first ) {
+ first = cur;
+ trietype = noper_trietype;
+ if ( noper_trietype == NOTHING ) {
+#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
+ regnode * const noper_next = regnext( noper );
+ U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
+ U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
+#endif
+
+ if ( noper_next_trietype )
+ trietype = noper_next_trietype;
+ }
} else {
+ if ( trietype == NOTHING )
+ trietype = noper_trietype;
last = cur;
}
- } else {
-/*
- Currently the trie logic handles case insensitive matching properly only
- when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
- semantics).
-
- If/when this is fixed the following define can be swapped
- in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-Note that join_exact() assumes that the other types of EXACTFish nodes are not
-used in tries, so that would have to be updated if this changed
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- make_trie( pRExC_state,
- startbranch, first, cur, tail, count,
- optype, depth+1 );
+ if (first)
+ count++;
+ } /* end handle mergable triable node */
+ else {
+ /* handle unmergable node -
+ * noper may either be a triable node which can not be tried
+ * together with the current trie, or a non triable node */
+ if ( last ) {
+ /* If last is set and trietype is not NOTHING then we have found
+ * at least two triable branch sequences in a row of a similar
+ * trietype so we can turn them into a trie. If/when we
+ * allow NOTHING to start a trie sequence this condition will be
+ * required, and it isn't expensive so we leave it in for now. */
+ if ( trietype != NOTHING )
+ make_trie( pRExC_state,
+ startbranch, first, cur, tail, count,
+ trietype, depth+1 );
+ last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
}
- if ( PL_regkind[ OP( noper ) ] == EXACT
+ if ( noper_trietype
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
){
+ /* noper is triable, so we can start a new trie sequence */
count = 1;
first = cur;
- optype = OP( noper );
- } else {
+ trietype = noper_trietype;
+ } else if (first) {
+ /* if we already saw a first but the current node is not triable then we have
+ * to reset the first information. */
count = 0;
first = NULL;
- optype = 0;
+ trietype = 0;
}
- last = NULL;
- }
- }
- DEBUG_OPTIMISE_r({
+ } /* end handle unmergable node */
+ } /* loop over branches */
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+ if ( last ) {
+ if ( trietype != NOTHING ) {
+ /* the last branch of the sequence was part of a trie,
+ * so we have to construct it here outside of the loop
+ */
+ made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
#ifdef TRIE_STUDY_OPT
- if ( ((made == MADE_EXACT_TRIE &&
- startbranch == first)
- || ( first_non_open == first )) &&
- depth==0 ) {
- flags |= SCF_TRIE_RESTUDY;
- if ( startbranch == first
- && scan == tail )
- {
- RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+ if ( ((made == MADE_EXACT_TRIE &&
+ startbranch == first)
+ || ( first_non_open == first )) &&
+ depth==0 ) {
+ flags |= SCF_TRIE_RESTUDY;
+ if ( startbranch == first
+ && scan == tail )
+ {
+ RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+ }
}
- }
#endif
- }
- }
+ } else {
+ /* at this point we know whatever we have is a NOTHING sequence/branch
+ * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
+ */
+ if ( startbranch == first ) {
+ regnode *opt;
+ /* the entire thing is a NOTHING sequence, something like this:
+ * (?:|) So we can turn it into a plain NOTHING op. */
+ DEBUG_TRIE_COMPILE_r({
+ regprop(RExC_rx, mysv, cur);
+ PerlIO_printf( Perl_debug_log,
+ "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
+ "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
+
+ });
+ OP(startbranch)= NOTHING;
+ NEXT_OFF(startbranch)= tail - startbranch;
+ for ( opt= startbranch + 1; opt < tail ; opt++ )
+ OP(opt)= OPTIMIZED;
+ }
+ }
+ } /* end if ( last) */
+ } /* TRIE_MAXBUF is non zero */
} /* do trie */
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
} else {
uc = *((U8*)STRING(scan));
}
* elsewhere that does compute the fold settles down, it
* can be extracted out and re-used here */
for (i = 0; i < 256; i++){
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+ if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
ANYOF_BITMAP_SET(data->start_class, i);
}
}
}
if (UTF) {
const U8 * const s = (U8 *)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
}
else if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ if ( OP(scan) == UNLESSM &&
+ scan->flags == 0 &&
+ OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
+ OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
+ ) {
+ regnode *opt;
+ regnode *upto= regnext(scan);
+ DEBUG_PARSE_r({
+ SV * const mysv_val=sv_newmortal();
+ DEBUG_STUDYDATA("OPFAIL",data,depth);
+
+ /*DEBUG_PARSE_MSG("opfail");*/
+ regprop(RExC_rx, mysv_val, upto);
+ PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val),
+ (IV)REG_NODE_NUM(upto),
+ (IV)(upto - scan)
+ );
+ });
+ OP(scan) = OPFAIL;
+ NEXT_OFF(scan) = upto - scan;
+ for (opt= scan + 1; opt < upto ; opt++)
+ OP(opt) = OPTIMIZED;
+ scan= upto;
+ continue;
+ }
if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
|| OP(scan) == UNLESSM )
{
}
}
}
-
-
}
#endif
}
* Clever compilers notice this and complain. --jhi */
REGC((U8)REG_MAGIC, (char*)RExC_emit);
#endif
- DEBUG_PARSE_r(
- PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
- RExC_lastnum=0;
- RExC_lastparse=NULL;
- );
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
if (reg(pRExC_state, 0, &flags,1) == NULL) {
RExC_precomp = NULL;
return(NULL);
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
+STATIC void
+S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
{
/* Subject to change or removal. Append the range from 'start' to 'end' at
* the end of the inversion list. The range must be above any existing
}
}
+#ifndef PERL_IN_XSUB_RE
+
STATIC IV
S_invlist_search(pTHX_ SV* const invlist, const UV cp)
{
return;
}
-#endif
-
-STATIC SV*
-S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+SV*
+Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
{
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
return invlist;
}
+#endif
+
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
- return add_range_to_invlist(invlist, cp, cp);
+ return _add_range_to_invlist(invlist, cp, cp);
}
#ifndef PERL_IN_XSUB_RE
RExC_parse++;
}
if (*RExC_parse != ')') {
- RExC_parse = s;
+ RExC_parse = s;
vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
}
if (!SIZE_ONLY) {
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
char has_charset_modifier = '\0';
- regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
- ? REGEX_UNICODE_CHARSET
- : REGEX_DEPENDS_CHARSET;
+ regex_charset cs = get_regex_charset(RExC_flags);
+ if (cs == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 || RExC_uni_semantics))
+ {
+ cs = REGEX_UNICODE_CHARSET;
+ }
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
}
break;
}
+ DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ SV * const mysv_val1=sv_newmortal();
+ SV * const mysv_val2=sv_newmortal();
+ DEBUG_PARSE_MSG("lsbr");
+ regprop(RExC_rx, mysv_val1, lastbr);
+ regprop(RExC_rx, mysv_val2, ender);
+ PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val1),
+ (IV)REG_NODE_NUM(lastbr),
+ SvPV_nolen_const(mysv_val2),
+ (IV)REG_NODE_NUM(ender),
+ (IV)(ender - lastbr)
+ );
+ });
REGTAIL(pRExC_state, lastbr, ender);
if (have_branch && !SIZE_ONLY) {
+ char is_nothing= 1;
if (depth==1)
RExC_seen |= REG_TOP_LEVEL_BRANCHES;
const U8 op = PL_regkind[OP(br)];
if (op == BRANCH) {
REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
+ if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
+ is_nothing= 0;
}
else if (op == BRANCHJ) {
REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
+ /* for now we always disable this optimisation * /
+ if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
+ */
+ is_nothing= 0;
}
}
+ if (is_nothing) {
+ br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
+ DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ SV * const mysv_val1=sv_newmortal();
+ SV * const mysv_val2=sv_newmortal();
+ DEBUG_PARSE_MSG("NADA");
+ regprop(RExC_rx, mysv_val1, ret);
+ regprop(RExC_rx, mysv_val2, ender);
+ PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val1),
+ (IV)REG_NODE_NUM(ret),
+ SvPV_nolen_const(mysv_val2),
+ (IV)REG_NODE_NUM(ender),
+ (IV)(ender - ret)
+ );
+ });
+ OP(br)= NOTHING;
+ if (OP(ender) == TAIL) {
+ NEXT_OFF(br)= 0;
+ RExC_emit= br + 1;
+ } else {
+ regnode *opt;
+ for ( opt= br + 1; opt < ender ; opt++ )
+ OP(opt)= OPTIMIZED;
+ NEXT_OFF(br)= ender - br;
+ }
+ }
}
}
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
- case '{':
- if (!regcurly(RExC_parse)) {
- RExC_parse++;
- goto defchar;
- }
- /* FALL THROUGH */
case '?':
case '+':
case '*':
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
- ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
- }
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
- ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
- }
goto finish_meta_pat;
case 's':
switch (get_regex_charset(RExC_flags)) {
/* FALL THROUGH */
default:
if (!SIZE_ONLY&& isALPHA(*p)) {
- /* Include any { following the alpha to emphasize
- * that it could be part of an escape at some point
- * in the future */
- int len = (*(p + 1) == '{') ? 2 : 1;
- ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+ ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
}
goto normal_default;
}
break;
+ case '{':
+ /* Currently we don't warn when the lbrace is at the start
+ * of a construct. This catches it in the middle of a
+ * literal string, or when its the first thing after
+ * something like "\b" */
+ if (! SIZE_ONLY
+ && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+ {
+ ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+ }
+ /*FALLTHROUGH*/
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+
+ /* tmpbuf has been constructed by us, so we
+ * know it is valid utf8 */
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
s += unilen;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
len += unilen;
* time */
#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
l1_sourcelist, Xpropertyname, run_time_list) \
+ /* First, resolve whether to use the ASCII-only list or the L1 \
+ * list */ \
+ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
+ ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
+ Xpropertyname, run_time_list)
+
+#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
+ Xpropertyname, run_time_list) \
/* If not /a matching, there are going to be code points we will have \
* to defer to runtime to look-up */ \
if (! AT_LEAST_ASCII_RESTRICTED) { \
ANYOF_CLASS_SET(node, class); \
} \
else { \
- _invlist_union(destlist, \
- (AT_LEAST_ASCII_RESTRICTED) \
- ? sourcelist \
- : l1_sourcelist, \
- &destlist); \
+ _invlist_union(destlist, sourcelist, &destlist); \
}
/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
PL_PosixCntrl, PL_XPosixCntrl);
break;
case ANYOF_DIGIT:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ /* There are no digits in the Latin1 range outside of
+ * ASCII, so call the macro that doesn't have to resolve
+ * them */
+ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+ PL_PosixDigit, "XPosixDigit", listsv);
break;
case ANYOF_NDIGIT:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
if (value > 255) {
const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
const UV natvalue = NATIVE_TO_UNI(value);
- nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+ nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
}
#ifdef EBCDIC
literal_endpoint = 0;
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
const UV f =
- _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
+ _to_uni_fold_flags(j, foldbuf, &foldlen,
+ (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
if (foldlen > (STRLEN)UNISKIP(f)) {
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
* Local variables:
* c-indentation-style: bsd
* c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
* End:
*
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
*/