X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d82f99444a5b81603cb066efa358862a46ce3275..37e77c235a1d9e646643ada8d96154d9ae4d5137:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 88d9218..7500e37 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,6 +86,11 @@ #endif #include "dquote_static.c" +#ifndef PERL_IN_XSUB_RE +# include "charclass_invlists.h" +#endif + +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifdef op #undef op @@ -104,8 +109,10 @@ #define STATIC static #endif + typedef struct RExC_state_t { - U32 flags; /* are we folding, multilining? */ + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ @@ -126,7 +133,6 @@ typedef struct RExC_state_t { I32 nestroot; /* root parens we are in - used by accept */ I32 extralen; I32 seen_zerolen; - I32 seen_evals; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ @@ -144,10 +150,15 @@ typedef struct RExC_state_t { I32 in_lookbehind; I32 contains_locale; I32 override_recoding; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ #ifdef DEBUGGING const char *lastparse; I32 lastnum; @@ -159,6 +170,7 @@ typedef struct RExC_state_t { } RExC_state_t; #define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) @@ -181,7 +193,6 @@ typedef struct RExC_state_t { #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_uni_semantics (pRExC_state->uni_semantics) #define RExC_orig_utf8 (pRExC_state->orig_utf8) @@ -210,7 +221,8 @@ typedef struct RExC_state_t { #define HASWIDTH 0x01 /* Known to match non-null strings. */ /* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single - * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */ + * character, and if utf8, must be invariant. Note that this is not the same + * thing as REGNODE_SIMPLE */ #define SIMPLE 0x02 #define SPSTART 0x04 /* Starts with * or +. */ #define TRYAGAIN 0x08 /* Weeded out a declaration. */ @@ -379,9 +391,11 @@ static const scan_data_t zero_scan_data = #define SCF_SEEN_ACCEPT 0x8000 #define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) @@ -1361,44 +1375,47 @@ is the recommended Unicode-aware way of saying *(d++) = uv; */ -#define TRIE_STORE_REVCHAR \ +#define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(2); \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ av_push(revcharmap, zlopp); \ } else { \ - char ooooff = (char)uvc; \ + char ooooff = (char)val; \ av_push(revcharmap, newSVpvn(&ooooff, 1)); \ } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - if ( folder ) { \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = UTF8SKIP(uc);\ - uvc = to_utf8_fold( uc, foldbuf, &foldlen); \ - foldlen -= UNISKIP( uvc ); \ - scan = foldbuf + UNISKIP( uvc ); \ - } \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - } \ - } else { \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need folding */ \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* if we use this folder we have to obey unicode rules on latin-1 data */ \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan += len; \ + len = 0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP(uvc); \ + foldlen -= skiplen; \ + scan = foldbuf + skiplen; \ + } \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1517,10 +1534,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs switch (flags) { case EXACT: break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1529,7 +1548,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (!(UTF && folder)) + if (flags == EXACT) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -1542,7 +1561,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ PerlIO_printf( Perl_debug_log, "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", (int)depth * 2 + 2, "", @@ -1584,42 +1603,61 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs */ for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); STRLEN foldlen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ STRLEN chars = 0; bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ if (OP(noper) == NOTHING) { - trie->minlen= 0; - continue; + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } } - if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ - + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie,0xDF); + } + } for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; chars++; if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } if ( !trie->charmap[ uvc ] ) { trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - if ( folder ) - trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR( uvc ); } if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie,uvc); + TRIE_BITMAP_SET(trie, uvc); /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); if ( !UTF ) { /* store first byte of utf8 representation of @@ -1642,17 +1680,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !SvTRUE( *svpp ) ) { sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR(uvc); } } } if( cur == first ) { - trie->minlen=chars; - trie->maxlen=chars; + trie->minlen = chars; + trie->maxlen = chars; } else if (chars < trie->minlen) { - trie->minlen=chars; + trie->minlen = chars; } else if (chars > trie->maxlen) { - trie->maxlen=chars; + trie->maxlen = chars; + } + if (OP( noper ) == EXACTFU_SS) { + /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ + if (trie->minlen > 1) + trie->minlen= 1; + } + if (OP( noper ) == EXACTFU_TRICKYFOLD) { + /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" + * - We assume that any such sequence might match a 2 byte string */ + if (trie->minlen > 2 ) + trie->minlen= 2; } } /* end first pass */ @@ -1716,15 +1765,25 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ U8 *scan = (U8*)NULL; /* sanity init */ STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; + + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } if (OP(noper) != NOTHING) { for ( ; uc < e ; uc += len ) { @@ -1913,9 +1972,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ @@ -1925,8 +1984,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ + STRLEN skiplen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + if ( OP(noper) != NOTHING ) { for ( ; uc < e ; uc += len ) { @@ -2505,15 +2574,116 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * these get optimized out + * + * If there are problematic code sequences, *min_subtract is set to the delta + * that the minimum size of the node can be less than its actual size. And, + * the node type of the result is changed to reflect that it contains these + * sequences. + * + * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF + * and contains LATIN SMALL LETTER SHARP S + * + * This is as good a place as any to discuss the design of handling these + * problematic sequences. It's been wrong in Perl for a very long time. There + * are three code points in Unicode whose folded lengths differ so much from + * the un-folded lengths that it causes problems for the optimizer and trie + * construction. Why only these are problematic, and not others where lengths + * also differ is something I (khw) do not understand. New versions of Unicode + * might add more such code points. Hopefully the logic in fold_grind.t that + * figures out what to test (in part by verifying that each size-combination + * gets tested) will catch any that do come along, so they can be added to the + * special handling below. The chances of new ones are actually rather small, + * as most, if not all, of the world's scripts that have casefolding have + * already been encoded by Unicode. Also, a number of Unicode's decisions were + * made to allow compatibility with pre-existing standards, and almost all of + * those have already been dealt with. These would otherwise be the most + * likely candidates for generating further tricky sequences. In other words, + * Unicode by itself is unlikely to add new ones unless it is for compatibility + * with pre-existing standards, and there aren't many of those left. + * + * The previous designs for dealing with these involved assigning a special + * node for them. This approach doesn't work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both these fold to "sss", but if the pattern is parsed to create a node of + * that would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss". + * + * There are a number of components to the approach (a lot of work for just + * three code points!): + * 1) This routine examines each EXACTFish node that could contain the + * problematic sequences. It returns in *min_subtract how much to + * subtract from the the actual length of the string to get a real minimum + * for one that could match it. This number is usually 0 except for the + * problematic sequences. This delta is used by the caller to adjust the + * min length of the match, and the delta between min and max, so that the + * optimizer doesn't reject these possibilities based on size constraints. + * 2) These sequences are not currently correctly handled by the trie code + * either, so it changes the joined node type to ops that are not handled + * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 3) This is sufficient for the two Greek sequences (described below), but + * the one involving the Sharp s (\xDF) needs more. The node type + * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" + * sequence in it. For non-UTF-8 patterns and strings, this is the only + * case where there is a possible fold length change. That means that a + * regular EXACTFU node without UTF-8 involvement doesn't have to concern + * itself with length changes, and so can be processed faster. regexec.c + * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 + * is pre-folded by regcomp.c. This saves effort in regex matching. + * However, probably mostly for historical reasons, the pre-folding isn't + * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL + * nodes, as what they fold to isn't known until runtime.) The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string + * are members of a fold-pair, and arrays are set up for all of them + * that quickly find the other member of the pair. It might actually + * be faster to pre-fold these, but it isn't currently done, except for + * the sharp s. Code elsewhere in this file makes sure that it gets + * folded to 'ss', even if the pattern isn't UTF-8. This avoids the + * issues described in the next item. + * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches + * 'ss' or not is not knowable at compile time. It will match iff the + * target string is in UTF-8, unlike the EXACTFU nodes, where it always + * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus + * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * described in item 3). An assumption that the optimizer part of + * regexec.c (probably unwittingly) makes is that a character in the + * pattern corresponds to at most a single character in the target string. + * (And I do mean character, and not byte here, unlike other parts of the + * documentation that have never been updated to account for multibyte + * Unicode.) This assumption is wrong only in this case, as all other + * cases are either 1-1 folds when no UTF-8 is involved; or is true by + * virtue of having this file pre-fold UTF-8 patterns. I'm + * reluctant to try to change this assumption, so instead the code punts. + * This routine examines EXACTF nodes for the sharp s, and returns a + * boolean indicating whether or not the node is an EXACTF node that + * contains a sharp s. When it is true, the caller sets a flag that later + * causes the optimizer in this file to not set values for the floating + * and fixed string lengths, and thus avoids the optimizer code in + * regexec.c that makes the invalid assumption. Thus, there is no + * optimization based on string lengths for EXACTF nodes that contain the + * sharp s. This only happens for /id rules (which means the pattern + * isn't in UTF-8). + */ - - -#define JOIN_EXACT(scan,min,flags) \ +#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2533,13 +2703,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); - - /* Skip NOTHING, merge EXACT*. */ - while (n && - ( PL_regkind[OP(n)] == NOTHING || - (stringok && (OP(n) == OP(scan)))) + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { if (OP(n) == TAIL || n > next) stringok = 0; @@ -2588,67 +2760,180 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; + *min_subtract = 0; + *has_exactf_sharp_s = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8 *s; + U8 * s0 = (U8*) STRING(scan); + U8 * const s_end = s0 + STR_LEN(scan); + + /* The below is perhaps overboard, but this allows us to save a test + * each time through the loop at the expense of a mask. This is + * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a + * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. + * This uses an exclusive 'or' to find that bit and then inverts it to + * form a mask, with just a single 0, in the bit position where 'S' and + * 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + /* One pass is made over the node's string looking for all the + * possibilities. to avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + + /* There are two problematic Greek code points in Unicode + * casefolding + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * If these sequences are found, the minimum length is decreased by + * four (six minus two). + * + * Similarly, 'ss' may match the single char and byte LATIN SMALL + * LETTER SHARP S. We decrease the min length by 1 for each + * occurrence of 'ss' found */ + #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; +# define U390_first_byte 0xb4 + const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; +# define U3B0_first_byte 0xb5 + const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; #else - const char t0[] = "\xcc\x88\xcc\x81"; +# define U390_first_byte 0xce + const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; +# define U3B0_first_byte 0xcf + const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; #endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) -#else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) -#endif - *min -= 4; - } + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; + yields a net of 0 */ + /* Examine the string for one of the problematic sequences */ + for (s = s0; + s < s_end - 1; /* Can stop 1 before the end, as minimum length + * sequence we are looking for is 2 */ + s += UTF8SKIP(s)) + { + + /* Look for the first byte in each problematic sequence */ + switch (*s) { + /* We don't have to worry about other things that fold to + * 's' (such as the long s, U+017F), as all above-latin1 + * code points have been pre-folded */ + case 's': + case 'S': + + /* Current character is an 's' or 'S'. If next one is + * as well, we have the dreaded sequence */ + if (((*(s+1) & S_or_s_mask) == s_masked) + /* These two node types don't have special handling + * for 'ss' */ + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + *min_subtract += 1; + OP(scan) = EXACTFU_SS; + s++; /* No need to look at this character again */ + } + break; + + case U390_first_byte: + if (s_end - s >= len + + /* The 1's are because are skipping comparing the + * first byte */ + && memEQ(s + 1, U390_tail, len - 1)) + { + goto greek_sequence; + } + break; + + case U3B0_first_byte: + if (! (s_end - s >= len + && memEQ(s + 1, U3B0_tail, len - 1))) + { + break; + } + greek_sequence: + *min_subtract += 4; + + /* This can't currently be handled by trie's, so change + * the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this + * would have to be changed. If this node has already + * been changed to EXACTFU_SS in this loop, leave it as + * is. (I (khw) think it doesn't matter in regexec.c + * for UTF patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_TRICKYFOLD; + } + s += 6; /* We already know what this sequence is. Skip + the rest of it */ + break; + } + } + } + else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + + /* Here, the pattern is not UTF-8. We need to look only for the + * 'ss' sequence, and in the EXACTF case, the sharp s, which can be + * in the final position. Otherwise we can stop looking 1 byte + * earlier because have to find both the first and second 's' */ + const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + + for (s = s0; s < upper; s++) { + switch (*s) { + case 'S': + case 's': + if (s_end - s > 1 + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + *min_subtract += 1; + + /* EXACTF nodes need to know that the minimum + * length changed so that a sharp s in the string + * can match this ss in the pattern, but they + * remain EXACTF nodes, as they are not trie'able, + * so don't have to invent a new node type to + * exclude them from the trie code */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } + s++; + } + break; + case LATIN_SMALL_LETTER_SHARP_S: + if (OP(scan) == EXACTF) { + *has_exactf_sharp_s = TRUE; + } + break; + } + } + } } - + #ifdef DEBUGGING /* Allow dumping but overwriting the collection of skipped * ops and/or strings with fake optimized ops */ @@ -2762,10 +3047,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How much to subtract from the minimum node + length to get a real minimum (because the + folded version may be shorter) */ + bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); + + /* Its not clear to khw or hv why this is done here, and not in the + * clauses that deal with EXACT nodes. khw's guess is that it's + * because of a previous design */ + JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -2956,7 +3249,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first = (regnode *)NULL; regnode *last = (regnode *)NULL; regnode *tail = scan; - U8 optype = 0; + U8 trietype = 0; U32 count=0; #ifdef DEBUGGING @@ -2976,7 +3269,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, tail ); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", (int)depth * 2 + 2, "", @@ -2987,37 +3280,66 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* - step through the branches, cur represents each - branch, noper is the first thing to be matched - as part of that branch and noper_next is the - regnext() of that node. if noper is an EXACT - and noper_next is the same as scan (our current - position in the regex) then the EXACT branch is - a possible optimization target. Once we have - two or more consecutive such branches we can - create a trie of the EXACT's contents and stich - it in place. If the sequence represents all of - the branches we eliminate the whole thing and - replace it with a single TRIE. If it is a - subsequence then we need to stitch it in. This - means the first branch has to remain, and needs - to be repointed at the item on the branch chain - following the last branch optimized. This could - be either a BRANCH, in which case the - subsequence is internal, or it could be the - item following the branch sequence in which - case the subsequence is at the end. + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this /FOO[xyz]|BAR[pqr]/ + via a "jump trie" but we also support building with NOJUMPTRIE, + which restricts the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is a possible optimization + target. If we are building under NOJUMPTRIE then we require that noper_next + is the same as scan (our current position in the regex program). + + Once we have two or more consecutive such branches we can create a + trie of the EXACT's contents and stitch it in place into the program. + + If the sequence represents all of the branches in the alternation we + replace the entire thing with a single TRIE node. + + Otherwise when it is a subsequence we need to stitch it in place and + replace only the relevant branches. This means the first branch has + to remain as it is used by the alternation logic, and its next pointer, + and needs to be repointed at the item on the branch chain following + the last branch we have optimized away. + + This could be either a BRANCH, in which case the subsequence is internal, + or it could be the item following the branch sequence in which case the + subsequence is at the end (which does not necessarily mean the first node + is the start of the alternation). + + TRIE_TYPE(X) is a define which maps the optype to a trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFU_TRICKYFOLD | EXACTFU + EXACTFA | 0 + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; #endif - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); @@ -3031,84 +3353,134 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } - PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", - REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); }); - if ( (((first && optype!=NOTHING) ? OP( noper ) == optype - : PL_regkind[ OP( noper ) ] == EXACT ) - || OP(noper) == NOTHING ) + + /* Is noper a trieable nodetype that can be merged with the + * current trie (if there is one)? */ + if ( noper_trietype + && + ( + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + || ( trietype == noper_trietype ) + ) #ifdef NOJUMPTRIE && noper_next == tail #endif && count < U16_MAX) { - count++; - if ( !first || optype == NOTHING ) { - if (!first) first = cur; - optype = OP( noper ); + /* Handle mergable triable node + * Either we are the first node in a new trieable sequence, + * in which case we do some bookkeeping, otherwise we update + * the end pointer. */ + if ( !first ) { + first = cur; + trietype = noper_trietype; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) + trietype = noper_next_trietype; + } } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; last = cur; } - } else { -/* - Currently the trie logic handles case insensitive matching properly only - when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode - semantics). - - If/when this is fixed the following define can be swapped - in below to fully enable trie logic. - -#define TRIE_TYPE_IS_SAFE 1 - -*/ -#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) - - if ( last && TRIE_TYPE_IS_SAFE ) { - make_trie( pRExC_state, - startbranch, first, cur, tail, count, - optype, depth+1 ); + if (first) + count++; + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can not be tried + * together with the current trie, or a non triable node */ + if ( last ) { + /* If last is set and trietype is not NOTHING then we have found + * at least two triable branch sequences in a row of a similar + * trietype so we can turn them into a trie. If/when we + * allow NOTHING to start a trie sequence this condition will be + * required, and it isn't expensive so we leave it in for now. */ + if ( trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, count, + trietype, depth+1 ); + last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ } - if ( PL_regkind[ OP( noper ) ] == EXACT + if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ + /* noper is triable, so we can start a new trie sequence */ count = 1; first = cur; - optype = OP( noper ); - } else { + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the current node is not triable then we have + * to reset the first information. */ count = 0; first = NULL; - optype = 0; + trietype = 0; } - last = NULL; - } - } - DEBUG_OPTIMISE_r({ + } /* end handle unmergable node */ + } /* loop over branches */ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - - if ( last && TRIE_TYPE_IS_SAFE ) { - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); + if ( last ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); #ifdef TRIE_STUDY_OPT - if ( ((made == MADE_EXACT_TRIE && - startbranch == first) - || ( first_non_open == first )) && - depth==0 ) { - flags |= SCF_TRIE_RESTUDY; - if ( startbranch == first - && scan == tail ) - { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + } } - } #endif - } - } + } else { + /* at this point we know whatever we have is a NOTHING sequence/branch + * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, something like this: + * (?:|) So we can turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ } /* do trie */ @@ -3181,8 +3553,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } else { uc = *((U8*)STRING(scan)); } @@ -3245,7 +3617,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * elsewhere that does compute the fold settles down, it * can be extracted out and re-used here */ for (i = 0; i < 256; i++){ - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { + if (HAS_NONLATIN1_FOLD_CLOSURE(i)) { ANYOF_BITMAP_SET(data->start_class, i); } } @@ -3276,12 +3648,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (UTF) { const U8 * const s = (U8 *)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } - min += l; - if (flags & SCF_DO_SUBSTR) - data->pos_min += l; + else if (has_exactf_sharp_s) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + min += l - min_subtract; + if (min < 0) { + min = 0; + } + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; @@ -3309,8 +3696,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Also set the other member of the fold pair. In case * that unicode semantics is called for at runtime, use * the full latin1 fold. (Can't do this for locale, - * because not known until runtime */ + * because not known until runtime) */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All other (EXACTFL handled above) folds except under + * /iaa that include s, S, and sharp_s also may include + * the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } else if (uc >= 0x100) { @@ -3335,6 +3736,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * run-time */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All folds except under /iaa that include s, S, + * and sharp_s also may include the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } data->start_class->flags &= ~ANYOF_EOS; @@ -3740,18 +4154,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -4001,6 +4403,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Lookbehind, or need to calculate parens/evals/stclass: */ && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto); + PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { @@ -4165,8 +4593,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } } - - } #endif } @@ -4467,71 +4893,359 @@ Perl_reginitcolors(pTHX) #endif /* - - pregcomp - compile a regular expression into internal code + * pregcomp - compile a regular expression into internal code * - * We can't allocate space until we know how big the compiled form will be, - * but we can't compile it (and thus know how big it is) until we've got a - * place to put the code. So we cheat: we compile it twice, once with code - * generation turned off and size counting turned on, and once "for real". - * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the - * code and thus invalidate pointers into it. (Note that it has to be in - * one piece because free() must be able to free it all.) [NB: not true in perl] - * - * Beware that the optimization-preparation code in here knows about some - * of the structure of the compiled regexp. [I'll say.] + * Decides which engine's compiler to call based on the hint currently in + * scope */ +#ifndef PERL_IN_XSUB_RE +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + dVAR; + + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table) + return &PL_core_reg_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &PL_core_reg_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} -#ifndef PERL_IN_XSUB_RE -#define RE_ENGINE_PTR &PL_core_reg_engine -#else -extern const struct regexp_engine my_reg_engine; -#define RE_ENGINE_PTR &my_reg_engine -#endif -#ifndef PERL_IN_XSUB_RE REGEXP * Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; - HV * const table = GvHV(PL_hintgv); + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_PREGCOMP; - /* Dispatch a request to compile a regexp to correct - regexp engine. */ - if (table) { - SV **ptr= hv_fetchs(table, "regcomp", FALSE); - GET_RE_DEBUG_FLAGS_DECL; - if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { - const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); - DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", - SvIV(*ptr)); - }); - return CALLREGCOMP_ENG(eng, pattern, flags); - } - } - return Perl_re_compile(aTHX_ pattern, flags); + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); } #endif +/* public(ish) wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), + NULL, NULL, rx_flags, 0); +} + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, + U32 pm_flags, char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + /* avoid infinitely recursing when we recompile the pattern parcelled up + * as qr'...'. A single constant qr// string can't have have any + * run-time component in it, and thus, no runtime code. (A non-qr + * string, however, can, e.g. $x =~ '(?{})') */ + if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST) + return 0; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && pat[s+1] == '?' && + (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = ' '; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* this causes the toker to collapse \\ into \ when parsing + * qr''; normally only q'' does this. It also alters hints + * handling */ + PL_reg_state.re_reparsing = TRUE; + eval_sv(sv, G_SCALAR); + SvREFCNT_dec(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + if (SvTRUE(ERRSV)) + Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + return 1; + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec(qr); + return 1; +} + + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + REGEXP * -Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) { dVAR; REGEXP *rx; struct regexp *r; register regexp_internal *ri; STRLEN plen; - char* VOL exp; + char * VOL exp; char* xend; regnode *scan; I32 flags; I32 minlen = 0; - U32 pm_flags; + U32 rx_flags; + SV * VOL pat; /* these are all flags - maybe they should be turned * into a single int with different bit masks */ @@ -4539,8 +5253,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) I32 sawplus = 0; I32 sawopen = 0; bool used_setjump = FALSE; - regex_charset initial_charset = get_regex_charset(orig_pm_flags); - + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool code_is_utf8 = 0; + bool VOL recompile = 0; + bool runtime_code = 0; U8 jump_ret = 0; dJMPENV; scan_data_t data; @@ -4552,20 +5268,300 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) #endif GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_RE_COMPILE; + PERL_ARGS_ASSERT_RE_OP_COMPILE; DEBUG_r(if (!PL_colorset) reginitcolors()); - exp = SvPV(pattern, plen); +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_ASCII = _new_invlist_C_array(ASCII_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist); + PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist); + + PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist); + PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist); + + PL_L1Cased = _new_invlist_C_array(L1Cased_invlist); - if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ - RExC_utf8 = RExC_orig_utf8 = 0; + PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist); + PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist); + + PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist); + + PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); + PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); + PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); + + PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist); + PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist); + + PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist); + PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist); + + PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist); + PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist); + + PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist); + PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist); + + PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist); + PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist); + + PL_VertSpace = _new_invlist_C_array(VertSpace_invlist); + + PL_PosixWord = _new_invlist_C_array(PosixWord_invlist); + PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist); + + PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + } +#endif + + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + + /* is the source UTF8, and how many code blocks are there? */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv)) + code_is_utf8 = 1; + else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + /* count of DO blocks */ + ncode++; + } + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (pat_count) { + /* handle a list of SVs */ + + SV **svp; + + /* apply magic and RE overloading to each arg */ + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *rx = *svp; + SvGETMAGIC(rx); + if (SvROK(rx) && SvAMAGIC(rx)) { + SV *sv = AMG_CALLunary(rx, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + *svp = sv; + } + } + } + + if (pat_count > 1) { + /* concat multiple args and find any code block indexes */ + + OP *o = NULL; + int n = 0; + bool utf8 = 0; + + if (pRExC_state->num_code_blocks) { + o = cLISTOPx(expr)->op_first; + assert(o->op_type == OP_PUSHMARK); + o = o->op_sibling; + } + + pat = newSVpvn("", 0); + SAVEFREESV(pat); + + /* determine if the pattern is going to be utf8 (needed + * in advance to align code block indices correctly). + * XXX This could fail to be detected for an arg with + * overloading but not concat overloading; but the main effect + * in this obscure case is to need a 'use re eval' for a + * literal code block */ + for (svp = patternp; svp < patternp + pat_count; svp++) { + if (SvUTF8(*svp)) + utf8 = 1; + } + if (utf8) + SvUTF8_on(pat); + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv, *msv = *svp; + SV *rx; + bool code = 0; + if (o) { + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = SvCUR(pat); + pRExC_state->code_blocks[n].block = o; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + o = o->op_sibling; /* skip CONST */ + assert(o); + } + o = o->op_sibling;; + } + + /* extract any code blocks within any embedded qr//'s */ + rx = msv; + if (SvROK(rx)) + rx = SvRV(rx); + if (SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + recompile = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = SvCUR(pat) + + ((struct regexp *)SvANY(rx))->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + + if ((SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + + } + else { + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + sv_catsv_nomg(pat, msv); + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + } + SvSETMAGIC(pat); + } + else + pat = *patternp; + + /* handle bare regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + return (REGEXP*)re; + } + } } else { - RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); + /* not a list of SVs, so must be a list of OPs */ + assert(expr); + if (expr->op_type == OP_LIST) { + int i = -1; + bool is_code = 0; + OP *o; + + pat = newSVpvn("", 0); + SAVEFREESV(pat); + if (code_is_utf8) + SvUTF8_on(pat); + + /* given a list of CONSTs and DO blocks in expr, append all + * the CONSTs to pat, and record the start and end of each + * code block in code_blocks[] (each DO{} op is followed by an + * OP_CONST containing the corresponding literal '(?{...}) + * text) + */ + for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + if (o->op_type == OP_CONST) { + sv_catsv(pat, cSVOPo_sv); + if (is_code) { + pRExC_state->code_blocks[i].end = SvCUR(pat)-1; + is_code = 0; + } + } + else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(i+1 < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[++i].start = SvCUR(pat); + pRExC_state->code_blocks[i].block = o; + pRExC_state->code_blocks[i].src_regex = NULL; + is_code = 1; + } + } + } + else { + assert(expr->op_type == OP_CONST); + pat = cSVOPx_sv(expr); + } } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); + } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + pRExC_state->runtime_code_qr = NULL; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -4586,7 +5582,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) }); } else { /* longjumped back */ - STRLEN len = plen; + U8 *src, *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; /* If the cause for the longjmp was other than changing to utf8, pop * our own setjmp, and longjmp to the correct handler */ @@ -4607,19 +5606,81 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) -- dmq */ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - exp = (char*)Perl_bytes_to_utf8(aTHX_ - (U8*)SvPV_nomg(pattern, plen), - &len); - xend = exp + len; - RExC_orig_utf8 = RExC_utf8 = 1; - SAVEFREEPV(exp); + + /* upgrade pattern to UTF8, and if there are code blocks, + * recalculate the indices. + * This is essentially an unrolled Perl_bytes_to_utf8() */ + + src = (U8*)SvPV_nomg(pat, plen); + Newx(dst, plen * 2 + 1, U8); + + while (s < plen) { + const UV uv = NATIVE_TO_ASCII(src[s]); + if (UNI_IS_INVARIANT(uv)) + dst[d] = (U8)UTF_TO_NATIVE(uv); + else { + dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); + dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + } + if (n < pRExC_state->num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + plen = d; + exp = (char*) dst; + xend = exp + plen; + SAVEFREEPV(exp); + RExC_orig_utf8 = RExC_utf8 = 1; } + /* return old regex if pattern hasn't changed */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen)) + { + /* with runtime code, always recompile */ + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + if (!runtime_code) { + ReREFCNT_inc(old_re); + if (used_setjump) { + JMPENV_POP; + } + Safefree(pRExC_state->code_blocks); + return old_re; + } + } + else if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME + && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + #ifdef TRIE_STUDY_OPT restudied = 0; #endif - pm_flags = orig_pm_flags; + rx_flags = orig_rx_flags; if (initial_charset == REGEX_LOCALE_CHARSET) { RExC_contains_locale = 1; @@ -4628,17 +5689,30 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } RExC_precomp = exp; - RExC_flags = pm_flags; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (PL_tainting && PL_tainted) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + JMPENV_JUMP(UTF8_LONGJMP); + } + } + assert(!pRExC_state->runtime_code_qr); + RExC_sawback = 0; RExC_seen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_seen_evals = 0; RExC_extralen = 0; RExC_override_recoding = 0; @@ -4661,6 +5735,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) #endif RExC_recurse = NULL; RExC_recurse_count = 0; + pRExC_state->code_index = 0; #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ @@ -4673,6 +5748,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) ); if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; + Safefree(pRExC_state->code_blocks); return(NULL); } @@ -4692,9 +5768,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* The first pass could have found things that force Unicode semantics */ if ((RExC_utf8 || RExC_uni_semantics) - && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) { - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } /* Small enough for pointer-storage convention? @@ -4725,8 +5801,15 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* non-zero initialization begins here */ RXi_SET( r, ri ); - r->engine= RE_ENGINE_PTR; - r->extflags = pm_flags; + r->engine= eng; + r->extflags = rx_flags; + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + SAVEFREEPV(pRExC_state->code_blocks); + { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); @@ -4758,7 +5841,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ SvPOK_on(rx); - SvFLAGS(rx) |= SvUTF8(pattern); + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; *p++='('; *p++='?'; /* If a default, cover it using the caret */ @@ -4818,7 +5902,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm_flags; /* don't let top level (?i) bleed */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -4826,9 +5911,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_emit_start = ri->program; RExC_emit = ri->program; RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; - /* Store the count of eval-groups for security checks: */ - RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); @@ -4981,7 +6065,7 @@ reStudy: else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) + !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -4994,7 +6078,7 @@ reStudy: goto again; } if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) - && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -5067,9 +6151,11 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -5112,10 +6198,13 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -5262,7 +6351,7 @@ reStudy: r->extflags |= RXf_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) r->extflags |= RXf_LOOKBEHIND_SEEN; - if (RExC_seen & REG_SEEN_EVAL) + if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; @@ -5270,6 +6359,8 @@ reStudy: r->intflags |= PREGf_VERBARG_SEEN; if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; if (RExC_paren_names) RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else @@ -5340,8 +6431,6 @@ reStudy: return rx; } -#undef RE_ENGINE_PTR - SV* Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, @@ -5878,7 +6967,19 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ #define INVLIST_ITER_OFFSET 1 /* Current iteration position */ -#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */ +/* This is a combination of a version and data structure type, so that one + * being passed in can be validated to be an inversion list of the correct + * vintage. When the structure of the header is changed, a new random number + * in the range 2**31-1 should be generated and the new() method changed to + * insert that at this location. Then, if an auxiliary program doesn't change + * correspondingly, it will be discovered immediately */ +#define INVLIST_VERSION_ID_OFFSET 2 +#define INVLIST_VERSION_ID 1064334010 + +/* For safety, when adding new elements, remember to #undef them at the end of + * the inversion list code section */ + +#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list * contains the code point U+00000, and begins here. If 1, the inversion list * doesn't contain U+0000, and it begins at the next UV in the array. @@ -6038,10 +7139,39 @@ Perl__new_invlist(pTHX_ IV initial_size) * properly */ *get_invlist_zero_addr(new_list) = UV_MAX; + *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; +#if HEADER_LENGTH != 4 +# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length +#endif + return new_list; } #endif +STATIC SV* +S__new_invlist_C_array(pTHX_ UV* list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands */ + + SV* invlist = newSV_type(SVt_PV); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + SvPV_set(invlist, (char *) list); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + + if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + return invlist; +} + STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) { @@ -6068,9 +7198,10 @@ S_invlist_trim(pTHX_ SV* const invlist) #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) -#ifndef PERL_IN_XSUB_RE -void -Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) +#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) + +STATIC void +S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing @@ -6148,6 +7279,8 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV } } +#ifndef PERL_IN_XSUB_RE + STATIC IV S_invlist_search(pTHX_ SV* const invlist, const UV cp) { @@ -6273,12 +7406,17 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } + void -Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * should be defined upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. + * the reference count to that list will be decremented. The first list, + * , may be NULL, in which case a copy of the second list is returned. + * If is TRUE, the union is taken of the complement + * (inversion) of instead of b itself. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6313,17 +7451,21 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_UNION; + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); /* If either one is empty, the union is the other one */ - len_a = invlist_len(a); - if (len_a == 0) { + if (a == NULL || ((len_a = invlist_len(a)) == 0)) { if (*output == a) { - SvREFCNT_dec(a); + if (a != NULL) { + SvREFCNT_dec(a); + } } if (*output != b) { *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } } /* else *output already = b; */ return; } @@ -6331,10 +7473,20 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) if (*output == b) { SvREFCNT_dec(b); } - if (*output != a) { - *output = invlist_clone(a); - } - /* else *output already = a; */ + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + SvREFCNT_dec(a); + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ return; } @@ -6342,6 +7494,31 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the union for the worst case: that the sets are completely * disjoint */ u = _new_invlist(len_a + len_b); @@ -6460,16 +7637,24 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) SvREFCNT_dec(*output); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *output = u; return; } void -Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * should be defined upon input, and if it points to one of the two lists, * the reference count to that list will be decremented. + * If is TRUE, the result will be the intersection of + * and the complement (or inversion) of instead of directly. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6500,22 +7685,38 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_INTERSECTION; + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the intersection is null */ + /* Special case if either one is empty */ len_a = invlist_len(a); if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { - /* If the result is the same as one of the inputs, the input is being - * overwritten */ + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + *i = invlist_clone(a); + + if (*i == b) { + SvREFCNT_dec(b); + } + } + /* else *i is already 'a' */ + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ if (*i == a) { SvREFCNT_dec(a); } else if (*i == b) { SvREFCNT_dec(b); } - *i = _new_invlist(0); return; } @@ -6524,6 +7725,31 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the intersection for the worst case: that the intersection ends up * fragmenting everything to be completely disjoint */ r= _new_invlist(len_a + len_b); @@ -6632,14 +7858,17 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) SvREFCNT_dec(*i); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *i = r; return; } -#endif - -STATIC SV* -S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +SV* +Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) { /* Add the range from 'start' to 'end' inclusive to the inversion list's * set. A pointer to the inversion list is returned. This may actually be @@ -6680,9 +7909,11 @@ S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) return invlist; } +#endif + PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { - return add_range_to_invlist(invlist, cp, cp); + return _add_range_to_invlist(invlist, cp, cp); } #ifndef PERL_IN_XSUB_RE @@ -6773,46 +8004,6 @@ S_invlist_clone(pTHX_ SV* const invlist) return new_invlist; } -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result) -{ - /* Point to an inversion list which consists of all elements in - * that aren't also in . *result should be defined upon input, and - * if it points to C its reference count will be decremented. */ - - PERL_ARGS_ASSERT__INVLIST_SUBTRACT; - assert(a != b); - - /* Subtracting nothing retains the original */ - if (invlist_len(b) == 0) { - - if (*result == b) { - SvREFCNT_dec(b); - } - - /* If the result is not to be the same variable as the original, create - * a copy */ - if (*result != a) { - *result = invlist_clone(a); - } - } else { - SV *b_copy = invlist_clone(b); - _invlist_invert(b_copy); /* Everything not in 'b' */ - - if (*result == b) { - SvREFCNT_dec(b); - } - - _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in - 'b' */ - SvREFCNT_dec(b_copy); - } - - return; -} -#endif - PERL_STATIC_INLINE UV* S_get_invlist_iter_addr(pTHX_ SV* invlist) { @@ -6824,6 +8015,16 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); } +PERL_STATIC_INLINE UV* +S_get_invlist_version_id_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the version id. */ + + PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); +} + PERL_STATIC_INLINE void S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ { @@ -6929,6 +8130,7 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) #undef INVLIST_LEN_OFFSET #undef INVLIST_ZERO_OFFSET #undef INVLIST_ITER_OFFSET +#undef INVLIST_VERSION_ID /* End of inversion list object */ @@ -7369,67 +8571,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH */ case '{': /* (?{...}) */ { - I32 count = 1; U32 n = 0; - char c; - char *s = RExC_parse; + struct reg_code_block *cb; RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_EVAL; - while (count && (c = *RExC_parse)) { - if (c == '\\') { - if (RExC_parse[1]) - RExC_parse++; - } - else if (c == '{') - count++; - else if (c == '}') - count--; - RExC_parse++; - } - if (*RExC_parse != ')') { - RExC_parse = s; - vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; if (!SIZE_ONLY) { - PAD *pad; - OP_4tree *sop, *rop; - SV * const sv = newSVpvn(s, RExC_parse - 1 - s); - - ENTER; - Perl_save_re_context(aTHX); - rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); - sop->op_private |= OPpREFCOUNTED; - /* re_dup will OpREFCNT_inc */ - OpREFCNT_set(sop, 1); - LEAVE; - - n = add_data(pRExC_state, 3, "nop"); - RExC_rxi->data->data[n] = (void*)rop; - RExC_rxi->data->data[n+1] = (void*)sop; - RExC_rxi->data->data[n+2] = (void*)pad; - SvREFCNT_dec(sv); - } - else { /* First pass */ - if (PL_reginterp_cnt < ++RExC_seen_evals - && IN_PERL_RUNTIME) - /* No compiled RE interpolated, has runtime - components ===> unsafe. */ - FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainting && PL_tainted) - FAIL("Eval-group in insecure regular expression"); -#if PERL_VERSION > 8 - if (IN_PERL_COMPILETIME) - PL_cv_has_eval = 1; -#endif + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, 2, "rl"); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, 1, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + RExC_rxi->data->data[n] = (void*)o; + } } - + pRExC_state->code_index++; nextchar(pRExC_state); + if (is_logical) { + regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - if (!SIZE_ONLY) + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { ret->flags = 2; - REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } @@ -7575,9 +8761,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; char has_charset_modifier = '\0'; - regex_charset cs = (RExC_utf8 || RExC_uni_semantics) - ? REGEX_UNICODE_CHARSET - : REGEX_DEPENDS_CHARSET; + regex_charset cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -7856,9 +9045,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr); + regprop(RExC_rx, mysv_val2, ender); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); REGTAIL(pRExC_state, lastbr, ender); if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; if (depth==1) RExC_seen |= REG_TOP_LEVEL_BRANCHES; @@ -7867,11 +9071,44 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; } } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret); + regprop(RExC_rx, mysv_val2, ender); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } } } @@ -8551,12 +9788,6 @@ tryagain: vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; - case '{': - if (!regcurly(RExC_parse)) { - RExC_parse++; - goto defchar; - } - /* FALL THROUGH */ case '?': case '+': case '*': @@ -8682,9 +9913,6 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead"); - } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; @@ -8709,9 +9937,6 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; - if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { - ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead"); - } goto finish_meta_pat; case 's': switch (get_regex_charset(RExC_flags)) { @@ -9003,41 +10228,51 @@ tryagain: RExC_parse++; defchar: { - typedef enum { - generic_char = 0, - char_s, - upsilon_1, - upsilon_2, - iota_1, - iota_2, - } char_state; - char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; char *s; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - regnode * orig_emit; U8 node_type; + /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, + * it is folded to 'ss' even if not utf8 */ + bool is_exactfu_sharp_s; + ender = 0; - orig_emit = RExC_emit; /* Save the original output node position in - case we need to output a different node - type */ - node_type = (U8) ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF); + node_type = ((! FOLD) ? EXACT + : (LOC) + ? EXACTFL + : (MORE_ASCII_RESTRICTED) + ? EXACTFA + : (AT_LEAST_UNI_SEMANTICS) + ? EXACTFU + : EXACTF); ret = reg_node(pRExC_state, node_type); s = STRING(ret); + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; - len++) + len < 127 && p < RExC_end; + len++) { char * const oldp = p; @@ -9208,15 +10443,22 @@ tryagain: /* FALL THROUGH */ default: if (!SIZE_ONLY&& isALPHA(*p)) { - /* Include any { following the alpha to emphasize - * that it could be part of an escape at some point - * in the future */ - int len = (*(p + 1) == '{') ? 2 : 1; - ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p); + ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p); } goto normal_default; } break; + case '{': + /* Currently we don't warn when the lbrace is at the start + * of a construct. This catches it in the middle of a + * literal string, or when its the first thing after + * something like "\b" */ + if (! SIZE_ONLY + && (len || (p > RExC_start && isALPHA_A(*(p -1))))) + { + ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + } + /*FALLTHROUGH*/ default: normal_default: if (UTF8_IS_START(*p) && UTF) { @@ -9230,223 +10472,16 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) - { - /* We look for either side of the fold. For example \xDF - * folds to 'ss'. We look for both the single character - * \xDF and the sequence 'ss'. When we find something that - * could be one of those, we stop and flush whatever we - * have output so far into the EXACTish node that was being - * built. Then restore the input pointer to what it was. - * regatom will return that EXACT node, and will be called - * again, positioned so the first character is the one in - * question, which we return in a different node type. - * The multi-char folds are a sequence, so the occurrence - * of the first character in that sequence doesn't - * necessarily mean that what follows is the rest of the - * sequence. We keep track of that with a state machine, - * with the state being set to the latest character - * processed before the current one. Most characters will - * set the state to 0, but if one occurs that is part of a - * potential tricky fold sequence, the state is set to that - * character, and the next loop iteration sees if the state - * should progress towards the final folded-from character, - * or if it was a false alarm. If it turns out to be a - * false alarm, the character(s) will be output in a new - * EXACTish node, and join_exact() will later combine them. - * In the case of the 'ss' sequence, which is more common - * and more easily checked, some look-ahead is done to - * save time by ruling-out some false alarms */ - switch (ender) { - default: - latest_char_state = generic_char; - break; - case 's': - case 'S': - case 0x17F: /* LATIN SMALL LETTER LONG S */ - if (AT_LEAST_UNI_SEMANTICS) { - if (latest_char_state == char_s) { /* 'ss' */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - else if (p < RExC_end) { - - /* Look-ahead at the next character. If it - * is also an s, we handle as a sharp s - * tricky regnode. */ - if (*p == 's' || *p == 'S') { - - /* But first flush anything in the - * EXACTish buffer */ - if (len != 0) { - p = oldp; - goto loopdone; - } - p++; /* Account for swallowing this - 's' up */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - /* Here, the next character is not a - * literal 's', but still could - * evaluate to one if part of a \o{}, - * \x or \OCTAL-DIGIT. The minimum - * length required for that is 4, eg - * \x53 or \123 */ - else if (*p == '\\' - && p < RExC_end - 4 - && (isDIGIT(*(p + 1)) - || *(p + 1) == 'x' - || *(p + 1) == 'o' )) - { - - /* Here, it could be an 's', too much - * bother to figure it out here. Flush - * the buffer if any; when come back - * here, set the state so know that the - * previous char was an 's' */ - if (len != 0) { - latest_char_state = generic_char; - p = oldp; - goto loopdone; - } - latest_char_state = char_s; - break; - } - } - } - - /* Here, can't be an 'ss' sequence, or at least not - * one that could fold to/from the sharp ss */ - latest_char_state = generic_char; - break; - case 0x03C5: /* First char in upsilon series */ - case 0x03A5: /* Also capital UPSILON, which folds to - 03C5, and hence exhibits the same - problem */ - if (p < RExC_end - 4) { /* Need >= 4 bytes left */ - latest_char_state = upsilon_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x03B9: /* First char in iota series */ - case 0x0399: /* Also capital IOTA */ - case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */ - case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds - to 3B9 */ - if (p < RExC_end - 4) { - latest_char_state = iota_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x0308: - if (latest_char_state == upsilon_1) { - latest_char_state = upsilon_2; - } - else if (latest_char_state == iota_1) { - latest_char_state = iota_2; - } - else { - latest_char_state = generic_char; - } - break; - case 0x301: - if (latest_char_state == upsilon_2) { - ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - else if (latest_char_state == iota_2) { - ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - latest_char_state = generic_char; - break; - - /* These are the tricky fold characters. Flush any - * buffer first. (When adding to this list, also should - * add them to fold_grind.t to make sure get tested) */ - case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: - case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: - case LATIN_SMALL_LETTER_SHARP_S: - case LATIN_CAPITAL_LETTER_SHARP_S: - case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */ - case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */ - if (len != 0) { - p = oldp; - goto loopdone; - } - /* FALL THROUGH */ - do_tricky: { - char* const oldregxend = RExC_end; - U8 tmpbuf[UTF8_MAXBYTES+1]; - - /* Here, we know we need to generate a special - * regnode, and 'ender' contains the tricky - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals - * with those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated - * but empty EXACT node that the ANYOFV node will - * replace, and point the parse to a buffer which - * we fill with the character we want the regclass - * code to think is being parsed */ - RExC_emit = orig_emit; - RExC_parse = (char *) tmpbuf; - if (UTF) { - U8 *d = uvchr_to_utf8(tmpbuf, ender); - *d = '\0'; - RExC_end = (char *) d; - } - else { /* ender above 255 already excluded */ - tmpbuf[0] = (U8) ender; - tmpbuf[1] = '\0'; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using - * utf8 */ + * utf8. If we start to fold non-UTF patterns, be sure to + * update join_exact() */ if (LOC && ender < 256) { if (UNI_IS_INVARIANT(ender)) { *tmpbuf = (U8) ender; @@ -9505,14 +10540,17 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + + /* tmpbuf has been constructed by us, so we + * know it is valid utf8 */ + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); s += unilen; @@ -9541,14 +10579,14 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); len += unilen; @@ -9795,91 +10833,136 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } -/* No locale test, and always Unicode semantics, no ignore-case differences */ -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ -ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '!'; \ - what = WORD; \ - break - -/* Like the above, but there are differences if we are in uni-8-bit or not, so - * there are two tests passed in, to use depending on that. There aren't any - * cases where the label is different from the name, so no need for that - * parameter. - * Sets 'what' to WORD which is the property name for non-bitmap code points; - * But, uses FOLD_WORD instead if /i has been selected, to allow a different - * property name */ -#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \ -ANYOF_##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - set_regclass_bit(pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - yesno = '+'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break; \ -case ANYOF_N##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (! TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - if (AT_LEAST_ASCII_RESTRICTED) { \ - for (value = 128; value < 256; value++) { \ - stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ - } \ - else { \ - /* For a non-ut8 target string with DEPENDS semantics, all above \ - * ASCII Latin1 code points match the complement of any of the \ - * classes. But in utf8, they have their Unicode semantics, so \ - * can't just set them in the bitmap, or else regexec.c will think \ - * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ - } \ - } \ - yesno = '!'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* First, resolve whether to use the ASCII-only list or the L1 \ + * list */ \ + DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \ + ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\ + Xpropertyname, run_time_list) + +#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \ + Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, sourcelist, &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } STATIC U8 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) @@ -10356,18 +11439,7 @@ parseit: /* Invert if asking for the complement */ if (value == 'P') { - - /* Add to any existing list */ - if (! properties) { - properties = invlist_clone(invlist); - _invlist_invert(properties); - } - else { - invlist = invlist_clone(invlist); - _invlist_invert(invlist); - _invlist_union(properties, invlist, &properties); - SvREFCNT_dec(invlist); - } + _invlist_union_complement_2nd(properties, invlist, &properties); /* The swash can't be used as-is, because we've * inverted things; delay removing it to here after @@ -10376,12 +11448,7 @@ parseit: swash = NULL; } else { - if (! properties) { - properties = invlist_clone(invlist); - } - else { - _invlist_union(properties, invlist, &properties); - } + _invlist_union(properties, invlist, &properties); } } Safefree(name); @@ -10525,85 +11592,212 @@ parseit: } if (!SIZE_ONLY) { - const char *what = NULL; - char yesno = 0; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { - - case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum"); - case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha"); - case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank"); - case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl"); - case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph"); - case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i"); - case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint"); - case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace"); - case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct"); - case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i"); - /* \s, \w match all unicode if utf8. */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word"); - case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit"); - case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); - case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { - for (value = 0; value < 128; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - yesno = '+'; - what = NULL; /* Doesn't match outside ascii, so - don't want to add +utf8:: */ + else { + _invlist_union(properties, PL_ASCII, &properties); + } break; case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { - for (value = 128; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - yesno = '!'; - what = "ASCII"; - break; + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + /* There are no digits in the Latin1 range outside of + * ASCII, so call the macro that doesn't have to resolve + * them */ + DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties, + PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_NDIGIT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding. It turns out that \h + * is just a synonym for XPosixBlank */ + _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching Cased there (which in the ASCII range + is the same as Alpha */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; + } + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - yesno = '+'; - what = "Digit"; break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); - for (value = '9' + 1; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; + } + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); } - yesno = '!'; - what = "Digit"; - if (AT_LEAST_ASCII_RESTRICTED ) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - break; + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; @@ -10611,10 +11805,6 @@ parseit: vFAIL("Invalid [::] class"); break; } - if (what && ! (AT_LEAST_ASCII_RESTRICTED)) { - /* Strings such as "+utf8::isWord\n" */ - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what); - } continue; } @@ -10695,7 +11885,7 @@ parseit: if (value > 255) { const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); const UV natvalue = NATIVE_TO_UNI(value); - nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); + nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); } #ifdef EBCDIC literal_endpoint = 0; @@ -10772,7 +11962,8 @@ parseit: U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; const UV f = - _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold); + _to_uni_fold_flags(j, foldbuf, &foldlen, + (allow_full_fold) ? FOLD_FLAGS_FULL : 0); if (foldlen > (STRLEN)UNISKIP(f)) { @@ -10952,13 +12143,14 @@ parseit: } } - /* Done with loop; set to not include any code points that - * are in the bitmap */ + /* Done with loop; remove any code points that are in the bitmap from + * */ if (change_invlist) { - SV* keep_list = _new_invlist(2); - _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX); - _invlist_intersection(nonbitmap, keep_list, &nonbitmap); - SvREFCNT_dec(keep_list); + _invlist_subtract(nonbitmap, + (DEPENDS_SEMANTICS) + ? PL_ASCII + : PL_Latin1, + &nonbitmap); } /* If have completely emptied it, remove it completely */ @@ -11045,6 +12237,12 @@ parseit: * there should not be overlap unless is /d rules. */ _invlist_invert(nonbitmap); + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + for (i = 0; i < 256; ++i) { if (ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_CLEAR(ret, i); @@ -11072,10 +12270,7 @@ parseit: else { /* There is no overlap for non-/d, so just delete anything * below 256 */ - SV* keep_list = _new_invlist(2); - _append_range_to_invlist(keep_list, 256, UV_MAX); - _invlist_intersection(nonbitmap, keep_list, &nonbitmap); - SvREFCNT_dec(keep_list); + _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); } } @@ -11257,7 +12452,6 @@ parseit: } return ret; } -#undef _C_C_T_ /* reg_skipcomment() @@ -11628,9 +12822,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, for (;;) { regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN - if (PL_regkind[OP(scan)] == EXACT) - if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) + if (PL_regkind[OP(scan)] == EXACT) { + bool has_exactf_sharp_s; /* Unexamined in this routine */ + if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) return EXACT; + } #endif if ( exact ) { switch (OP(scan)) { @@ -11638,6 +12834,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTF: case EXACTFA: case EXACTFU: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -11962,8 +13160,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -12224,6 +13420,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) SvREFCNT_dec(r->saved_copy); #endif Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); } /* reg_temp_copy() @@ -12287,6 +13484,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) ret->saved_copy = NULL; #endif ret->mother_re = rx; + SvREFCNT_inc_void(ret->qr_anoncv); return ret_x; } @@ -12329,16 +13527,21 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->u.offsets) Safefree(ri->u.offsets); /* 20010421 MJD */ #endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + if (ri->data) { int n = ri->data->count; - PAD* new_comppad = NULL; - PAD* old_comppad; - PADOFFSET refcnt; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ switch (ri->data->what[n]) { case 'a': + case 'r': case 's': case 'S': case 'u': @@ -12347,27 +13550,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'f': Safefree(ri->data->data[n]); break; - case 'p': - new_comppad = MUTABLE_AV(ri->data->data[n]); - break; - case 'o': - if (new_comppad == NULL) - Perl_croak(aTHX_ "panic: pregfree comppad"); - PAD_SAVE_LOCAL(old_comppad, - /* Watch out for global destruction's random ordering. */ - (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL - ); - OP_REFCNT_LOCK; - refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); - OP_REFCNT_UNLOCK; - if (!refcnt) - op_free((OP_4tree*)ri->data->data[n]); - - PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec(MUTABLE_SV(new_comppad)); - new_comppad = NULL; - break; - case 'n': + case 'l': + case 'L': break; case 'T': { /* Aho Corasick add-on structure for a trie node. @@ -12495,6 +13679,7 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) } RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); if (ret->pprivate) RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); @@ -12555,7 +13740,20 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); - + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; reti->regstclass = NULL; @@ -12572,12 +13770,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTua - see also regcomp.h and pregfree() */ + /* see also regcomp.h and regfree_internal() */ case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': case 's': case 'S': - case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; @@ -12588,13 +13785,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) struct regnode_charclass_class); reti->regstclass = (regnode*)d->data[i]; break; - case 'o': - /* Compiled op trees are readonly and in shared memory, - and can thus be shared without duplication. */ - OP_REFCNT_LOCK; - d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); - OP_REFCNT_UNLOCK; - break; case 'T': /* Trie stclasses are readonly and can thus be shared * without duplication. We free the stclass in pregfree @@ -12607,7 +13797,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; /* Fall through */ - case 'n': + case 'l': + case 'L': d->data[i] = ri->data->data[i]; break; default: @@ -12717,8 +13908,6 @@ Perl_save_re_context(pTHX) Copy(&PL_reg_state, state, 1, struct re_save_state); - PL_reg_start_tmp = 0; - PL_reg_start_tmpl = 0; PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; PL_reg_maxiter = 0; @@ -12960,8 +14149,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */