X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/86d6fcadb912bd04e5bb511a8188871eb12e4274..65c512c3519b68852f64f5e1293cebadee892115:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 02a6f75..b106cc1 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,6 +86,9 @@ #endif #include "dquote_static.c" +#ifndef PERL_IN_XSUB_RE +# include "charclass_invlists.h" +#endif #ifdef op #undef op @@ -379,9 +382,11 @@ static const scan_data_t zero_scan_data = #define SCF_SEEN_ACCEPT 0x8000 #define UTF cBOOL(RExC_utf8) + +/* The enums for all these are ordered so things work out correctly */ #define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET) -#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) +#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) #define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) @@ -1361,44 +1366,47 @@ is the recommended Unicode-aware way of saying *(d++) = uv; */ -#define TRIE_STORE_REVCHAR \ +#define TRIE_STORE_REVCHAR(val) \ STMT_START { \ if (UTF) { \ - SV *zlopp = newSV(2); \ + SV *zlopp = newSV(7); /* XXX: optimize me */ \ unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ - unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \ SvCUR_set(zlopp, kapow - flrbbbbb); \ SvPOK_on(zlopp); \ SvUTF8_on(zlopp); \ av_push(revcharmap, zlopp); \ } else { \ - char ooooff = (char)uvc; \ + char ooooff = (char)val; \ av_push(revcharmap, newSVpvn(&ooooff, 1)); \ } \ } STMT_END -#define TRIE_READ_CHAR STMT_START { \ - wordlen++; \ - if ( UTF ) { \ - if ( folder ) { \ - if ( foldlen > 0 ) { \ - uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - scan += len; \ - len = 0; \ - } else { \ - len = UTF8SKIP(uc);\ - uvc = to_utf8_fold( uc, foldbuf, &foldlen); \ - foldlen -= UNISKIP( uvc ); \ - scan = foldbuf + UNISKIP( uvc ); \ - } \ - } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - } \ - } else { \ - uvc = (U32)*uc; \ - len = 1; \ - } \ +#define TRIE_READ_CHAR STMT_START { \ + wordlen++; \ + if ( UTF ) { \ + /* if it is UTF then it is either already folded, or does not need folding */ \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \ + } \ + else if (folder == PL_fold_latin1) { \ + /* if we use this folder we have to obey unicode rules on latin-1 data */ \ + if ( foldlen > 0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + scan += len; \ + len = 0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP(uvc); \ + foldlen -= skiplen; \ + scan = foldbuf + skiplen; \ + } \ + } else { \ + /* raw data, will be folded later if needed */ \ + uvc = (U32)*uc; \ + len = 1; \ + } \ } STMT_END @@ -1517,10 +1525,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs switch (flags) { case EXACT: break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; - default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1529,7 +1539,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (!(UTF && folder)) + if (flags == EXACT) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -1589,6 +1599,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs const U8 * const e = uc + STR_LEN( noper ); STRLEN foldlen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ STRLEN chars = 0; @@ -1598,28 +1609,37 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->minlen= 0; continue; } - if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ - + if (OP( noper ) == EXACTFU_SS) { + /* false positives are ok, so just set this */ + TRIE_BITMAP_SET(trie,0xDF); + } + } for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; chars++; if ( uvc < 256 ) { + if ( folder ) { + U8 folded= folder[ (U8) uvc ]; + if ( !trie->charmap[ folded ] ) { + trie->charmap[ folded ]=( ++trie->uniquecharcount ); + TRIE_STORE_REVCHAR( folded ); + } + } if ( !trie->charmap[ uvc ] ) { trie->charmap[ uvc ]=( ++trie->uniquecharcount ); - if ( folder ) - trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR( uvc ); } if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie,uvc); + TRIE_BITMAP_SET(trie, uvc); /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); if ( !UTF ) { /* store first byte of utf8 representation of @@ -1642,17 +1662,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( !SvTRUE( *svpp ) ) { sv_setiv( *svpp, ++trie->uniquecharcount ); - TRIE_STORE_REVCHAR; + TRIE_STORE_REVCHAR(uvc); } } } if( cur == first ) { - trie->minlen=chars; - trie->maxlen=chars; + trie->minlen = chars; + trie->maxlen = chars; } else if (chars < trie->minlen) { - trie->minlen=chars; + trie->minlen = chars; } else if (chars > trie->maxlen) { - trie->maxlen=chars; + trie->maxlen = chars; + } + if (OP( noper ) == EXACTFU_SS) { + /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/ + if (trie->minlen > 1) + trie->minlen= 1; + } + if (OP( noper ) == EXACTFU_TRICKYFOLD) { + /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" + * - We assume that any such sequence might match a 2 byte string */ + if (trie->minlen > 2 ) + trie->minlen= 2; } } /* end first pass */ @@ -1725,6 +1756,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + STRLEN skiplen = 0; if (OP(noper) != NOTHING) { for ( ; uc < e ; uc += len ) { @@ -1925,8 +1957,10 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN foldlen = 0; /* required init */ U32 wordlen = 0; /* required init */ + STRLEN skiplen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + if ( OP(noper) != NOTHING ) { for ( ; uc < e ; uc += len ) { @@ -2505,13 +2539,116 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * these get optimized out + * + * If there are problematic code sequences, *min_subtract is set to the delta + * that the minimum size of the node can be less than its actual size. And, + * the node type of the result is changed to reflect that it contains these + * sequences. + * + * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF + * and contains LATIN SMALL LETTER SHARP S + * + * This is as good a place as any to discuss the design of handling these + * problematic sequences. It's been wrong in Perl for a very long time. There + * are three code points in Unicode whose folded lengths differ so much from + * the un-folded lengths that it causes problems for the optimizer and trie + * construction. Why only these are problematic, and not others where lengths + * also differ is something I (khw) do not understand. New versions of Unicode + * might add more such code points. Hopefully the logic in fold_grind.t that + * figures out what to test (in part by verifying that each size-combination + * gets tested) will catch any that do come along, so they can be added to the + * special handling below. The chances of new ones are actually rather small, + * as most, if not all, of the world's scripts that have casefolding have + * already been encoded by Unicode. Also, a number of Unicode's decisions were + * made to allow compatibility with pre-existing standards, and almost all of + * those have already been dealt with. These would otherwise be the most + * likely candidates for generating further tricky sequences. In other words, + * Unicode by itself is unlikely to add new ones unless it is for compatibility + * with pre-existing standards, and there aren't many of those left. + * + * The previous designs for dealing with these involved assigning a special + * node for them. This approach doesn't work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both these fold to "sss", but if the pattern is parsed to create a node of + * that would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss". + * + * There are a number of components to the approach (a lot of work for just + * three code points!): + * 1) This routine examines each EXACTFish node that could contain the + * problematic sequences. It returns in *min_subtract how much to + * subtract from the the actual length of the string to get a real minimum + * for one that could match it. This number is usually 0 except for the + * problematic sequences. This delta is used by the caller to adjust the + * min length of the match, and the delta between min and max, so that the + * optimizer doesn't reject these possibilities based on size constraints. + * 2) These sequences are not currently correctly handled by the trie code + * either, so it changes the joined node type to ops that are not handled + * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 3) This is sufficient for the two Greek sequences (described below), but + * the one involving the Sharp s (\xDF) needs more. The node type + * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" + * sequence in it. For non-UTF-8 patterns and strings, this is the only + * case where there is a possible fold length change. That means that a + * regular EXACTFU node without UTF-8 involvement doesn't have to concern + * itself with length changes, and so can be processed faster. regexec.c + * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 + * is pre-folded by regcomp.c. This saves effort in regex matching. + * However, probably mostly for historical reasons, the pre-folding isn't + * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL + * nodes, as what they fold to isn't known until runtime.) The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string + * are members of a fold-pair, and arrays are set up for all of them + * that quickly find the other member of the pair. It might actually + * be faster to pre-fold these, but it isn't currently done, except for + * the sharp s. Code elsewhere in this file makes sure that it gets + * folded to 'ss', even if the pattern isn't UTF-8. This avoids the + * issues described in the next item. + * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches + * 'ss' or not is not knowable at compile time. It will match iff the + * target string is in UTF-8, unlike the EXACTFU nodes, where it always + * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus + * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * described in item 3). An assumption that the optimizer part of + * regexec.c (probably unwittingly) makes is that a character in the + * pattern corresponds to at most a single character in the target string. + * (And I do mean character, and not byte here, unlike other parts of the + * documentation that have never been updated to account for multibyte + * Unicode.) This assumption is wrong only in this case, as all other + * cases are either 1-1 folds when no UTF-8 is involved; or is true by + * virtue of having this file pre-fold UTF-8 patterns. I'm + * reluctant to try to change this assumption, so instead the code punts. + * This routine examines EXACTF nodes for the sharp s, and returns a + * boolean indicating whether or not the node is an EXACTF node that + * contains a sharp s. When it is true, the caller sets a flag that later + * causes the optimizer in this file to not set values for the floating + * and fixed string lengths, and thus avoids the optimizer code in + * regexec.c that makes the invalid assumption. Thus, there is no + * optimization based on string lengths for EXACTF nodes that contain the + * sharp s. This only happens for /id rules (which means the pattern + * isn't in UTF-8). + */ -#define JOIN_EXACT(scan,min_change,flags) \ +#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2531,7 +2668,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); - + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge * EXACT ones that are mergeable to the current one. */ while (n @@ -2588,28 +2725,38 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - *min_change = 0; + *min_subtract = 0; + *has_exactf_sharp_s = FALSE; /* Here, all the adjacent mergeable EXACTish nodes have been merged. We * can now analyze for sequences of problematic code points. (Prior to * this final joining, sequences could have been split over boundaries, and - * hence missed). The sequences only happen in folding */ + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - char *s, *t; - char * s0 = STRING(scan); - char * const s_end = s0 + STR_LEN(scan); - - /* First we look at the sequences that can occur only in UTF-8 strings. - * The sequences are of length 6 */ - if (UTF && STR_LEN(scan) >= 6) { + U8 *s; + U8 * s0 = (U8*) STRING(scan); + U8 * const s_end = s0 + STR_LEN(scan); + + /* The below is perhaps overboard, but this allows us to save a test + * each time through the loop at the expense of a mask. This is + * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a + * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. + * This uses an exclusive 'or' to find that bit and then inverts it to + * form a mask, with just a single 0, in the bit position where 'S' and + * 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + /* One pass is made over the node's string looking for all the + * possibilities. to avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { - /* Two problematic code points in Unicode casefolding of EXACT - * nodes: + /* There are two problematic Greek code points in Unicode + * casefolding * * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS @@ -2629,37 +2776,124 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 * minimum length computation. (there are other code points that * also fold to these two sequences, but the delta is smaller) * - * What we'll do is to look for the tail four bytes, and then peek - * at the preceding two bytes to see whether we need to decrease - * the minimum length by four (six minus two). + * If these sequences are found, the minimum length is decreased by + * four (six minus two). * - * Thanks to the design of UTF-8, there cannot be false matches: - * A sequence of valid UTF-8 bytes cannot be a subsequence of - * another valid sequence of UTF-8 bytes. */ + * Similarly, 'ss' may match the single char and byte LATIN SMALL + * LETTER SHARP S. We decrease the min length by 1 for each + * occurrence of 'ss' found */ #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char U390_first_byte = '\xb4'; - const char U390_2nd_byte = '\x68'; - const char U3B0_first_byte = '\xb5'; - const char U3B0_2nd_byte = '\x46'; - const char tail[] = "\xaf\x49\xaf\x42"; +# define U390_first_byte 0xb4 + const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; +# define U3B0_first_byte 0xb5 + const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; #else - const char U390_first_byte = '\xce'; - const char U390_2nd_byte = '\xb9'; - const char U3B0_first_byte = '\xcf'; - const char U3B0_2nd_byte = '\x85'; - const char tail[] = "\xcc\x88\xcc\x81"; +# define U390_first_byte 0xce + const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; +# define U3B0_first_byte 0xcf + const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; #endif - const STRLEN tail_len = sizeof(tail) - 1; - for (s = s0 + 2; /* +2 is to skip the non-tail */ - s <= s_end - tail_len - && (t = ninstr(s, s_end, tail, tail + tail_len)); - s = t + tail_len) + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; + yields a net of 0 */ + /* Examine the string for one of the problematic sequences */ + for (s = s0; + s < s_end - 1; /* Can stop 1 before the end, as minimum length + * sequence we are looking for is 2 */ + s += UTF8SKIP(s)) { - if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte) - || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte)) - { - *min_change -= 4; + + /* Look for the first byte in each problematic sequence */ + switch (*s) { + /* We don't have to worry about other things that fold to + * 's' (such as the long s, U+017F), as all above-latin1 + * code points have been pre-folded */ + case 's': + case 'S': + + /* Current character is an 's' or 'S'. If next one is + * as well, we have the dreaded sequence */ + if (((*(s+1) & S_or_s_mask) == s_masked) + /* These two node types don't have special handling + * for 'ss' */ + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + *min_subtract += 1; + OP(scan) = EXACTFU_SS; + s++; /* No need to look at this character again */ + } + break; + + case U390_first_byte: + if (s_end - s >= len + + /* The 1's are because are skipping comparing the + * first byte */ + && memEQ(s + 1, U390_tail, len - 1)) + { + goto greek_sequence; + } + break; + + case U3B0_first_byte: + if (! (s_end - s >= len + && memEQ(s + 1, U3B0_tail, len - 1))) + { + break; + } + greek_sequence: + *min_subtract += 4; + + /* This can't currently be handled by trie's, so change + * the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this + * would have to be changed. If this node has already + * been changed to EXACTFU_SS in this loop, leave it as + * is. (I (khw) think it doesn't matter in regexec.c + * for UTF patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_TRICKYFOLD; + } + s += 6; /* We already know what this sequence is. Skip + the rest of it */ + break; + } + } + } + else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + + /* Here, the pattern is not UTF-8. We need to look only for the + * 'ss' sequence, and in the EXACTF case, the sharp s, which can be + * in the final position. Otherwise we can stop looking 1 byte + * earlier because have to find both the first and second 's' */ + const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + + for (s = s0; s < upper; s++) { + switch (*s) { + case 'S': + case 's': + if (s_end - s > 1 + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + *min_subtract += 1; + + /* EXACTF nodes need to know that the minimum + * length changed so that a sharp s in the string + * can match this ss in the pattern, but they + * remain EXACTF nodes, as they are not trie'able, + * so don't have to invent a new node type to + * exclude them from the trie code */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } + s++; + } + break; + case LATIN_SMALL_LETTER_SHARP_S: + if (OP(scan) == EXACTF) { + *has_exactf_sharp_s = TRUE; + } + break; } } } @@ -2778,11 +3012,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ - IV min_change = 0; + UV min_subtract = 0; /* How much to subtract from the minimum node + length to get a real minimum (because the + folded version may be shorter) */ + bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min_change,0); + + /* Its not clear to khw or hv why this is done here, and not in the + * clauses that deal with EXACT nodes. khw's guess is that it's + * because of a previous design */ + JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -2973,7 +3214,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, regnode *first = (regnode *)NULL; regnode *last = (regnode *)NULL; regnode *tail = scan; - U8 optype = 0; + U8 trietype = 0; U32 count=0; #ifdef DEBUGGING @@ -3004,32 +3245,59 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* - step through the branches, cur represents each - branch, noper is the first thing to be matched - as part of that branch and noper_next is the - regnext() of that node. if noper is an EXACT - and noper_next is the same as scan (our current - position in the regex) then the EXACT branch is - a possible optimization target. Once we have - two or more consecutive such branches we can - create a trie of the EXACT's contents and stich - it in place. If the sequence represents all of - the branches we eliminate the whole thing and - replace it with a single TRIE. If it is a - subsequence then we need to stitch it in. This - means the first branch has to remain, and needs - to be repointed at the item on the branch chain - following the last branch optimized. This could - be either a BRANCH, in which case the - subsequence is internal, or it could be the - item following the branch sequence in which - case the subsequence is at the end. + Step through the branches + cur represents each branch, + noper is the first thing to be matched as part of that branch + noper_next is the regnext() of that node. + + We normally handle a case like this /FOO[xyz]|BAR[pqr]/ + via a "jump trie" but we also support building with NOJUMPTRIE, + which restricts the trie logic to structures like /FOO|BAR/. + + If noper is a trieable nodetype then the branch is a possible optimization + target. If we are building under NOJUMPTRIE then we require that noper_next + is the same as scan (our current position in the regex program). + + Once we have two or more consecutive such branches we can create a + trie of the EXACT's contents and stitch it in place into the program. + + If the sequence represents all of the branches in the alternation we + replace the entire thing with a single TRIE node. + + Otherwise when it is a subsequence we need to stitch it in place and + replace only the relevant branches. This means the first branch has + to remain as it is used by the alternation logic, and its next pointer, + and needs to be repointed at the item on the branch chain following + the last branch we have optimized away. + + This could be either a BRANCH, in which case the subsequence is internal, + or it could be the item following the branch sequence in which case the + subsequence is at the end (which does not necessarily mean the first node + is the start of the alternation). + + TRIE_TYPE(X) is a define which maps the optype to a trietype. + + optype | trietype + ----------------+----------- + NOTHING | NOTHING + EXACT | EXACT + EXACTFU | EXACTFU + EXACTFU_SS | EXACTFU + EXACTFU_TRICKYFOLD | EXACTFU + EXACTFA | 0 + */ +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ + ( EXACT == (X) ) ? EXACT : \ + ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \ + 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { regnode * const noper = NEXTOPER( cur ); + U8 noper_type = OP( noper ); + U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); #endif @@ -3051,56 +3319,83 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); }); - if ( (((first && optype!=NOTHING) ? OP( noper ) == optype - : PL_regkind[ OP( noper ) ] == EXACT ) - || OP(noper) == NOTHING ) + + /* Is noper a trieable nodetype that can be merged with the + * current trie (if there is one)? */ + if ( noper_trietype + && + ( + /* XXX: Currently we cannot allow a NOTHING node to be the first element + * of a TRIEABLE sequence, Otherwise we will overwrite the regop following + * the NOTHING with the TRIE regop later on. This is because a NOTHING node + * is only one regnode wide, and a TRIE is two regnodes. An example of a + * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/ + * At a later point of time we can somewhat workaround this by handling + * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns, + * as we can effectively ignore the NOTHING regop in that case. + * This clause, which allows NOTHING to start a sequence is left commented + * out as a reference. + * - Yves + + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) + */ + ( noper_trietype == NOTHING && trietype ) + || ( trietype == noper_trietype ) + ) #ifdef NOJUMPTRIE && noper_next == tail #endif && count < U16_MAX) { + /* Handle mergable triable node + * Either we are the first node in a new trieable sequence, + * in which case we do some bookkeeping, otherwise we update + * the end pointer. */ count++; - if ( !first || optype == NOTHING ) { - if (!first) first = cur; - optype = OP( noper ); + if ( !first ) { + first = cur; + trietype = noper_trietype; } else { + if ( trietype == NOTHING ) + trietype = noper_trietype; last = cur; } - } else { -/* - Currently the trie logic handles case insensitive matching properly only - when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode - semantics). - - If/when this is fixed the following define can be swapped - in below to fully enable trie logic. - -#define TRIE_TYPE_IS_SAFE 1 - -*/ -#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) - - if ( last && TRIE_TYPE_IS_SAFE ) { - make_trie( pRExC_state, - startbranch, first, cur, tail, count, - optype, depth+1 ); + } /* end handle mergable triable node */ + else { + /* handle unmergable node - + * noper may either be a triable node which can not be tried + * together with the current trie, or a non triable node */ + if ( last ) { + /* If last is set and trietype is not NOTHING then we have found + * at least two triable branch sequences in a row of a similar + * trietype so we can turn them into a trie. If/when we + * allow NOTHING to start a trie sequence this condition will be + * required, and it isn't expensive so we leave it in for now. */ + if ( trietype != NOTHING ) + make_trie( pRExC_state, + startbranch, first, cur, tail, count, + trietype, depth+1 ); + last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */ } - if ( PL_regkind[ OP( noper ) ] == EXACT + if ( noper_trietype #ifdef NOJUMPTRIE && noper_next == tail #endif ){ + /* noper is triable, so we can start a new trie sequence */ count = 1; first = cur; - optype = OP( noper ); - } else { + trietype = noper_trietype; + } else if (first) { + /* if we already saw a first but the current node is not triable then we have + * to reset the first information. */ count = 0; first = NULL; - optype = 0; + trietype = 0; } - last = NULL; - } - } + } /* end handle unmergable node */ + } /* loop over branches */ DEBUG_OPTIMISE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, @@ -3108,9 +3403,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - - if ( last && TRIE_TYPE_IS_SAFE ) { - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); + if ( last && trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); #ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) @@ -3124,8 +3421,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } #endif - } - } + } /* end if ( last) */ + } /* TRIE_MAXBUF is non zero */ } /* do trie */ @@ -3198,8 +3495,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, UV uc; if (UTF) { const U8 * const s = (U8*)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } else { uc = *((U8*)STRING(scan)); } @@ -3293,21 +3590,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if (UTF) { const U8 * const s = (U8 *)STRING(scan); + uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); - uc = utf8_to_uvchr(s, NULL); } - min += l + min_change; + else if (has_exactf_sharp_s) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + min += l - min_subtract; if (min < 0) { min = 0; } - delta += abs(min_change); + delta += min_subtract; if (flags & SCF_DO_SUBSTR) { - data->pos_min += l + min_change; + data->pos_min += l - min_subtract; if (data->pos_min < 0) { data->pos_min = 0; } - data->pos_delta += abs(min_change); - if (min_change) { + data->pos_delta += min_subtract; + if (min_subtract) { data->longest = &(data->longest_float); } } @@ -3338,11 +3638,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Also set the other member of the fold pair. In case * that unicode semantics is called for at runtime, use * the full latin1 fold. (Can't do this for locale, - * because not known until runtime */ + * because not known until runtime) */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); - /* All folds except under /iaa that include s, S, and - * sharp_s also may include the others */ + /* All other (EXACTFL handled above) folds except under + * /iaa that include s, S, and sharp_s also may include + * the others */ if (OP(scan) != EXACTFA) { if (uc == 's' || uc == 'S') { ANYOF_BITMAP_SET(data->start_class, @@ -3795,18 +4096,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -4611,6 +4900,64 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) DEBUG_r(if (!PL_colorset) reginitcolors()); +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_ASCII = _new_invlist_C_array(ASCII_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist); + PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist); + + PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist); + PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist); + + PL_L1Cased = _new_invlist_C_array(L1Cased_invlist); + + PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist); + PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist); + + PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist); + + PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); + PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); + PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); + + PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist); + PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist); + + PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist); + PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist); + + PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist); + PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist); + + PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist); + PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist); + + PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist); + PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist); + + PL_VertSpace = _new_invlist_C_array(VertSpace_invlist); + + PL_PosixWord = _new_invlist_C_array(PosixWord_invlist); + PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist); + + PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + } +#endif + exp = SvPV(pattern, plen); if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ @@ -5122,9 +5469,11 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -5167,10 +5516,13 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -5933,7 +6285,19 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ #define INVLIST_ITER_OFFSET 1 /* Current iteration position */ -#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */ +/* This is a combination of a version and data structure type, so that one + * being passed in can be validated to be an inversion list of the correct + * vintage. When the structure of the header is changed, a new random number + * in the range 2**31-1 should be generated and the new() method changed to + * insert that at this location. Then, if an auxiliary program doesn't change + * correspondingly, it will be discovered immediately */ +#define INVLIST_VERSION_ID_OFFSET 2 +#define INVLIST_VERSION_ID 1064334010 + +/* For safety, when adding new elements, remember to #undef them at the end of + * the inversion list code section */ + +#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list * contains the code point U+00000, and begins here. If 1, the inversion list * doesn't contain U+0000, and it begins at the next UV in the array. @@ -6093,10 +6457,39 @@ Perl__new_invlist(pTHX_ IV initial_size) * properly */ *get_invlist_zero_addr(new_list) = UV_MAX; + *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; +#if HEADER_LENGTH != 4 +# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length +#endif + return new_list; } #endif +STATIC SV* +S__new_invlist_C_array(pTHX_ UV* list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands */ + + SV* invlist = newSV_type(SVt_PV); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + SvPV_set(invlist, (char *) list); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + + if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + return invlist; +} + STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) { @@ -6123,6 +6516,8 @@ S_invlist_trim(pTHX_ SV* const invlist) #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) +#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) + #ifndef PERL_IN_XSUB_RE void Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) @@ -6328,12 +6723,17 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } + void -Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * should be defined upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. + * the reference count to that list will be decremented. The first list, + * , may be NULL, in which case a copy of the second list is returned. + * If is TRUE, the union is taken of the complement + * (inversion) of instead of b itself. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6368,17 +6768,21 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_UNION; + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); /* If either one is empty, the union is the other one */ - len_a = invlist_len(a); - if (len_a == 0) { + if (a == NULL || ((len_a = invlist_len(a)) == 0)) { if (*output == a) { - SvREFCNT_dec(a); + if (a != NULL) { + SvREFCNT_dec(a); + } } if (*output != b) { *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } } /* else *output already = b; */ return; } @@ -6386,10 +6790,20 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) if (*output == b) { SvREFCNT_dec(b); } - if (*output != a) { - *output = invlist_clone(a); - } - /* else *output already = a; */ + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + SvREFCNT_dec(a); + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ return; } @@ -6397,6 +6811,31 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the union for the worst case: that the sets are completely * disjoint */ u = _new_invlist(len_a + len_b); @@ -6515,16 +6954,24 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) SvREFCNT_dec(*output); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *output = u; return; } void -Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * should be defined upon input, and if it points to one of the two lists, * the reference count to that list will be decremented. + * If is TRUE, the result will be the intersection of + * and the complement (or inversion) of instead of directly. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6555,22 +7002,38 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_INTERSECTION; + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the intersection is null */ + /* Special case if either one is empty */ len_a = invlist_len(a); if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { - /* If the result is the same as one of the inputs, the input is being - * overwritten */ + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + *i = invlist_clone(a); + + if (*i == b) { + SvREFCNT_dec(b); + } + } + /* else *i is already 'a' */ + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ if (*i == a) { SvREFCNT_dec(a); } else if (*i == b) { SvREFCNT_dec(b); } - *i = _new_invlist(0); return; } @@ -6579,6 +7042,31 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the intersection for the worst case: that the intersection ends up * fragmenting everything to be completely disjoint */ r= _new_invlist(len_a + len_b); @@ -6687,6 +7175,11 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) SvREFCNT_dec(*i); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *i = r; return; } @@ -6828,46 +7321,6 @@ S_invlist_clone(pTHX_ SV* const invlist) return new_invlist; } -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result) -{ - /* Point to an inversion list which consists of all elements in - * that aren't also in . *result should be defined upon input, and - * if it points to C its reference count will be decremented. */ - - PERL_ARGS_ASSERT__INVLIST_SUBTRACT; - assert(a != b); - - /* Subtracting nothing retains the original */ - if (invlist_len(b) == 0) { - - if (*result == b) { - SvREFCNT_dec(b); - } - - /* If the result is not to be the same variable as the original, create - * a copy */ - if (*result != a) { - *result = invlist_clone(a); - } - } else { - SV *b_copy = invlist_clone(b); - _invlist_invert(b_copy); /* Everything not in 'b' */ - - if (*result == b) { - SvREFCNT_dec(b); - } - - _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in - 'b' */ - SvREFCNT_dec(b_copy); - } - - return; -} -#endif - PERL_STATIC_INLINE UV* S_get_invlist_iter_addr(pTHX_ SV* invlist) { @@ -6879,6 +7332,16 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); } +PERL_STATIC_INLINE UV* +S_get_invlist_version_id_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the version id. */ + + PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); +} + PERL_STATIC_INLINE void S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ { @@ -6984,6 +7447,7 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) #undef INVLIST_LEN_OFFSET #undef INVLIST_ZERO_OFFSET #undef INVLIST_ITER_OFFSET +#undef INVLIST_VERSION_ID /* End of inversion list object */ @@ -7630,9 +8094,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; char has_charset_modifier = '\0'; - regex_charset cs = (RExC_utf8 || RExC_uni_semantics) - ? REGEX_UNICODE_CHARSET - : REGEX_DEPENDS_CHARSET; + regex_charset cs = get_regex_charset(RExC_flags); + if (cs == REGEX_DEPENDS_CHARSET + && (RExC_utf8 || RExC_uni_semantics)) + { + cs = REGEX_UNICODE_CHARSET; + } while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -9058,28 +9525,19 @@ tryagain: RExC_parse++; defchar: { - typedef enum { - generic_char = 0, - char_s, - upsilon_1, - upsilon_2, - iota_1, - iota_2, - } char_state; - char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; char *s; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - regnode * orig_emit; U8 node_type; + /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, + * it is folded to 'ss' even if not utf8 */ + bool is_exactfu_sharp_s; + ender = 0; - orig_emit = RExC_emit; /* Save the original output node position in - case we need to output a different node - type */ node_type = ((! FOLD) ? EXACT : (LOC) ? EXACTFL @@ -9304,223 +9762,16 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) - { - /* We look for either side of the fold. For example \xDF - * folds to 'ss'. We look for both the single character - * \xDF and the sequence 'ss'. When we find something that - * could be one of those, we stop and flush whatever we - * have output so far into the EXACTish node that was being - * built. Then restore the input pointer to what it was. - * regatom will return that EXACT node, and will be called - * again, positioned so the first character is the one in - * question, which we return in a different node type. - * The multi-char folds are a sequence, so the occurrence - * of the first character in that sequence doesn't - * necessarily mean that what follows is the rest of the - * sequence. We keep track of that with a state machine, - * with the state being set to the latest character - * processed before the current one. Most characters will - * set the state to 0, but if one occurs that is part of a - * potential tricky fold sequence, the state is set to that - * character, and the next loop iteration sees if the state - * should progress towards the final folded-from character, - * or if it was a false alarm. If it turns out to be a - * false alarm, the character(s) will be output in a new - * EXACTish node, and join_exact() will later combine them. - * In the case of the 'ss' sequence, which is more common - * and more easily checked, some look-ahead is done to - * save time by ruling-out some false alarms */ - switch (ender) { - default: - latest_char_state = generic_char; - break; - case 's': - case 'S': - case 0x17F: /* LATIN SMALL LETTER LONG S */ - if (AT_LEAST_UNI_SEMANTICS) { - if (latest_char_state == char_s) { /* 'ss' */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - else if (p < RExC_end) { - - /* Look-ahead at the next character. If it - * is also an s, we handle as a sharp s - * tricky regnode. */ - if (*p == 's' || *p == 'S') { - - /* But first flush anything in the - * EXACTish buffer */ - if (len != 0) { - p = oldp; - goto loopdone; - } - p++; /* Account for swallowing this - 's' up */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - /* Here, the next character is not a - * literal 's', but still could - * evaluate to one if part of a \o{}, - * \x or \OCTAL-DIGIT. The minimum - * length required for that is 4, eg - * \x53 or \123 */ - else if (*p == '\\' - && p < RExC_end - 4 - && (isDIGIT(*(p + 1)) - || *(p + 1) == 'x' - || *(p + 1) == 'o' )) - { - - /* Here, it could be an 's', too much - * bother to figure it out here. Flush - * the buffer if any; when come back - * here, set the state so know that the - * previous char was an 's' */ - if (len != 0) { - latest_char_state = generic_char; - p = oldp; - goto loopdone; - } - latest_char_state = char_s; - break; - } - } - } - - /* Here, can't be an 'ss' sequence, or at least not - * one that could fold to/from the sharp ss */ - latest_char_state = generic_char; - break; - case 0x03C5: /* First char in upsilon series */ - case 0x03A5: /* Also capital UPSILON, which folds to - 03C5, and hence exhibits the same - problem */ - if (p < RExC_end - 4) { /* Need >= 4 bytes left */ - latest_char_state = upsilon_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x03B9: /* First char in iota series */ - case 0x0399: /* Also capital IOTA */ - case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */ - case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds - to 3B9 */ - if (p < RExC_end - 4) { - latest_char_state = iota_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x0308: - if (latest_char_state == upsilon_1) { - latest_char_state = upsilon_2; - } - else if (latest_char_state == iota_1) { - latest_char_state = iota_2; - } - else { - latest_char_state = generic_char; - } - break; - case 0x301: - if (latest_char_state == upsilon_2) { - ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - else if (latest_char_state == iota_2) { - ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - latest_char_state = generic_char; - break; - - /* These are the tricky fold characters. Flush any - * buffer first. (When adding to this list, also should - * add them to fold_grind.t to make sure get tested) */ - case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: - case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: - case LATIN_SMALL_LETTER_SHARP_S: - case LATIN_CAPITAL_LETTER_SHARP_S: - case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */ - case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */ - if (len != 0) { - p = oldp; - goto loopdone; - } - /* FALL THROUGH */ - do_tricky: { - char* const oldregxend = RExC_end; - U8 tmpbuf[UTF8_MAXBYTES+1]; - - /* Here, we know we need to generate a special - * regnode, and 'ender' contains the tricky - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals - * with those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated - * but empty EXACT node that the ANYOFV node will - * replace, and point the parse to a buffer which - * we fill with the character we want the regclass - * code to think is being parsed */ - RExC_emit = orig_emit; - RExC_parse = (char *) tmpbuf; - if (UTF) { - U8 *d = uvchr_to_utf8(tmpbuf, ender); - *d = '\0'; - RExC_end = (char *) d; - } - else { /* ender above 255 already excluded */ - tmpbuf[0] = (U8) ender; - tmpbuf[1] = '\0'; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using - * utf8 */ + * utf8. If we start to fold non-UTF patterns, be sure to + * update join_exact() */ if (LOC && ender < 256) { if (UNI_IS_INVARIANT(ender)) { *tmpbuf = (U8) ender; @@ -9579,14 +9830,17 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + + /* tmpbuf has been constructed by us, so we + * know it is valid utf8 */ + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); s += unilen; @@ -9615,14 +9869,14 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; for (foldbuf = tmpbuf; foldlen; foldlen -= numlen) { - ender = utf8_to_uvchr(foldbuf, &numlen); + ender = valid_utf8_to_uvchr(foldbuf, &numlen); if (numlen > 0) { const STRLEN unilen = reguni(pRExC_state, ender, s); len += unilen; @@ -9869,91 +10123,132 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } -/* No locale test, and always Unicode semantics, no ignore-case differences */ -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ -ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '!'; \ - what = WORD; \ - break - -/* Like the above, but there are differences if we are in uni-8-bit or not, so - * there are two tests passed in, to use depending on that. There aren't any - * cases where the label is different from the name, so no need for that - * parameter. - * Sets 'what' to WORD which is the property name for non-bitmap code points; - * But, uses FOLD_WORD instead if /i has been selected, to allow a different - * property name */ -#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \ -ANYOF_##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - set_regclass_bit(pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - yesno = '+'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break; \ -case ANYOF_N##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (! TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - if (AT_LEAST_ASCII_RESTRICTED) { \ - for (value = 128; value < 256; value++) { \ - stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ - } \ - else { \ - /* For a non-ut8 target string with DEPENDS semantics, all above \ - * ASCII Latin1 code points match the complement of any of the \ - * classes. But in utf8, they have their Unicode semantics, so \ - * can't just set them in the bitmap, or else regexec.c will think \ - * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ - } \ - } \ - yesno = '!'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : l1_sourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } STATIC U8 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) @@ -10430,18 +10725,7 @@ parseit: /* Invert if asking for the complement */ if (value == 'P') { - - /* Add to any existing list */ - if (! properties) { - properties = invlist_clone(invlist); - _invlist_invert(properties); - } - else { - invlist = invlist_clone(invlist); - _invlist_invert(invlist); - _invlist_union(properties, invlist, &properties); - SvREFCNT_dec(invlist); - } + _invlist_union_complement_2nd(properties, invlist, &properties); /* The swash can't be used as-is, because we've * inverted things; delay removing it to here after @@ -10450,12 +10734,7 @@ parseit: swash = NULL; } else { - if (! properties) { - properties = invlist_clone(invlist); - } - else { - _invlist_union(properties, invlist, &properties); - } + _invlist_union(properties, invlist, &properties); } } Safefree(name); @@ -10599,85 +10878,211 @@ parseit: } if (!SIZE_ONLY) { - const char *what = NULL; - char yesno = 0; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { - - case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum"); - case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha"); - case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank"); - case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl"); - case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph"); - case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i"); - case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint"); - case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace"); - case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct"); - case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i"); - /* \s, \w match all unicode if utf8. */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word"); - case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit"); - case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); - case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { - for (value = 0; value < 128; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - yesno = '+'; - what = NULL; /* Doesn't match outside ascii, so - don't want to add +utf8:: */ + else { + _invlist_union(properties, PL_ASCII, &properties); + } break; case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { - for (value = 128; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - yesno = '!'; - what = "ASCII"; - break; + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + /* Ignore the compiler warning for this macro, planned to + * be eliminated later */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_NDIGIT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding. It turns out that \h + * is just a synonym for XPosixBlank */ + _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching Cased there (which in the ASCII range + is the same as Alpha */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; + } + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - yesno = '+'; - what = "Digit"; break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); - for (value = '9' + 1; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; } - yesno = '!'; - what = "Digit"; - if (AT_LEAST_ASCII_RESTRICTED ) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); } - break; + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; @@ -10685,10 +11090,6 @@ parseit: vFAIL("Invalid [::] class"); break; } - if (what && ! (AT_LEAST_ASCII_RESTRICTED)) { - /* Strings such as "+utf8::isWord\n" */ - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what); - } continue; } @@ -11026,13 +11427,14 @@ parseit: } } - /* Done with loop; set to not include any code points that - * are in the bitmap */ + /* Done with loop; remove any code points that are in the bitmap from + * */ if (change_invlist) { - SV* keep_list = _new_invlist(2); - _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX); - _invlist_intersection(nonbitmap, keep_list, &nonbitmap); - SvREFCNT_dec(keep_list); + _invlist_subtract(nonbitmap, + (DEPENDS_SEMANTICS) + ? PL_ASCII + : PL_Latin1, + &nonbitmap); } /* If have completely emptied it, remove it completely */ @@ -11119,6 +11521,12 @@ parseit: * there should not be overlap unless is /d rules. */ _invlist_invert(nonbitmap); + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + for (i = 0; i < 256; ++i) { if (ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_CLEAR(ret, i); @@ -11146,10 +11554,7 @@ parseit: else { /* There is no overlap for non-/d, so just delete anything * below 256 */ - SV* keep_list = _new_invlist(2); - _append_range_to_invlist(keep_list, 256, UV_MAX); - _invlist_intersection(nonbitmap, keep_list, &nonbitmap); - SvREFCNT_dec(keep_list); + _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); } } @@ -11331,7 +11736,6 @@ parseit: } return ret; } -#undef _C_C_T_ /* reg_skipcomment() @@ -11702,9 +12106,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, for (;;) { regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN - if (PL_regkind[OP(scan)] == EXACT) - if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) + if (PL_regkind[OP(scan)] == EXACT) { + bool has_exactf_sharp_s; /* Unexamined in this routine */ + if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) return EXACT; + } #endif if ( exact ) { switch (OP(scan)) { @@ -11713,7 +12119,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTFA: case EXACTFU: case EXACTFU_SS: - case EXACTFU_NO_TRIE: + case EXACTFU_TRICKYFOLD: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -12038,8 +12444,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o);