X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/88d45d285bee142b4f37f62d7260e24504d371e5..370b8f2f82c087e5659f68c3d8b8bb963d23a63f:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 99668c3..dd5a37c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,6 +86,9 @@ #endif #include "dquote_static.c" +#ifndef PERL_IN_XSUB_RE +# include "charclass_invlists.h" +#endif #ifdef op #undef op @@ -689,7 +692,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); else data->flags &= ~SF_FIX_BEFORE_EOL; - data->minlen_fixed=minlenp; + data->minlen_fixed=minlenp; data->lookbehind_fixed=0; } else { /* *data->longest == data->longest_float */ @@ -1515,10 +1518,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif switch (flags) { + case EXACT: break; case EXACTFA: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1705,7 +1710,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); - + trie->states = (reg_trie_state *) PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, sizeof(reg_trie_state) ); @@ -2503,15 +2508,116 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * these get optimized out + * + * If there are problematic code sequences, *min_subtract is set to the delta + * that the minimum size of the node can be less than its actual size. And, + * the node type of the result is changed to reflect that it contains these + * sequences. + * + * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF + * and contains LATIN SMALL LETTER SHARP S + * + * This is as good a place as any to discuss the design of handling these + * problematic sequences. It's been wrong in Perl for a very long time. There + * are three code points in Unicode whose folded lengths differ so much from + * the un-folded lengths that it causes problems for the optimizer and trie + * construction. Why only these are problematic, and not others where lengths + * also differ is something I (khw) do not understand. New versions of Unicode + * might add more such code points. Hopefully the logic in fold_grind.t that + * figures out what to test (in part by verifying that each size-combination + * gets tested) will catch any that do come along, so they can be added to the + * special handling below. The chances of new ones are actually rather small, + * as most, if not all, of the world's scripts that have casefolding have + * already been encoded by Unicode. Also, a number of Unicode's decisions were + * made to allow compatibility with pre-existing standards, and almost all of + * those have already been dealt with. These would otherwise be the most + * likely candidates for generating further tricky sequences. In other words, + * Unicode by itself is unlikely to add new ones unless it is for compatibility + * with pre-existing standards, and there aren't many of those left. + * + * The previous designs for dealing with these involved assigning a special + * node for them. This approach doesn't work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both these fold to "sss", but if the pattern is parsed to create a node of + * that would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss". + * + * There are a number of components to the approach (a lot of work for just + * three code points!): + * 1) This routine examines each EXACTFish node that could contain the + * problematic sequences. It returns in *min_subtract how much to + * subtract from the the actual length of the string to get a real minimum + * for one that could match it. This number is usually 0 except for the + * problematic sequences. This delta is used by the caller to adjust the + * min length of the match, and the delta between min and max, so that the + * optimizer doesn't reject these possibilities based on size constraints. + * 2) These sequences are not currently correctly handled by the trie code + * either, so it changes the joined node type to ops that are not handled + * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE. + * 3) This is sufficient for the two Greek sequences (described below), but + * the one involving the Sharp s (\xDF) needs more. The node type + * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" + * sequence in it. For non-UTF-8 patterns and strings, this is the only + * case where there is a possible fold length change. That means that a + * regular EXACTFU node without UTF-8 involvement doesn't have to concern + * itself with length changes, and so can be processed faster. regexec.c + * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 + * is pre-folded by regcomp.c. This saves effort in regex matching. + * However, probably mostly for historical reasons, the pre-folding isn't + * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL + * nodes, as what they fold to isn't known until runtime.) The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string + * are members of a fold-pair, and arrays are set up for all of them + * that quickly find the other member of the pair. It might actually + * be faster to pre-fold these, but it isn't currently done, except for + * the sharp s. Code elsewhere in this file makes sure that it gets + * folded to 'ss', even if the pattern isn't UTF-8. This avoids the + * issues described in the next item. + * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches + * 'ss' or not is not knowable at compile time. It will match iff the + * target string is in UTF-8, unlike the EXACTFU nodes, where it always + * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus + * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * described in item 3). An assumption that the optimizer part of + * regexec.c (probably unwittingly) makes is that a character in the + * pattern corresponds to at most a single character in the target string. + * (And I do mean character, and not byte here, unlike other parts of the + * documentation that have never been updated to account for multibyte + * Unicode.) This assumption is wrong only in this case, as all other + * cases are either 1-1 folds when no UTF-8 is involved; or is true by + * virtue of having this file pre-fold UTF-8 patterns. I'm + * reluctant to try to change this assumption, so instead the code punts. + * This routine examines EXACTF nodes for the sharp s, and returns a + * boolean indicating whether or not the node is an EXACTF node that + * contains a sharp s. When it is true, the caller sets a flag that later + * causes the optimizer in this file to not set values for the floating + * and fixed string lengths, and thus avoids the optimizer code in + * regexec.c that makes the invalid assumption. Thus, there is no + * optimization based on string lengths for EXACTF nodes that contain the + * sharp s. This only happens for /id rules (which means the pattern + * isn't in UTF-8). + */ - - -#define JOIN_EXACT(scan,min,flags) \ +#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2531,13 +2637,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); - - /* Skip NOTHING, merge EXACT*. */ - while (n && - ( PL_regkind[OP(n)] == NOTHING || - (stringok && (OP(n) == OP(scan)))) + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { if (OP(n) == TAIL || n > next) stringok = 0; @@ -2554,12 +2662,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags else if (stringok) { const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); + + if (oldl + STR_LEN(n) > U8_MAX) + break; DEBUG_PEEP("merg",n,depth); - merged++; - if (oldl + STR_LEN(n) > U8_MAX) - break; + NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); @@ -2585,67 +2694,180 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; + *min_subtract = 0; + *has_exactf_sharp_s = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8 *s; + U8 * s0 = (U8*) STRING(scan); + U8 * const s_end = s0 + STR_LEN(scan); + + /* The below is perhaps overboard, but this allows us to save a test + * each time through the loop at the expense of a mask. This is + * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a + * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. + * This uses an exclusive 'or' to find that bit and then inverts it to + * form a mask, with just a single 0, in the bit position where 'S' and + * 's' differ. */ + const U8 S_or_s_mask = ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + /* One pass is made over the node's string looking for all the + * possibilities. to avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + + /* There are two problematic Greek code points in Unicode + * casefolding + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * If these sequences are found, the minimum length is decreased by + * four (six minus two). + * + * Similarly, 'ss' may match the single char and byte LATIN SMALL + * LETTER SHARP S. We decrease the min length by 1 for each + * occurrence of 'ss' found */ + #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; +# define U390_first_byte 0xb4 + const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; +# define U3B0_first_byte 0xb5 + const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; #else - const char t0[] = "\xcc\x88\xcc\x81"; +# define U390_first_byte 0xce + const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; +# define U3B0_first_byte 0xcf + const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; #endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) -#else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) -#endif - *min -= 4; - } + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; + yields a net of 0 */ + /* Examine the string for one of the problematic sequences */ + for (s = s0; + s < s_end - 1; /* Can stop 1 before the end, as minimum length + * sequence we are looking for is 2 */ + s += UTF8SKIP(s)) + { + + /* Look for the first byte in each problematic sequence */ + switch (*s) { + /* We don't have to worry about other things that fold to + * 's' (such as the long s, U+017F), as all above-latin1 + * code points have been pre-folded */ + case 's': + case 'S': + + /* Current character is an 's' or 'S'. If next one is + * as well, we have the dreaded sequence */ + if (((*(s+1) & S_or_s_mask) == s_masked) + /* These two node types don't have special handling + * for 'ss' */ + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + *min_subtract += 1; + OP(scan) = EXACTFU_SS; + s++; /* No need to look at this character again */ + } + break; + + case U390_first_byte: + if (s_end - s >= len + + /* The 1's are because are skipping comparing the + * first byte */ + && memEQ(s + 1, U390_tail, len - 1)) + { + goto greek_sequence; + } + break; + + case U3B0_first_byte: + if (! (s_end - s >= len + && memEQ(s + 1, U3B0_tail, len - 1))) + { + break; + } + greek_sequence: + *min_subtract += 4; + + /* This can't currently be handled by trie's, so change + * the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this + * would have to be changed. If this node has already + * been changed to EXACTFU_SS in this loop, leave it as + * is. (I (khw) think it doesn't matter in regexec.c + * for UTF patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_NO_TRIE; + } + s += 6; /* We already know what this sequence is. Skip + the rest of it */ + break; + } + } + } + else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + + /* Here, the pattern is not UTF-8. We need to look only for the + * 'ss' sequence, and in the EXACTF case, the sharp s, which can be + * in the final position. Otherwise we can stop looking 1 byte + * earlier because have to find both the first and second 's' */ + const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + + for (s = s0; s < upper; s++) { + switch (*s) { + case 'S': + case 's': + if (s_end - s > 1 + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + *min_subtract += 1; + + /* EXACTF nodes need to know that the minimum + * length changed so that a sharp s in the string + * can match this ss in the pattern, but they + * remain EXACTF nodes, as they are not trie'able, + * so don't have to invent a new node type to + * exclude them from the trie code */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } + s++; + } + break; + case LATIN_SMALL_LETTER_SHARP_S: + if (OP(scan) == EXACTF) { + *has_exactf_sharp_s = TRUE; + } + break; + } + } + } } - + #ifdef DEBUGGING /* Allow dumping but overwriting the collection of skipped * ops and/or strings with fake optimized ops */ @@ -2759,10 +2981,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How much to subtract from the minimum node + length to get a real minimum (because the + folded version may be shorter) */ + bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); + + /* Its not clear to khw or hv why this is done here, and not in the + * clauses that deal with EXACT nodes. khw's guess is that it's + * because of a previous design */ + JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -2774,7 +3004,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); int noff; regnode *n = scan; - + /* Skip NOTHING and LONGJMP. */ while ((n = regnext(n)) && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) @@ -2796,7 +3026,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next = regnext(scan); code = OP(scan); /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ - + if (OP(next) == code || code == IFTHEN) { /* NOTE - There is similar code to this block below for handling TRIE nodes on a re-study. If you change stuff here check there @@ -2804,7 +3034,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 max1 = 0, min1 = I32_MAX, num = 0; struct regnode_charclass_class accum; regnode * const startbranch=scan; - + if (flags & SCF_DO_SUBSTR) SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) @@ -2941,7 +3171,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, a nested if into a case structure of sorts. */ - + int made=0; if (!re_trie_maxbuff) { re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); @@ -3057,6 +3287,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #define TRIE_TYPE_IS_SAFE 1 +Note that join_exact() assumes that the other types of EXACTFish nodes are not +used in tries, so that would have to be updated if this changed + */ #define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) @@ -3091,7 +3324,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); -#ifdef TRIE_STUDY_OPT +#ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) || ( first_non_open == first )) && @@ -3276,9 +3509,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l = utf8_length(s, s + l); uc = utf8_to_uvchr(s, NULL); } - min += l; - if (flags & SCF_DO_SUBSTR) - data->pos_min += l; + else if (has_exactf_sharp_s) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + min += l - min_subtract; + if (min < 0) { + min = 0; + } + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; @@ -3306,8 +3554,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Also set the other member of the fold pair. In case * that unicode semantics is called for at runtime, use * the full latin1 fold. (Can't do this for locale, - * because not known until runtime */ + * because not known until runtime) */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All other (EXACTFL handled above) folds except under + * /iaa that include s, S, and sharp_s also may include + * the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } else if (uc >= 0x100) { @@ -3332,6 +3594,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * run-time */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All folds except under /iaa that include s, S, + * and sharp_s also may include the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } data->start_class->flags &= ~ANYOF_EOS; @@ -3737,18 +4012,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -3982,7 +4245,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; CASE_SYNST_FNC(VERTWS); CASE_SYNST_FNC(HORIZWS); - + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -4363,7 +4626,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ -#endif /* TRIE_STUDY_OPT */ +#endif /* TRIE_STUDY_OPT */ /* Else: zero-length, ignore. */ scan = regnext(scan); @@ -4553,6 +4816,64 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) DEBUG_r(if (!PL_colorset) reginitcolors()); +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_ASCII = _new_invlist_C_array(ASCII_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist); + PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist); + + PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist); + PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist); + + PL_L1Cased = _new_invlist_C_array(L1Cased_invlist); + + PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist); + PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist); + + PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist); + + PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); + PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); + PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); + + PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist); + PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist); + + PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist); + PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist); + + PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist); + PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist); + + PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist); + PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist); + + PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist); + PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist); + + PL_VertSpace = _new_invlist_C_array(VertSpace_invlist); + + PL_PosixWord = _new_invlist_C_array(PosixWord_invlist); + PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist); + + PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + } +#endif + exp = SvPV(pattern, plen); if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ @@ -4919,7 +5240,7 @@ reStudy: sawplus = 1; else first += regarglen[OP(first)]; - + first = NEXTOPER(first); first_next= regnext(first); } @@ -4934,7 +5255,7 @@ reStudy: else ri->regstclass = first; } -#ifdef TRIE_STCLASS +#ifdef TRIE_STCLASS else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { @@ -4955,7 +5276,7 @@ reStudy: make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); ri->regstclass = trie_op; } -#endif +#endif else if (REGNODE_SIMPLE(OP(first))) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOUND || @@ -5021,7 +5342,7 @@ reStudy: * it happens that c_offset_min has been invalidated, since the * earlier string may buy us something the later one won't.] */ - + data.longest_fixed = newSVpvs(""); data.longest_float = newSVpvs(""); data.last_found = newSVpvs(""); @@ -5039,7 +5360,7 @@ reStudy: &data, -1, NULL, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); - + CHECK_RESTUDY_GOTO; @@ -5064,9 +5385,11 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -5109,10 +5432,13 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -5207,7 +5533,7 @@ reStudy: I32 fake; struct regnode_charclass_class ch_class; I32 last_close = 0; - + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; @@ -5778,7 +6104,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) return sv_dat; } else { - Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); } /* NOT REACHED */ } @@ -5874,7 +6201,19 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ #define INVLIST_ITER_OFFSET 1 /* Current iteration position */ -#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */ +/* This is a combination of a version and data structure type, so that one + * being passed in can be validated to be an inversion list of the correct + * vintage. When the structure of the header is changed, a new random number + * in the range 2**31-1 should be generated and the new() method changed to + * insert that at this location. Then, if an auxiliary program doesn't change + * correspondingly, it will be discovered immediately */ +#define INVLIST_VERSION_ID_OFFSET 2 +#define INVLIST_VERSION_ID 1064334010 + +/* For safety, when adding new elements, remember to #undef them at the end of + * the inversion list code section */ + +#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ /* The UV at position ZERO contains either 0 or 1. If 0, the inversion list * contains the code point U+00000, and begins here. If 1, the inversion list * doesn't contain U+0000, and it begins at the next UV in the array. @@ -6034,10 +6373,39 @@ Perl__new_invlist(pTHX_ IV initial_size) * properly */ *get_invlist_zero_addr(new_list) = UV_MAX; + *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; +#if HEADER_LENGTH != 4 +# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length +#endif + return new_list; } #endif +STATIC SV* +S__new_invlist_C_array(pTHX_ UV* list) +{ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands */ + + SV* invlist = newSV_type(SVt_PV); + + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; + + SvPV_set(invlist, (char *) list); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + + if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); + } + + return invlist; +} + STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) { @@ -6064,6 +6432,8 @@ S_invlist_trim(pTHX_ SV* const invlist) #define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) #define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) +#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) + #ifndef PERL_IN_XSUB_RE void Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) @@ -6093,7 +6463,9 @@ Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV if (array[final_element] > start || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { - Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list"); + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } /* Here, it is a legal append. If the new range begins with the first @@ -6267,12 +6639,17 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } + void -Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { /* Take the union of two inversion lists and point to it. *output * should be defined upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented. + * the reference count to that list will be decremented. The first list, + * , may be NULL, in which case a copy of the second list is returned. + * If is TRUE, the union is taken of the complement + * (inversion) of instead of b itself. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6307,17 +6684,21 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_UNION; + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); /* If either one is empty, the union is the other one */ - len_a = invlist_len(a); - if (len_a == 0) { + if (a == NULL || ((len_a = invlist_len(a)) == 0)) { if (*output == a) { - SvREFCNT_dec(a); + if (a != NULL) { + SvREFCNT_dec(a); + } } if (*output != b) { *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } } /* else *output already = b; */ return; } @@ -6325,10 +6706,20 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) if (*output == b) { SvREFCNT_dec(b); } - if (*output != a) { - *output = invlist_clone(a); - } - /* else *output already = a; */ + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + SvREFCNT_dec(a); + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ return; } @@ -6336,6 +6727,31 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the union for the worst case: that the sets are completely * disjoint */ u = _new_invlist(len_a + len_b); @@ -6454,16 +6870,24 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output) SvREFCNT_dec(*output); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *output = u; return; } void -Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) { /* Take the intersection of two inversion lists and point to it. *i * should be defined upon input, and if it points to one of the two lists, * the reference count to that list will be decremented. + * If is TRUE, the result will be the intersection of + * and the complement (or inversion) of instead of directly. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -6494,22 +6918,38 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) */ UV count = 0; - PERL_ARGS_ASSERT__INVLIST_INTERSECTION; + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the intersection is null */ + /* Special case if either one is empty */ len_a = invlist_len(a); if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { - /* If the result is the same as one of the inputs, the input is being - * overwritten */ + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + *i = invlist_clone(a); + + if (*i == b) { + SvREFCNT_dec(b); + } + } + /* else *i is already 'a' */ + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ if (*i == a) { SvREFCNT_dec(a); } else if (*i == b) { SvREFCNT_dec(b); } - *i = _new_invlist(0); return; } @@ -6518,6 +6958,31 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) array_a = invlist_array(a); array_b = invlist_array(b); + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + /* Size the intersection for the worst case: that the intersection ends up * fragmenting everything to be completely disjoint */ r= _new_invlist(len_a + len_b); @@ -6626,6 +7091,11 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i) SvREFCNT_dec(*i); } + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + *i = r; return; } @@ -6767,46 +7237,6 @@ S_invlist_clone(pTHX_ SV* const invlist) return new_invlist; } -#ifndef PERL_IN_XSUB_RE -void -Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result) -{ - /* Point to an inversion list which consists of all elements in - * that aren't also in . *result should be defined upon input, and - * if it points to C its reference count will be decremented. */ - - PERL_ARGS_ASSERT__INVLIST_SUBTRACT; - assert(a != b); - - /* Subtracting nothing retains the original */ - if (invlist_len(b) == 0) { - - if (*result == b) { - SvREFCNT_dec(b); - } - - /* If the result is not to be the same variable as the original, create - * a copy */ - if (*result != a) { - *result = invlist_clone(a); - } - } else { - SV *b_copy = invlist_clone(b); - _invlist_invert(b_copy); /* Everything not in 'b' */ - - if (*result == b) { - SvREFCNT_dec(b); - } - - _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in - 'b' */ - SvREFCNT_dec(b_copy); - } - - return; -} -#endif - PERL_STATIC_INLINE UV* S_get_invlist_iter_addr(pTHX_ SV* invlist) { @@ -6818,6 +7248,16 @@ S_get_invlist_iter_addr(pTHX_ SV* invlist) return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); } +PERL_STATIC_INLINE UV* +S_get_invlist_version_id_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the version id. */ + + PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); +} + PERL_STATIC_INLINE void S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ { @@ -6923,6 +7363,7 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) #undef INVLIST_LEN_OFFSET #undef INVLIST_ZERO_OFFSET #undef INVLIST_ITER_OFFSET +#undef INVLIST_VERSION_ID /* End of inversion list object */ @@ -7305,7 +7746,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; if (*RExC_parse!=')') vFAIL("Expecting close bracket"); - + gen_recurse_regop: if ( paren == '-' ) { /* @@ -7382,7 +7823,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; } if (*RExC_parse != ')') { - RExC_parse = s; + RExC_parse = s; vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); } if (!SIZE_ONLY) { @@ -7440,7 +7881,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) || RExC_parse[1] == '<' || RExC_parse[1] == '{') { /* Lookahead or eval. */ I32 flag; - + ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; @@ -7942,7 +8383,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) Set_Node_Length(ret, 1); } } - + if (!first && SIZE_ONLY) RExC_extralen += 1; /* BRANCHJ */ @@ -8812,7 +9253,7 @@ tryagain: break; case 'p': case 'P': - { + { char* const oldregxend = RExC_end; #ifdef DEBUGGING char* parse_start = RExC_parse - 2; @@ -8997,41 +9438,51 @@ tryagain: RExC_parse++; defchar: { - typedef enum { - generic_char = 0, - char_s, - upsilon_1, - upsilon_2, - iota_1, - iota_2, - } char_state; - char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; char *s; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - regnode * orig_emit; + U8 node_type; + + /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, + * it is folded to 'ss' even if not utf8 */ + bool is_exactfu_sharp_s; ender = 0; - orig_emit = RExC_emit; /* Save the original output node position in - case we need to output a different node - type */ - ret = reg_node(pRExC_state, - (U8) ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF) - ); + node_type = ((! FOLD) ? EXACT + : (LOC) + ? EXACTFL + : (MORE_ASCII_RESTRICTED) + ? EXACTFA + : (AT_LEAST_UNI_SEMANTICS) + ? EXACTFU + : EXACTF); + ret = reg_node(pRExC_state, node_type); s = STRING(ret); + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; - len++) + len < 127 && p < RExC_end; + len++) { char * const oldp = p; @@ -9139,7 +9590,7 @@ tryagain: case 'x': if (*++p == '{') { char* const e = strchr(p, '}'); - + if (!e) { RExC_parse = p + 1; vFAIL("Missing right brace on \\x{}"); @@ -9224,223 +9675,16 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) - { - /* We look for either side of the fold. For example \xDF - * folds to 'ss'. We look for both the single character - * \xDF and the sequence 'ss'. When we find something that - * could be one of those, we stop and flush whatever we - * have output so far into the EXACTish node that was being - * built. Then restore the input pointer to what it was. - * regatom will return that EXACT node, and will be called - * again, positioned so the first character is the one in - * question, which we return in a different node type. - * The multi-char folds are a sequence, so the occurrence - * of the first character in that sequence doesn't - * necessarily mean that what follows is the rest of the - * sequence. We keep track of that with a state machine, - * with the state being set to the latest character - * processed before the current one. Most characters will - * set the state to 0, but if one occurs that is part of a - * potential tricky fold sequence, the state is set to that - * character, and the next loop iteration sees if the state - * should progress towards the final folded-from character, - * or if it was a false alarm. If it turns out to be a - * false alarm, the character(s) will be output in a new - * EXACTish node, and join_exact() will later combine them. - * In the case of the 'ss' sequence, which is more common - * and more easily checked, some look-ahead is done to - * save time by ruling-out some false alarms */ - switch (ender) { - default: - latest_char_state = generic_char; - break; - case 's': - case 'S': - case 0x17F: /* LATIN SMALL LETTER LONG S */ - if (AT_LEAST_UNI_SEMANTICS) { - if (latest_char_state == char_s) { /* 'ss' */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - else if (p < RExC_end) { - - /* Look-ahead at the next character. If it - * is also an s, we handle as a sharp s - * tricky regnode. */ - if (*p == 's' || *p == 'S') { - - /* But first flush anything in the - * EXACTish buffer */ - if (len != 0) { - p = oldp; - goto loopdone; - } - p++; /* Account for swallowing this - 's' up */ - ender = LATIN_SMALL_LETTER_SHARP_S; - goto do_tricky; - } - /* Here, the next character is not a - * literal 's', but still could - * evaluate to one if part of a \o{}, - * \x or \OCTAL-DIGIT. The minimum - * length required for that is 4, eg - * \x53 or \123 */ - else if (*p == '\\' - && p < RExC_end - 4 - && (isDIGIT(*(p + 1)) - || *(p + 1) == 'x' - || *(p + 1) == 'o' )) - { - - /* Here, it could be an 's', too much - * bother to figure it out here. Flush - * the buffer if any; when come back - * here, set the state so know that the - * previous char was an 's' */ - if (len != 0) { - latest_char_state = generic_char; - p = oldp; - goto loopdone; - } - latest_char_state = char_s; - break; - } - } - } - - /* Here, can't be an 'ss' sequence, or at least not - * one that could fold to/from the sharp ss */ - latest_char_state = generic_char; - break; - case 0x03C5: /* First char in upsilon series */ - case 0x03A5: /* Also capital UPSILON, which folds to - 03C5, and hence exhibits the same - problem */ - if (p < RExC_end - 4) { /* Need >= 4 bytes left */ - latest_char_state = upsilon_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x03B9: /* First char in iota series */ - case 0x0399: /* Also capital IOTA */ - case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */ - case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds - to 3B9 */ - if (p < RExC_end - 4) { - latest_char_state = iota_1; - if (len != 0) { - p = oldp; - goto loopdone; - } - } - else { - latest_char_state = generic_char; - } - break; - case 0x0308: - if (latest_char_state == upsilon_1) { - latest_char_state = upsilon_2; - } - else if (latest_char_state == iota_1) { - latest_char_state = iota_2; - } - else { - latest_char_state = generic_char; - } - break; - case 0x301: - if (latest_char_state == upsilon_2) { - ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - else if (latest_char_state == iota_2) { - ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; - goto do_tricky; - } - latest_char_state = generic_char; - break; - - /* These are the tricky fold characters. Flush any - * buffer first. (When adding to this list, also should - * add them to fold_grind.t to make sure get tested) */ - case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: - case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: - case LATIN_SMALL_LETTER_SHARP_S: - case LATIN_CAPITAL_LETTER_SHARP_S: - case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */ - case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */ - if (len != 0) { - p = oldp; - goto loopdone; - } - /* FALL THROUGH */ - do_tricky: { - char* const oldregxend = RExC_end; - U8 tmpbuf[UTF8_MAXBYTES+1]; - - /* Here, we know we need to generate a special - * regnode, and 'ender' contains the tricky - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals - * with those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated - * but empty EXACT node that the ANYOFV node will - * replace, and point the parse to a buffer which - * we fill with the character we want the regclass - * code to think is being parsed */ - RExC_emit = orig_emit; - RExC_parse = (char *) tmpbuf; - if (UTF) { - U8 *d = uvchr_to_utf8(tmpbuf, ender); - *d = '\0'; - RExC_end = (char *) d; - } - else { /* ender above 255 already excluded */ - tmpbuf[0] = (U8) ender; - tmpbuf[1] = '\0'; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using - * utf8 */ + * utf8. If we start to fold non-UTF patterns, be sure to + * update join_exact() */ if (LOC && ender < 256) { if (UNI_IS_INVARIANT(ender)) { *tmpbuf = (U8) ender; @@ -9499,7 +9743,7 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -9535,7 +9779,7 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -9585,7 +9829,7 @@ tryagain: *flagp |= HASWIDTH; if (len == 1 && UNI_IS_INVARIANT(ender)) *flagp |= SIMPLE; - + if (SIZE_ONLY) RExC_size += STR_SZ(len); else { @@ -9654,7 +9898,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) POSIXCC(UCHARAT(RExC_parse))) { const char c = UCHARAT(RExC_parse); char* const s = RExC_parse++; - + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) RExC_parse++; if (RExC_parse == RExC_end) @@ -9789,91 +10033,132 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } -/* No locale test, and always Unicode semantics, no ignore-case differences */ -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ -ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - yesno = '!'; \ - what = WORD; \ - break - -/* Like the above, but there are differences if we are in uni-8-bit or not, so - * there are two tests passed in, to use depending on that. There aren't any - * cases where the label is different from the name, so no need for that - * parameter. - * Sets 'what' to WORD which is the property name for non-bitmap code points; - * But, uses FOLD_WORD instead if /i has been selected, to allow a different - * property name */ -#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \ -ANYOF_##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - set_regclass_bit(pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - yesno = '+'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break; \ -case ANYOF_N##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (! TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - if (AT_LEAST_ASCII_RESTRICTED) { \ - for (value = 128; value < 256; value++) { \ - stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ - } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ - } \ - else { \ - /* For a non-ut8 target string with DEPENDS semantics, all above \ - * ASCII Latin1 code points match the complement of any of the \ - * classes. But in utf8, they have their Unicode semantics, so \ - * can't just set them in the bitmap, or else regexec.c will think \ - * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ - } \ - } \ - yesno = '!'; \ - if (FOLD) { \ - what = FOLD_WORD; \ - } \ - else { \ - what = WORD; \ - } \ - break +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : l1_sourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } STATIC U8 S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) @@ -10299,7 +10584,6 @@ parseit: /* Look up the property name, and get its swash and * inversion list, if the property is found */ - if (! (ANYOF_FLAGS(ret) & ANYOF_INVERT)) { if (swash) { SvREFCNT_dec(swash); } @@ -10310,10 +10594,7 @@ parseit: undefined properties */ NULL, FALSE /* No inversion list */ ); - } - - if ( ANYOF_FLAGS(ret) & ANYOF_INVERT - || ! swash + if ( ! swash || ! SvROK(swash) || ! SvTYPE(SvRV(swash)) == SVt_PVHV || ! (invlistsvp = @@ -10354,18 +10635,7 @@ parseit: /* Invert if asking for the complement */ if (value == 'P') { - - /* Add to any existing list */ - if (! properties) { - properties = invlist_clone(invlist); - _invlist_invert(properties); - } - else { - invlist = invlist_clone(invlist); - _invlist_invert(invlist); - _invlist_union(properties, invlist, &properties); - SvREFCNT_dec(invlist); - } + _invlist_union_complement_2nd(properties, invlist, &properties); /* The swash can't be used as-is, because we've * inverted things; delay removing it to here after @@ -10374,21 +10644,12 @@ parseit: swash = NULL; } else { - if (! properties) { - properties = invlist_clone(invlist); - } - else { - _invlist_union(properties, invlist, &properties); - } + _invlist_union(properties, invlist, &properties); } } Safefree(name); } RExC_parse = e + 1; - - /* The \p could match something in the Latin1 range, hence - * something that isn't utf8 */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; namedclass = ANYOF_MAX; /* no official name, but it's named */ /* \p means they want Unicode semantics */ @@ -10526,88 +10787,210 @@ parseit: range = 0; /* this was not a true range */ } - - if (!SIZE_ONLY) { - const char *what = NULL; - char yesno = 0; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { - - case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum"); - case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha"); - case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank"); - case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl"); - case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph"); - case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i"); - case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint"); - case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace"); - case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct"); - case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i"); - /* \s, \w match all unicode if utf8. */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word"); - case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit"); - case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); - case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { - for (value = 0; value < 128; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - yesno = '+'; - what = NULL; /* Doesn't match outside ascii, so - don't want to add +utf8:: */ + else { + _invlist_union(properties, PL_ASCII, &properties); + } break; case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { - for (value = 128; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - yesno = '!'; - what = "ASCII"; - break; + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_NDIGIT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding. It turns out that \h + * is just a synonym for XPosixBlank */ + _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching Cased there (which in the ASCII range + is the same as Alpha */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; + } + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - yesno = '+'; - what = "Digit"; break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); - for (value = '9' + 1; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; } - yesno = '!'; - what = "Digit"; - if (AT_LEAST_ASCII_RESTRICTED ) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); } - break; + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); + } + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; @@ -10615,10 +10998,6 @@ parseit: vFAIL("Invalid [::] class"); break; } - if (what && ! (AT_LEAST_ASCII_RESTRICTED)) { - /* Strings such as "+utf8::isWord\n" */ - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what); - } continue; } @@ -10727,6 +11106,7 @@ parseit: if (! PL_utf8_foldable) { SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); PL_utf8_foldable = _swash_to_invlist(swash); + SvREFCNT_dec(swash); } /* This is a hash that for a particular fold gives all characters @@ -10915,9 +11295,66 @@ parseit: } } + /* Here, contains all the code points we can determine at + * compile time that we haven't put into the bitmap. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * */ + if (nonbitmap) { + + /* Above-ASCII code points in /d have to stay in , as they + * possibly only should match when the target string is UTF-8 */ + UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255; + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + UV high; + int i; + + /* Quit if are above what we should change */ + if (start > max_cp_to_set) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < max_cp_to_set) ? end : max_cp_to_set; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_SET(ret, i); + stored++; + prevvalue = value; + value = i; + } + } + } + + /* Done with loop; remove any code points that are in the bitmap from + * */ + if (change_invlist) { + _invlist_subtract(nonbitmap, + (DEPENDS_SEMANTICS) + ? PL_ASCII + : PL_Latin1, + &nonbitmap); + } + + /* If have completely emptied it, remove it completely */ + if (invlist_len(nonbitmap) == 0) { + SvREFCNT_dec(nonbitmap); + nonbitmap = NULL; + } + } /* Here, we have calculated what code points should be in the character - * class. + * class. does not overlap the bitmap except possibly in the + * case of DEPENDS rules. * * Now we can see about various optimizations. Fold calculation (which we * did above) needs to take place before inversion. Otherwise /[^k]/i @@ -10941,31 +11378,92 @@ parseit: || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) && SvCUR(listsv) == initial_listsv_len) { + int i; if (! nonbitmap) { - for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) - ANYOF_BITMAP(ret)[value] ^= 0xFF; + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + } + else { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } /* The inversion means that everything above 255 is matched */ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; } else { - /* Here, also has things outside the bitmap. Go through each bit - * individually and add it to the list to get rid of from those - * things not in the bitmap */ - SV *remove_list = _new_invlist(2); + /* Here, also has things outside the bitmap that may overlap with + * the bitmap. We have to sync them up, so that they get inverted + * in both places. Earlier, we removed all overlaps except in the + * case of /d rules, so no syncing is needed except for this case + */ + SV *remove_list = NULL; + + if (DEPENDS_SEMANTICS) { + UV start, end; + + /* Set the bits that correspond to the ones that aren't in the + * bitmap. Otherwise, when we invert, we'll miss these. + * Earlier, we removed from the nonbitmap all code points + * < 128, so there is no extra work here */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + if (start > 255) { /* The bit map goes to 255 */ + break; + } + if (end > 255) { + end = 255; + } + for (i = start; i <= (int) end; ++i) { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + } + + /* Now invert both the bitmap and the nonbitmap. Anything in the + * bitmap has to also be removed from the non-bitmap, but again, + * there should not be overlap unless is /d rules. */ _invlist_invert(nonbitmap); - for (value = 0; value < 256; ++value) { - if (ANYOF_BITMAP_TEST(ret, value)) { - ANYOF_BITMAP_CLEAR(ret, value); - remove_list = add_cp_to_invlist(remove_list, value); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + if (DEPENDS_SEMANTICS) { + if (! remove_list) { + remove_list = _new_invlist(2); + } + remove_list = add_cp_to_invlist(remove_list, i); + } } else { - ANYOF_BITMAP_SET(ret, value); + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; } } /* And do the removal */ - _invlist_subtract(nonbitmap, remove_list, &nonbitmap); - SvREFCNT_dec(remove_list); + if (DEPENDS_SEMANTICS) { + if (remove_list) { + _invlist_subtract(nonbitmap, remove_list, &nonbitmap); + SvREFCNT_dec(remove_list); + } + } + else { + /* There is no overlap for non-/d, so just delete anything + * below 256 */ + _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); + } } stored = 256 - stored; @@ -10976,8 +11474,15 @@ parseit: /* Folding in the bitmap is taken care of above, but not for locale (for * which we have to wait to see what folding is in effect at runtime), and - * for things not in the bitmap. Set run-time fold flag for these */ - if (FOLD && (LOC || nonbitmap || unicode_alternate)) { + * for some things not in the bitmap (only the upper latin folds in this + * case, as all other single-char folding has been set above). Set + * run-time fold flag for these */ + if (FOLD && (LOC + || (DEPENDS_SEMANTICS + && nonbitmap + && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) + || unicode_alternate)) + { ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; } @@ -11042,12 +11547,16 @@ parseit: * is just the lower case of the current one (which may resolve to * itself, or to the other one */ value = toLOWER_LATIN1(value); - if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { - /* To join adjacent nodes, they must be the exact EXACTish - * type. Try to use the most likely type, by using EXACTFU if - * the regex calls for them, or is required because the - * character is non-ASCII */ + /* To join adjacent nodes, they must be the exact EXACTish type. + * Try to use the most likely type, by using EXACTFA if possible, + * then EXACTFU if the regex calls for it, or is required because + * the character is non-ASCII. (If is ASCII, its fold is + * also ASCII for the cases where we get here.) */ + if (MORE_ASCII_RESTRICTED && isASCII(value)) { + op = EXACTFA; + } + else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { op = EXACTFU; } else { /* Otherwise, more likely to be EXACTF type */ @@ -11087,19 +11596,21 @@ parseit: SvREFCNT_dec(unicode_alternate); } else { - + /* av[0] stores the character class description in its textual form: + * used later (regexec.c:Perl_regclass_swash()) to initialize the + * appropriate swash, and is also useful for dumping the regnode. + * av[1] if NULL, is a placeholder to later contain the swash computed + * from av[0]. But if no further computation need be done, the + * swash is stored there now. + * av[2] stores the multicharacter foldings, used later in + * regexec.c:S_reginclass(). + * av[3] stores the nonbitmap inversion list for use in addition or + * instead of av[0]; not used if av[1] isn't NULL + * av[4] is set if any component of the class is from a user-defined + * property; not used if av[1] isn't NULL */ AV * const av = newAV(); SV *rv; - /* The 0th element stores the character class description - * in its textual form: used later (regexec.c:Perl_regclass_swash()) - * to initialize the appropriate swash (which gets stored in - * element [1]), and also useful for dumping the regnode. - * Element [2] stores the multicharacter foldings, - * used later (regexec.c:S_reginclass()). - * Element [3] stores the nonbitmap inversion list for use in addition - * or instead of element [0]. - * Element [4] is set if any component of the class is from a - * user-defined property */ + av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) ? &PL_sv_undef : listsv); @@ -11133,7 +11644,6 @@ parseit: } return ret; } -#undef _C_C_T_ /* reg_skipcomment() @@ -11190,8 +11700,11 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) PERL_ARGS_ASSERT_NEXTCHAR; for (;;) { - if (*RExC_parse == '(' && RExC_parse[1] == '?' && - RExC_parse[2] == '#') { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { while (*RExC_parse != ')') { if (RExC_parse == RExC_end) FAIL("Sequence (?#... not terminated"); @@ -11233,7 +11746,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) return(ret); } if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -11275,7 +11789,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) We can't do this: assert(2==regarglen[op]+1); - + Anything larger than this has to allocate the extra amount. If we changed this to be: @@ -11288,7 +11802,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) return(ret); } if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -11499,9 +12014,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, for (;;) { regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN - if (PL_regkind[OP(scan)] == EXACT) - if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) + if (PL_regkind[OP(scan)] == EXACT) { + bool has_exactf_sharp_s; /* Unexamined in this routine */ + if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) return EXACT; + } #endif if ( exact ) { switch (OP(scan)) { @@ -11509,6 +12026,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTF: case EXACTFA: case EXACTFU: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -11833,8 +12352,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -11881,7 +12398,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); - + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { @@ -11927,63 +12444,81 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (ANYOF_NONBITMAP(o)) { SV *lv; /* Set if there is something outside the bit map */ SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); - + bool byte_output = FALSE; /* If something in the bitmap has been + output */ + if (lv && lv != &PL_sv_undef) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ uvchr_to_utf8(s, i); - - if (i < 256 && swash_fetch(sw, s, TRUE)) { + + if (i < 256 + && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate + things already + output as part + of the bitmap */ + && swash_fetch(sw, s, TRUE)) + { if (rangestart == -1) rangestart = i; } else if (rangestart != -1) { + byte_output = TRUE; if (i <= rangestart + 3) for (; rangestart < i; rangestart++) { - const U8 * const e = uvchr_to_utf8(s,rangestart); - U8 *p; - for(p = s; p < e; p++) - put_byte(sv, *p); + put_byte(sv, rangestart); } else { - const U8 *e = uvchr_to_utf8(s,rangestart); - U8 *p; - for (p = s; p < e; p++) - put_byte(sv, *p); + put_byte(sv, rangestart); sv_catpvs(sv, "-"); - e = uvchr_to_utf8(s, i-1); - for (p = s; p < e; p++) - put_byte(sv, *p); - } - rangestart = -1; + put_byte(sv, i-1); } + rangestart = -1; } - - sv_catpvs(sv, "..."); /* et cetera */ + } } { char *s = savesvpv(lv); char * const origs = s; - + while (*s && *s != '\n') s++; - + if (*s == '\n') { const char * const t = ++s; - + + if (byte_output) { + sv_catpvs(sv, " "); + } + while (*s) { - if (*s == '\n') + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } s++; } if (s[-1] == ' ') s[-1] = 0; - + sv_catpv(sv, t); } - + + out_dump: + Safefree(origs); } SvREFCNT_dec(lv); @@ -12515,7 +13050,7 @@ Perl_regnext(pTHX_ register regnode *p) } #endif -STATIC void +STATIC void S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) { va_list args; @@ -12692,7 +13227,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, goto after_print; } else CLEAR_OPTSTART; - + regprop(r, sv, node); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); @@ -12740,7 +13275,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, sv_setpvs(sv, ""); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); - + PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,