X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5a279aa01b85454d24e44898e581820d24e712b3..243effed56c5bea983c9cdbdc24b329f19ff0aad:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 7460680..979dcae 100644 --- a/regcomp.c +++ b/regcomp.c @@ -81,16 +81,25 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" +extern const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif #include "dquote_static.c" -#ifndef PERL_IN_XSUB_RE -# include "charclass_invlists.h" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef HAS_ISBLANK +# define hasISBLANK 1 +#else +# define hasISBLANK 0 #endif #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #ifdef op #undef op @@ -150,6 +159,7 @@ typedef struct RExC_state_t { I32 in_lookbehind; I32 contains_locale; I32 override_recoding; + I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ @@ -204,7 +214,8 @@ typedef struct RExC_state_t { #define RExC_recurse_count (pRExC_state->recurse_count) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') @@ -220,11 +231,12 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single - * character, and if utf8, must be invariant. Note that this is not the same - * thing as REGNODE_SIMPLE */ +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ #define SIMPLE 0x02 -#define SPSTART 0x04 /* Starts with * or +. */ +#define SPSTART 0x04 /* Starts with * or + */ #define TRYAGAIN 0x08 /* Weeded out a declaration. */ #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ @@ -289,8 +301,8 @@ typedef struct RExC_state_t { string can occur infinitely far to the right. - minlenp - A pointer to the minimum length of the pattern that the string - was found inside. This is important as in the case of positive + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive lookahead or positive lookbehind we can have multiple patterns involved. Consider @@ -398,8 +410,8 @@ static const scan_data_t zero_scan_data = #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) @@ -437,7 +449,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -468,7 +480,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -486,7 +498,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -505,7 +517,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -683,8 +695,6 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); -static void clear_re(pTHX_ void *r); - /* Mark that we cannot extend a found fixed substring at this point. Update the longest found anchored substring and the longest found floating substrings if needed. */ @@ -748,7 +758,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL - |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; + |ANYOF_NON_UTF8_LATIN1_ALL; /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes @@ -759,7 +769,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c * necessary. */ if (RExC_contains_locale) { ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE; + cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; } else { ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ @@ -814,8 +824,8 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) { + && !(and_with->flags & ANYOF_LOC_FOLD) + && !(cl->flags & ANYOF_LOC_FOLD)) { int i; if (and_with->flags & ANYOF_INVERT) @@ -948,8 +958,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { + && !(or_with->flags & ANYOF_LOC_FOLD) + && !(cl->flags & ANYOF_LOC_FOLD) ) { int i; for (i = 0; i < ANYOF_BITMAP_SIZE; i++) @@ -975,8 +985,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } else { /* 'or_with' is not inverted */ /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) { + && (!(or_with->flags & ANYOF_LOC_FOLD) + || (cl->flags & ANYOF_LOC_FOLD)) ) { int i; /* OR char bitmap and class bitmap separately */ @@ -2579,89 +2589,72 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode /* The below joins as many adjacent EXACTish nodes as possible into a single - * one, and looks for problematic sequences of characters whose folds vs. - * non-folds have sufficiently different lengths, that the optimizer would be - * fooled into rejecting legitimate matches of them, and the trie construction - * code can't cope with them. The joining is only done if: + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: * 1) there is room in the current conglomerated node to entirely contain the * next one. * 2) they are the exact same node type * - * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * - * If there are problematic code sequences, *min_subtract is set to the delta - * that the minimum size of the node can be less than its actual size. And, - * the node type of the result is changed to reflect that it contains these - * sequences. + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta of the input nodes. * * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF * and contains LATIN SMALL LETTER SHARP S * * This is as good a place as any to discuss the design of handling these - * problematic sequences. It's been wrong in Perl for a very long time. There - * are three code points in Unicode whose folded lengths differ so much from - * the un-folded lengths that it causes problems for the optimizer and trie - * construction. Why only these are problematic, and not others where lengths - * also differ is something I (khw) do not understand. New versions of Unicode - * might add more such code points. Hopefully the logic in fold_grind.t that - * figures out what to test (in part by verifying that each size-combination - * gets tested) will catch any that do come along, so they can be added to the - * special handling below. The chances of new ones are actually rather small, - * as most, if not all, of the world's scripts that have casefolding have - * already been encoded by Unicode. Also, a number of Unicode's decisions were - * made to allow compatibility with pre-existing standards, and almost all of - * those have already been dealt with. These would otherwise be the most - * likely candidates for generating further tricky sequences. In other words, - * Unicode by itself is unlikely to add new ones unless it is for compatibility - * with pre-existing standards, and there aren't many of those left. - * - * The previous designs for dealing with these involved assigning a special - * node for them. This approach doesn't work, as evidenced by this example: + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node of - * that would match just the \xDF, it won't be able to handle the case where a + * Both these fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node * that is "sss". * - * There are a number of components to the approach (a lot of work for just - * three code points!): - * 1) This routine examines each EXACTFish node that could contain the - * problematic sequences. It returns in *min_subtract how much to + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases, but the + * three still have some special handling. The approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character fold sequences. It returns in *min_subtract how much to * subtract from the the actual length of the string to get a real minimum - * for one that could match it. This number is usually 0 except for the - * problematic sequences. This delta is used by the caller to adjust the - * min length of the match, and the delta between min and max, so that the - * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences require special handling by the trie code, so it - * changes the joined node type to ops for the trie's benefit, those new - * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. - * 3) This is sufficient for the two Greek sequences (described below), but - * the one involving the Sharp s (\xDF) needs more. The node type - * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" - * sequence in it. For non-UTF-8 patterns and strings, this is the only - * case where there is a possible fold length change. That means that a - * regular EXACTFU node without UTF-8 involvement doesn't have to concern - * itself with length changes, and so can be processed faster. regexec.c - * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 - * is pre-folded by regcomp.c. This saves effort in regex matching. - * However, probably mostly for historical reasons, the pre-folding isn't - * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL - * nodes, as what they fold to isn't known until runtime.) The fold + * match length; it is 0 if there are no multi-char folds. This delta is + * used by the caller to adjust the min length of the match, and the delta + * between min and max, so that the optimizer doesn't reject these + * possibilities based on size constraints. + * 2) Certain of these sequences require special handling by the trie code, + * so, if found, this code changes the joined node type to special ops: + * EXACTFU_TRICKYFOLD and EXACTFU_SS. + * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c. This saves effort in regex matching. + * However, the pre-folding isn't done for non-UTF8 patterns because the + * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things + * down by forcing the pattern into UTF8 unless necessary. Also what + * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold * possibilities for the non-UTF8 patterns are quite simple, except for - * the sharp s. All the ones that don't involve a UTF-8 target string - * are members of a fold-pair, and arrays are set up for all of them - * that quickly find the other member of the pair. It might actually - * be faster to pre-fold these, but it isn't currently done, except for - * the sharp s. Code elsewhere in this file makes sure that it gets - * folded to 'ss', even if the pattern isn't UTF-8. This avoids the - * issues described in the next item. + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches * 'ss' or not is not knowable at compile time. It will match iff the * target string is in UTF-8, unlike the EXACTFU nodes, where it always * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus - * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * it can't be folded to "ss" at compile time, unlike EXACTFU does (as * described in item 3). An assumption that the optimizer part of * regexec.c (probably unwittingly) makes is that a character in the * pattern corresponds to at most a single character in the target string. @@ -2733,6 +2726,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); + /* XXX I (khw) kind of doubt that this works on platforms where + * U8_MAX is above 255 because of lots of other assumptions */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2774,19 +2769,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - U8 *s; - U8 * s0 = (U8*) STRING(scan); - U8 * const s_end = s0 + STR_LEN(scan); - - /* The below is perhaps overboard, but this allows us to save a test - * each time through the loop at the expense of a mask. This is - * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a - * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. - * This uses an exclusive 'or' to find that bit and then inverts it to - * form a mask, with just a single 0, in the bit position where 'S' and - * 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + const U8 * const s0 = (U8*) STRING(scan); + const U8 * s = s0; + const U8 * const s_end = s0 + STR_LEN(scan); /* One pass is made over the node's string looking for all the * possibilities. to avoid some tests in the loop, there are two main @@ -2794,146 +2779,137 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * non-UTF-8 */ if (UTF) { - /* There are two problematic Greek code points in Unicode - * casefolding - * - * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - * - * which casefold to - * - * Unicode UTF-8 - * - * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - * - * This means that in case-insensitive matching (or "loose - * matching", as Unicode calls it), an EXACTF of length six (the - * UTF-8 encoded byte length of the above casefolded versions) can - * match a target string of length two (the byte length of UTF-8 - * encoded U+0390 or U+03B0). This would rather mess up the - * minimum length computation. (there are other code points that - * also fold to these two sequences, but the delta is smaller) - * - * If these sequences are found, the minimum length is decreased by - * four (six minus two). - * - * Similarly, 'ss' may match the single char and byte LATIN SMALL - * LETTER SHARP S. We decrease the min length by 1 for each - * occurrence of 'ss' found */ - -#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ -# define U390_first_byte 0xb4 - const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; -# define U3B0_first_byte 0xb5 - const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; -#else -# define U390_first_byte 0xce - const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; -# define U3B0_first_byte 0xcf - const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; -#endif - const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; - yields a net of 0 */ - /* Examine the string for one of the problematic sequences */ - for (s = s0; - s < s_end - 1; /* Can stop 1 before the end, as minimum length - * sequence we are looking for is 2 */ - s += UTF8SKIP(s)) + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ { + int count = 0; + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } - /* Look for the first byte in each problematic sequence */ - switch (*s) { - /* We don't have to worry about other things that fold to - * 's' (such as the long s, U+017F), as all above-latin1 - * code points have been pre-folded */ - case 's': - case 'S': - - /* Current character is an 's' or 'S'. If next one is - * as well, we have the dreaded sequence */ - if (((*(s+1) & S_or_s_mask) == s_masked) - /* These two node types don't have special handling - * for 'ss' */ - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) - { - *min_subtract += 1; - OP(scan) = EXACTFU_SS; - s++; /* No need to look at this character again */ - } - break; - - case U390_first_byte: - if (s_end - s >= len - - /* The 1's are because are skipping comparing the - * first byte */ - && memEQ(s + 1, U390_tail, len - 1)) - { - goto greek_sequence; - } - break; + /* Nodes with 'ss' require special handling, except for EXACTFL + * and EXACTFA for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + count = 2; + OP(scan) = EXACTFU_SS; + s += 2; + } + else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */ + && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8, + 6) + || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8, + 6))) + { + count = 3; + + /* These two folds require special handling by trie's, so + * change the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this would + * have to be changed. If this node has already been + * changed to EXACTFU_SS in this loop, leave it as is. (I + * (khw) think it doesn't matter in regexec.c for UTF + * patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_TRICKYFOLD; + } + s += 6; + } + else { /* Here is a generic multi-char fold. */ + const U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /l and + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. (In + * EXACTFL, no folds are allowed to any Latin1 code point, + * not just ASCII. But there aren't any of these + * currently, nor ever likely, so don't take the time to + * test for them. The code that generates the + * is_MULTI_foo() macros croaks should one actually get put + * into Unicode .) */ + if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } - case U3B0_first_byte: - if (! (s_end - s >= len - && memEQ(s + 1, U3B0_tail, len - 1))) - { - break; - } - greek_sequence: - *min_subtract += 4; - - /* This can't currently be handled by trie's, so change - * the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this - * would have to be changed. If this node has already - * been changed to EXACTFU_SS in this loop, leave it as - * is. (I (khw) think it doesn't matter in regexec.c - * for UTF patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; - } - s += 6; /* We already know what this sequence is. Skip - the rest of it */ - break; - } + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + *min_subtract += count - 1; + next_iteration: ; } } else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { - /* Here, the pattern is not UTF-8. We need to look only for the - * 'ss' sequence, and in the EXACTF case, the sharp s, which can be - * in the final position. Otherwise we can stop looking 1 byte - * earlier because have to find both the first and second 's' */ + /* Here, the pattern is not UTF-8. Look for the multi-char folds + * that are all ASCII. As in the above case, EXACTFL and EXACTFA + * nodes can't have multi-char folds to this range (and there are + * no existing ones in the upper latin1 range). In the EXACTF + * case we look also for the sharp s, which can be in the final + * position. Otherwise we can stop looking 1 byte earlier because + * have to find at least two characters for a multi-fold */ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - for (s = s0; s < upper; s++) { - switch (*s) { - case 'S': - case 's': - if (s_end - s > 1 - && ((*(s+1) & S_or_s_mask) == s_masked)) - { - *min_subtract += 1; - - /* EXACTF nodes need to know that the minimum - * length changed so that a sharp s in the string - * can match this ss in the pattern, but they - * remain EXACTF nodes, as they are not trie'able, - * so don't have to invent a new node type to - * exclude them from the trie code */ - if (OP(scan) != EXACTF) { - OP(scan) = EXACTFU_SS; - } - s++; - } - break; - case LATIN_SMALL_LETTER_SHARP_S: - if (OP(scan) == EXACTF) { - *has_exactf_sharp_s = TRUE; - } - break; + /* The below is perhaps overboard, but this allows us to save a + * test each time through the loop at the expense of a mask. This + * is because on both EBCDIC and ASCII machines, 'S' and 's' differ + * by a single bit. On ASCII they are 32 apart; on EBCDIC, they + * are 64. This uses an exclusive 'or' to find that bit and then + * inverts it to form a mask, with just a single 0, in the bit + * position where 'S' and 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + { + *has_exactf_sharp_s = TRUE; + } + s++; + continue; + } + + if (len == 2 + && ((*s & S_or_s_mask) == s_masked) + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } } + + *min_subtract += len - 1; + s += len; } } } @@ -3024,7 +3000,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0, pars = 0, code; + I32 min = 0; /* There must be at least this number of characters to match */ + I32 pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); @@ -3051,9 +3028,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How much to subtract from the minimum node - length to get a real minimum (because the - folded version may be shorter) */ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); @@ -3418,7 +3395,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * trietype so we can turn them into a trie. If/when we * allow NOTHING to start a trie sequence this condition will be * required, and it isn't expensive so we leave it in for now. */ - if ( trietype != NOTHING ) + if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, startbranch, first, cur, tail, count, trietype, depth+1 ); @@ -3449,7 +3426,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + if ( last && trietype ) { if ( trietype != NOTHING ) { /* the last branch of the sequence was part of a trie, * so we have to construct it here outside of the loop @@ -3606,7 +3583,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (uc >= 0x100 || (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) + && (!(data->start_class->flags & ANYOF_LOC_FOLD) || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) ) { @@ -3661,13 +3638,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - else if (has_exactf_sharp_s) { + if (has_exactf_sharp_s) { RExC_seen |= REG_SEEN_EXACTF_SHARP_S; } min += l - min_subtract; - if (min < 0) { - min = 0; - } + assert (min >= 0); delta += min_subtract; if (flags & SCF_DO_SUBSTR) { data->pos_min += l - min_subtract; @@ -3694,12 +3669,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (compat) { ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; - data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { /* XXX This set is probably no longer necessary, and * probably wrong as LOCALE now is on in the initial * state */ - data->start_class->flags |= ANYOF_LOCALE; + data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; } else { @@ -3734,7 +3708,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) { + if (data->start_class->flags & ANYOF_LOC_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... */ if (uc < 0x100) { @@ -3897,8 +3871,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); ckWARNreg(RExC_parse, "Quantifier unexpected on zero-length expression"); + ReREFCNT_inc(RExC_rx_sv); } min += minnext * mincount; @@ -3958,6 +3935,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ + && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4155,8 +4133,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } - min += 1; - delta += 1; + min++; + delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ data->pos_min += 1; @@ -4207,7 +4185,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case ALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR); if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (!isWORDCHAR_L1(value)) { @@ -4225,7 +4203,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR); /* Even if under locale, set the bits for non-locale * in case it isn't a true locale-node. This will @@ -4248,7 +4226,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case NALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR); if (OP(scan) == NALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { @@ -4266,7 +4244,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR); /* Even if under locale, set the bits for non-locale in * case it isn't a true locale-node. This will create @@ -4895,13 +4873,18 @@ Perl_reginitcolors(pTHX) #ifdef TRIE_STUDY_OPT -#define CHECK_RESTUDY_GOTO \ +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ if ( \ (data.flags & SCF_TRIE_RESTUDY) \ && ! restudied++ \ - ) goto reStudy + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END #else -#define CHECK_RESTUDY_GOTO +#define CHECK_RESTUDY_GOTO_butfirst #endif /* @@ -4961,16 +4944,23 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) } #endif -/* public(ish) wrapper for Perl_re_op_compile that only takes an SV - * pattern rather than a list of OPs */ +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ REGEXP * Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) { SV *pat = pattern; /* defeat constness! */ PERL_ARGS_ASSERT_RE_COMPILE; - return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), - NULL, NULL, rx_flags, 0); + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); } /* see if there are any run-time code blocks in the pattern. @@ -5021,7 +5011,7 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, * * becomes * - * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' * * After eval_sv()-ing that, grab any new code blocks from the returned qr * and merge them with any code blocks of the original regexp. @@ -5074,7 +5064,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, /* blank out literal code block */ assert(pat[s] == '('); while (s <= pRExC_state->code_blocks[n].end) { - *p++ = ' '; + *p++ = '_'; s++; } s--; @@ -5111,8 +5101,15 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, SPAGAIN; qr_ref = POPs; PUTBACK; - if (SvTRUE(ERRSV)) - Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + } + } assert(SvROK(qr_ref)); qr = SvRV(qr_ref); assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); @@ -5139,13 +5136,16 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, /* merge the main (r1) and run-time (r2) code blocks into one */ { - RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); struct reg_code_block *new_block, *dst; RExC_state_t * const r1 = pRExC_state; /* convenient alias */ int i1 = 0, i2 = 0; if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec(qr); return 1; + } Newx(new_block, r1->num_code_blocks + r2->num_code_blocks, @@ -5197,6 +5197,50 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, } +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perlre_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t,ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (I32)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + /* * Perl_re_op_compile - the perl internal RE engine's function to compile a * regular expression into internal code. @@ -5249,7 +5293,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, dVAR; REGEXP *rx; struct regexp *r; - register regexp_internal *ri; + regexp_internal *ri; STRLEN plen; char * VOL exp; char* xend; @@ -5258,6 +5302,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 minlen = 0; U32 rx_flags; SV * VOL pat; + SV * VOL code_blocksv = NULL; /* these are all flags - maybe they should be turned * into a single int with different bit masks */ @@ -5311,9 +5356,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); - PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); - PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); @@ -5339,6 +5381,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + + PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); } #endif @@ -5399,7 +5443,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (pRExC_state->num_code_blocks) { o = cLISTOPx(expr)->op_first; - assert(o->op_type == OP_PUSHMARK); + assert( o->op_type == OP_PUSHMARK + || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK) + || o->op_type == OP_PADRANGE); o = o->op_sibling; } @@ -5423,6 +5469,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SV *sv, *msv = *svp; SV *rx; bool code = 0; + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE NULL NULL .. + * so the alignment still works. */ if (o) { if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { assert(n < pRExC_state->num_code_blocks); @@ -5451,8 +5508,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, else { while (SvAMAGIC(msv) && (sv = AMG_CALLunary(msv, string_amg)) - && sv != msv) - { + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { msv = sv; SvGETMAGIC(msv); } @@ -5470,7 +5530,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && RX_ENGINE((REGEXP*)rx)->op_comp) { - RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); if (ri->num_code_blocks) { int i; /* the presence of an embedded qr// with code means @@ -5485,7 +5545,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (i=0; i < ri->num_code_blocks; i++) { struct reg_code_block *src, *dst; STRLEN offset = orig_patlen - + ((struct regexp *)SvANY(rx))->pre_prefix; + + ReANY((REGEXP *)rx)->pre_prefix; assert(n < pRExC_state->num_code_blocks); src = &ri->code_blocks[i]; dst = &pRExC_state->code_blocks[n]; @@ -5688,7 +5748,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, exp, plen); if (!runtime_code) { - ReREFCNT_inc(old_re); if (used_setjump) { JMPENV_POP; } @@ -5726,7 +5785,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_pm_flags = pm_flags; if (runtime_code) { - if (PL_tainting && PL_tainted) + if (TAINTING_get && TAINT_get) Perl_croak(aTHX_ "Eval-group in insecure regular expression"); if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { @@ -5744,6 +5803,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; /* First pass: determine size, legality. */ RExC_parse = exp; @@ -5775,11 +5835,23 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_lastnum=0; RExC_lastparse=NULL; ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have longjmped back. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; - Safefree(pRExC_state->code_blocks); return(NULL); } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ /* Here, finished first pass. Get rid of any added setjmp */ if (used_setjump) { @@ -5815,7 +5887,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, of zeroing when in debug mode, thus anything assigned has to happen after that */ rx = (REGEXP*) newSV_type(SVt_REGEXP); - r = (struct regexp*)SvANY(rx); + r = ReANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -5837,7 +5909,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, ri->num_code_blocks = pRExC_state->num_code_blocks; } else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); SAVEFREEPV(pRExC_state->code_blocks); + } { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); @@ -5868,8 +5946,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ - SvPOK_on(rx); + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; if (RExC_utf8) SvFLAGS(rx) |= SVf_UTF8; *p++='('; *p++='?'; @@ -5904,7 +5982,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, *p++ = '\n'; *p++ = ')'; *p = 0; - SvCUR_set(rx, p - SvPVX_const(rx)); + SvCUR_set(rx, p - RX_WRAPPED(rx)); } r->intflags = 0; @@ -5972,11 +6050,6 @@ reStudy: RExC_seen |= REG_TOP_LEVEL_BRANCHES; else RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; - if (data.last_found) { - SvREFCNT_dec(data.longest_fixed); - SvREFCNT_dec(data.longest_float); - SvREFCNT_dec(data.last_found); - } StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6142,6 +6215,10 @@ reStudy: data.longest_float = newSVpvs(""); data.last_found = newSVpvs(""); data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { cl_init(pRExC_state, &ch_class); @@ -6156,7 +6233,7 @@ reStudy: SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); - CHECK_RESTUDY_GOTO; + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) @@ -6166,107 +6243,58 @@ reStudy: && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); - SvREFCNT_dec(data.last_found); - /* Note that code very similar to this but for anchored string - follows immediately below, changes may need to be made to both. - Be careful. - */ longest_float_length = CHR_SVLEN(data.longest_float); - if (longest_float_length - || (data.flags & SF_FL_BEFORE_EOL - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) - { - I32 t,ml; - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) - || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) - goto remove_float; /* As in (a)+. */ - - /* copy the information about the longest float from the reg_scan_data - over to the program. */ - if (SvUTF8(data.longest_float)) { - r->float_utf8 = data.longest_float; - r->float_substr = NULL; - } else { - r->float_substr = data.longest_float; - r->float_utf8 = NULL; - } - /* float_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_float ? *(data.minlen_float) - : (I32)longest_float_length; - r->float_end_shift = ml - data.offset_float_min - - longest_float_length + (SvTAIL(data.longest_float) != 0) - + data.lookbehind_float; + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + data.flags & SF_FL_BEFORE_EOL, + data.flags & SF_FL_BEFORE_MEOL)) + { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; - - t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); + SvREFCNT_inc_simple_void_NN(data.longest_float); } else { - remove_float: r->float_substr = r->float_utf8 = NULL; - SvREFCNT_dec(data.longest_float); longest_float_length = 0; } - /* Note that code very similar to this but for floating string - is immediately above, changes may need to be made to both. - Be careful. - */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) - && (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) ) + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + data.flags & SF_FIX_BEFORE_EOL, + data.flags & SF_FIX_BEFORE_MEOL)) { - I32 t,ml; - - /* copy the information about the longest fixed - from the reg_scan_data over to the program. */ - if (SvUTF8(data.longest_fixed)) { - r->anchored_utf8 = data.longest_fixed; - r->anchored_substr = NULL; - } else { - r->anchored_substr = data.longest_fixed; - r->anchored_utf8 = NULL; - } - /* fixed_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_fixed ? *(data.minlen_fixed) - : (I32)longest_fixed_length; - r->anchored_end_shift = ml - data.offset_fixed - - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) - + data.lookbehind_fixed; r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; - - t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); + SvREFCNT_inc_simple_void_NN(data.longest_fixed); } else { r->anchored_substr = r->anchored_utf8 = NULL; - SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } + LEAVE_with_name("study_chunk"); + if (ri->regstclass && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; @@ -6340,7 +6368,7 @@ reStudy: minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); - CHECK_RESTUDY_GOTO; + CHECK_RESTUDY_GOTO_butfirst(NOOP); r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; @@ -6385,7 +6413,10 @@ reStudy: if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; if (RExC_seen & REG_SEEN_VERBARG) + { r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_MODIFIES_VARS; + } if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) @@ -6398,18 +6429,12 @@ reStudy: #ifdef STUPID_PATTERN_CHECKS if (RX_PRELEN(rx) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) r->extflags |= RXf_WHITE; else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else { + { regnode *first = ri->program + 1; U8 fop = OP(first); @@ -6472,7 +6497,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, if (flags & RXapif_FETCH) { return reg_named_buff_fetch(rx, key, flags); } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak_no_modify(aTHX); + Perl_croak_no_modify(); return NULL; } else if (flags & RXapif_EXISTS) { return reg_named_buff_exists(rx, key, flags) @@ -6511,7 +6536,7 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, { AV *retarray = NULL; SV *ret; - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; @@ -6551,7 +6576,7 @@ bool Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; @@ -6575,7 +6600,7 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, SV* Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; @@ -6591,7 +6616,7 @@ Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) SV* Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; @@ -6627,7 +6652,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) SV *ret; AV *av; I32 length; - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6651,7 +6676,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) SV* Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); AV *av = newAV(); PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; @@ -6687,47 +6712,67 @@ void Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, SV * const sv) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; + I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if (!rx->subbeg) { - sv_setsv(sv,&PL_sv_undef); - return; - } - else - if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { - /* $` */ + if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + && !(rx->extflags & RXf_PMf_KEEPCOPY) + ) + goto ret_undef; + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { - /* $' */ - s = rx->subbeg + rx->offs[0].end; - i = rx->sublen - rx->offs[0].end; + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; } else - if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->offs[paren].start) != -1 && - (t1 = rx->offs[paren].end) != -1) + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) { - /* $& $1 ... */ + /* $&, ${^MATCH}, $1 ... */ i = t1 - s1; - s = rx->subbeg + s1; + s = rx->subbeg + s1 - rx->suboffset; } else { - sv_setsv(sv,&PL_sv_undef); - return; + goto ret_undef; } + + assert(s >= rx->subbeg); assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { - const int oldtainted = PL_tainted; +#if NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; TAINT_NOT; sv_setpvn(sv, s, i); - PL_tainted = oldtainted; + TAINT_set(oldtainted); +#endif if ( (rx->extflags & RXf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) @@ -6737,12 +6782,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } else SvUTF8_off(sv); - if (PL_tainting) { + if (TAINTING_get) { if (RXp_MATCH_TAINTED(rx)) { if (SvTYPE(sv) >= SVt_PVMG) { MAGIC* const mg = SvMAGIC(sv); MAGIC* mgt; - PL_tainted = 1; + TAINT; SvMAGIC_set(sv, mg->mg_moremagic); SvTAINT(sv); if ((mgt = SvMAGIC(sv))) { @@ -6750,13 +6795,14 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, SvMAGIC_set(sv, mg); } } else { - PL_tainted = 1; + TAINT; SvTAINT(sv); } } else SvTAINTED_off(sv); } } else { + ret_undef: sv_setsv(sv,&PL_sv_undef); return; } @@ -6773,23 +6819,27 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, PERL_UNUSED_ARG(value); if (!PL_localizing) - Perl_croak_no_modify(aTHX); + Perl_croak_no_modify(); } I32 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); I32 i; I32 s1, t1; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; /* Some of this code was originally in C in F */ - switch (paren) { - /* $` / ${^PREMATCH} */ - case RX_BUFF_IDX_PREMATCH: + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + + case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; if (i > 0) { @@ -6799,8 +6849,11 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; - /* $' / ${^POSTMATCH} */ - case RX_BUFF_IDX_POSTMATCH: + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; if (i > 0) { @@ -6810,6 +6863,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; + + case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + /* $& / ${^MATCH}, $1, $2, ... */ default: if (paren <= (I32)rx->nparens && @@ -6819,6 +6878,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, i = t1 - s1; goto getlen; } else { + warn_undef: if (ckWARN(WARN_UNINITIALIZED)) report_uninit((const SV *)sv); return 0; @@ -6826,7 +6886,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } getlen: if (i > 0 && RXp_MATCH_UTF8(rx)) { - const char * const s = rx->subbeg + s1; + const char * const s = rx->subbeg - rx->suboffset + s1; const U8 *ep; STRLEN el; @@ -6979,9 +7039,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * list.) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element at the beginning of each inversion list - * to contain 0 when the list contains 0, and contains 1 otherwise. The actual - * beginning of the list is either that element if 0, or the next one if 1. + * This implementation reserves an element at the beginning of each inversion + * list to contain 0 when the list contains 0, and contains 1 otherwise. The + * actual beginning of the list is either that element if 0, or the next one if + * 1. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -6995,32 +7056,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Some of the methods should always be private to the implementation, and some * should eventually be made public */ -#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ -#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ - -/* This is a combination of a version and data structure type, so that one - * being passed in can be validated to be an inversion list of the correct - * vintage. When the structure of the header is changed, a new random number - * in the range 2**31-1 should be generated and the new() method changed to - * insert that at this location. Then, if an auxiliary program doesn't change - * correspondingly, it will be discovered immediately */ -#define INVLIST_VERSION_ID_OFFSET 2 -#define INVLIST_VERSION_ID 1064334010 +/* The header definitions are in F */ -/* For safety, when adding new elements, remember to #undef them at the end of - * the inversion list code section */ - -#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ -/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list - * contains the code point U+00000, and begins here. If 1, the inversion list - * doesn't contain U+0000, and it begins at the next UV in the array. - * Inverting an inversion list consists of adding or removing the 0 at the - * beginning of it. By reserving a space for that 0, inversion can be made - * very fast */ - -#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) - -/* Internally things are UVs */ #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV)) #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH) @@ -7042,7 +7079,7 @@ S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *get_invlist_len_addr(invlist)); + assert(! *_get_invlist_len_addr(invlist)); /* 1^1 = 0; 1^0 = 1 */ *zero = 1 ^ will_have_0; @@ -7060,7 +7097,7 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*get_invlist_len_addr(invlist)); + assert(*_get_invlist_len_addr(invlist)); assert(*get_invlist_zero_addr(invlist) == 0 || *get_invlist_zero_addr(invlist) == 1); @@ -7071,28 +7108,6 @@ S_invlist_array(pTHX_ SV* const invlist) + *get_invlist_zero_addr(invlist)); } -PERL_STATIC_INLINE UV* -S_get_invlist_len_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the current number - * of used elements in the inversion list */ - - PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV -S_invlist_len(pTHX_ SV* const invlist) -{ - /* Returns the current number of elements stored in the inversion list's - * array */ - - PERL_ARGS_ASSERT_INVLIST_LEN; - - return *get_invlist_len_addr(invlist); -} - PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len) { @@ -7100,7 +7115,7 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len) PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *get_invlist_len_addr(invlist) = len; + *_get_invlist_len_addr(invlist) = len; assert(len <= SvLEN(invlist)); @@ -7120,6 +7135,39 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len) * Note that when inverting, SvCUR shouldn't change */ } +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that is reserved to hold the cached index + * */ + + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(pTHX_ SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + PERL_STATIC_INLINE UV S_invlist_max(pTHX_ SV* const invlist) { @@ -7170,8 +7218,9 @@ Perl__new_invlist(pTHX_ IV initial_size) * properly */ *get_invlist_zero_addr(new_list) = UV_MAX; + *get_invlist_previous_index_addr(new_list) = 0; *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 4 +#if HEADER_LENGTH != 5 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length #endif @@ -7194,7 +7243,7 @@ S__new_invlist_C_array(pTHX_ UV* list) SvPV_set(invlist, (char *) list); SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); @@ -7224,11 +7273,6 @@ S_invlist_trim(pTHX_ SV* const invlist) SvPV_shrink_to_cur((SV *) invlist); } -/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, - * etc */ -#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) -#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) - #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) STATIC void @@ -7240,7 +7284,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; @@ -7312,8 +7356,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end #ifndef PERL_IN_XSUB_RE -STATIC IV -S_invlist_search(pTHX_ SV* const invlist, const UV cp) +IV +Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the @@ -7321,23 +7365,68 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) * contains */ IV low = 0; - IV high = invlist_len(invlist); - const UV * const array = invlist_array(invlist); + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; - PERL_ARGS_ASSERT_INVLIST_SEARCH; + PERL_ARGS_ASSERT__INVLIST_SEARCH; - /* If list is empty or the code point is before the first element, return - * failure. */ - if (high == 0 || cp < array[0]) { + /* If list is empty, return failure. */ + if (high == 0) { return -1; } + /* If the code point is before the first element, return failure. (We + * can't combine this with the test above, because we can't get the array + * unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and these give us extra information + * at the same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + /* Binary search. What we are looking for is such that * array[i] <= cp < array[i+1] - * The loop below converges on the i+1. */ + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ while (low < high) { - IV mid = (low + high) / 2; - if (array[mid] <= cp) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ low = mid + 1; /* We could do this extra test to exit the loop early. @@ -7351,7 +7440,10 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) } } - return high - 1; + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; } void @@ -7365,7 +7457,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV * that is all 0's on input */ UV current = start; - const IV len = invlist_len(invlist); + const IV len = _invlist_len(invlist); IV i; const UV * array; @@ -7378,7 +7470,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV array = invlist_array(invlist); /* Find which element it is */ - i = invlist_search(invlist, start); + i = _invlist_search(invlist, start); /* We populate from to */ while (current < end) { @@ -7397,7 +7489,15 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV current = array[i]; if (current >= end) { /* Finished if beyond the end of what we are populating */ - return; + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; } } assert(current >= start); @@ -7414,6 +7514,8 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV swatch[offset >> 3] |= 1 << (offset & 7); } + join_end_of_list: + /* Quit if at the end of the list */ if (i >= len) { @@ -7485,7 +7587,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co assert(a != b); /* If either one is empty, the union is the other one */ - if (a == NULL || ((len_a = invlist_len(a)) == 0)) { + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { if (*output == a) { if (a != NULL) { SvREFCNT_dec(a); @@ -7499,7 +7601,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } /* else *output already = b; */ return; } - else if ((len_b = invlist_len(b)) == 0) { + else if ((len_b = _invlist_len(b)) == 0) { if (*output == b) { SvREFCNT_dec(b); } @@ -7640,7 +7742,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ - if (len_u != invlist_len(u)) { + if (len_u != _invlist_len(u)) { invlist_set_len(u, len_u); invlist_trim(u); array_u = invlist_array(u); @@ -7719,8 +7821,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = invlist_len(a); - if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { + len_a = _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { if (len_a != 0 && complement_b) { @@ -7866,7 +7968,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ - if (len_r != invlist_len(r)) { + if (len_r != _invlist_len(r)) { invlist_set_len(r, len_r); invlist_trim(r); array_r = invlist_array(r); @@ -7914,13 +8016,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) len = 0; } else { - len = invlist_len(invlist); + len = _invlist_len(invlist); } /* If comes after the final entry, can just append it to the end */ if (len == 0 || start >= invlist_array(invlist) - [invlist_len(invlist) - 1]) + [_invlist_len(invlist) - 1]) { _append_range_to_invlist(invlist, start, end); return invlist; @@ -7954,7 +8056,7 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = get_invlist_len_addr(invlist); + UV* len_pos = _get_invlist_len_addr(invlist); PERL_ARGS_ASSERT__INVLIST_INVERT; @@ -7991,7 +8093,7 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist) _invlist_invert(invlist); - len = invlist_len(invlist); + len = _invlist_len(invlist); if (len != 0) { /* If empty do nothing */ array = invlist_array(invlist); @@ -8023,7 +8125,7 @@ S_invlist_clone(pTHX_ SV* const invlist) /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ - SV* new_invlist = _new_invlist(invlist_len(invlist) + 1); + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); STRLEN length = SvCUR(invlist); PERL_ARGS_ASSERT_INVLIST_CLONE; @@ -8074,7 +8176,7 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * will start over at the beginning of the list */ UV* pos = get_invlist_iter_addr(invlist); - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; @@ -8106,7 +8208,7 @@ S_invlist_highest(pTHX_ SV* const invlist) * 0, or if the list is empty. If this distinction matters to you, check * for emptiness before calling this function */ - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_HIGHEST; @@ -8159,15 +8261,17 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP void -S_invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) { /* Dumps out the ranges in an inversion list. The string 'header' * if present is output on a line before the first range */ UV start, end; + PERL_ARGS_ASSERT__INVLIST_DUMP; + if (header && strlen(header)) { PerlIO_printf(Perl_debug_log, "%s\n", header); } @@ -8176,13 +8280,88 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) if (end == UV_MAX) { PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); } + else if (end != start) { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", + start, end); + } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end); + PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); } } } #endif +#if 0 +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + UV* array_a = invlist_array(a); + UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later, and clear the flag as we don't have to do anything + * else later */ + + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + if (complement_b) { + array_b[0] = 1; + } + return retval; +} +#endif + #undef HEADER_LENGTH #undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE @@ -8215,11 +8394,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { dVAR; - register regnode *ret; /* Will be the head of the group. */ - register regnode *br; - register regnode *lastbr; - register regnode *ender = NULL; - register I32 parno = 0; + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; I32 flags; U32 oregflags = RExC_flags; bool have_branch = 0; @@ -8390,7 +8569,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -9225,9 +9404,9 @@ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) { dVAR; - register regnode *ret; - register regnode *chain = NULL; - register regnode *latest; + regnode *ret; + regnode *chain = NULL; + regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; @@ -9298,9 +9477,9 @@ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret; - register char op; - register char *next; + regnode *ret; + char op; + char *next; I32 flags; const char * const origparse = RExC_parse; I32 min; @@ -9309,6 +9488,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; #endif const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGPIECE; @@ -9355,6 +9538,23 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); RExC_parse = next; nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } do_curly: if ((flags&SIMPLE)) { @@ -9392,8 +9592,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max < min) - vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); @@ -9455,10 +9653,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ ckWARN3reg(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), origparse); + ReREFCNT_inc(RExC_rx_sv); } if (RExC_parse < RExC_end && *RExC_parse == '?') { @@ -9489,86 +9689,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } - -/* reg_namedseq(pRExC_state,UVp, UV depth) +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +{ - This is expected to be called by a parser routine that has - recognized '\N' and needs to handle the rest. RExC_parse is - expected to point at the first char following the N at the time - of the call. + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. - The \N may be inside (indicated by valuep not being NULL) or outside a + The \N may be inside (indicated by the boolean ) or outside a character class. \N may begin either a named sequence, or if outside a character class, mean to match a non-newline. For non single-quoted regexes, the tokenizer has - attempted to decide which, and in the case of a named sequence converted it + attempted to decide which, and in the case of a named sequence, converted it into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, where c1... are the characters in the sequence. For single-quoted regexes, the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those. The net effect is that if the - beginning of the passed-in pattern isn't '{U+' or there is no '}', it - signals that this \N occurrence means to match a non-newline. - + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + Only the \N{U+...} form should occur in a character class, for the same reason that '.' inside a character class means to just match a period: it just doesn't make sense. - - If valuep is non-null then it is assumed that we are parsing inside - of a charclass definition and the first codepoint in the resolved - string is returned via *valuep and the routine will return NULL. - In this mode if a multichar string is returned from the charnames - handler, a warning will be issued, and only the first char in the - sequence will be examined. If the string returned is zero length - then the value of *valuep is undefined and NON-NULL will - be returned to indicate failure. (This will NOT be a valid pointer - to a regnode.) - - If valuep is null then it is assumed that we are parsing normal text and a - new EXACT node is inserted into the program containing the resolved string, - and a pointer to the new node is returned. But if the string is zero length - a NOTHING node is emitted instead. - On success RExC_parse is set to the char following the endbrace. - Parsing failures will generate a fatal error via vFAIL(...) + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. */ -STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) -{ + char * endbrace; /* '}' following the name */ - regnode *ret = NULL; char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_REG_NAMEDSEQ; + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + /* The [^\n] meaning of \N ignores spaces and comments under the /x * modifier. The other meaning does not */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; - + /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ if (*p != '{' || regcurly(p)) { RExC_parse = p; - if (valuep) { + if (! node_p) { /* no bare \N in a charclass */ - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } nextchar(pRExC_state); - ret = reg_node(pRExC_state, REG_ANY); + *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; RExC_parse--; - Set_Node_Length(ret, 1); /* MJD */ - return ret; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; } - /* Here, we have decided it should be a named sequence */ + /* Here, we have decided it should be a named character or sequence */ /* The test above made sure that the next real character is a '{', but * under the /x modifier, it could be separated by space (or a comment and @@ -9590,44 +9799,48 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept } if (endbrace == RExC_parse) { /* empty: \N{} */ - if (! valuep) { - RExC_parse = endbrace + 1; - return reg_node(pRExC_state,NOTHING); - } - - if (SIZE_ONLY) { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class" - ); - RExC_parse = endbrace + 1; + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + } + ret = FALSE; } - *valuep = 0; - return (regnode *) &RExC_parse; /* Invalid regnode pointer */ + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; } - REQUIRE_UTF8; /* named sequences imply Unicode semantics */ + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ - if (valuep) { /* In a bracketed char class */ - /* We only pay attention to the first char of - multichar strings being returned. I kinda wonder + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. XXX Solution is to recharacterize as [rest-of-class]|multi1|multi2... */ - STRLEN length_of_hex; - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - - char * endchar = RExC_parse + strcspn(RExC_parse, ".}"); - if (endchar < endbrace) { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); - } - length_of_hex = (STRLEN)(endchar - RExC_parse); - *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); /* The tokenizer should have guaranteed validity, but it's possible to * bypass it by using single quoting, so check */ @@ -9639,16 +9852,27 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept ? UTF8SKIP(RExC_parse) : 1; /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) RExC_parse = endchar; + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } + } + + if (in_char_class && has_multiple_chars) { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } - RExC_parse = endbrace + 1; - if (endchar == endbrace) return NULL; + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { - ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; } - else { /* Not a char class */ + else { /* What is done here is to convert this to a sub-pattern of the form * (?:\x{char1}\x{char2}...) @@ -9661,16 +9885,11 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); STRLEN len; - char *endchar; /* Points to '.' or '}' ending cur char in the input - stream */ char *orig_end = RExC_end; + I32 flags; while (RExC_parse < endbrace) { - /* Code points are separated by dots. If none, there is only one - * code point, and is terminated by the brace */ - endchar = RExC_parse + strcspn(RExC_parse, ".}"); - /* Convert to notation the rest of the code understands */ sv_catpv(substitute_parse, "\\x{"); sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); @@ -9678,6 +9897,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); } sv_catpv(substitute_parse, ")"); @@ -9692,16 +9912,17 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* The values are Unicode, and therefore not subject to recoding */ RExC_override_recoding = 1; - ret = reg(pRExC_state, 1, flagp, depth+1); + *node_p = reg(pRExC_state, 1, &flags, depth+1); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; - nextchar(pRExC_state); + nextchar(pRExC_state); } - return ret; + return TRUE; } @@ -9738,6 +9959,100 @@ S_reg_recode(pTHX_ const char value, SV **encp) return uv; } +PERL_STATIC_INLINE U8 +S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. + * + * If is zero, the function assumes that the node is to contain only + * the single character given by and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with , if + * is 0. In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, UTF characters and the Latin Sharp S must be + * folded (the latter only when the rules indicate it can match 'ss') */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + if (! len_passed_in) { + if (UTF) { + if (FOLD) { + to_uni_fold(NATIVE_TO_UNI(code_point), character, &len); + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } + else if (! FOLD + || code_point != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED + || ! AT_LEAST_UNI_SEMANTICS) + { + *character = (U8) code_point; + len = 1; + } + else { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { + *flagp |= SIMPLE; + } +} /* - regatom - the lowest level @@ -9803,7 +10118,7 @@ STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret = NULL; + regnode *ret = NULL; I32 flags; char *parse_start = RExC_parse; U8 op; @@ -9851,13 +10166,12 @@ tryagain: case '[': { char * const oregcomp_parse = ++RExC_parse; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); } nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; } @@ -10068,7 +10382,7 @@ tryagain: } RExC_parse--; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); RExC_end = oldregxend; RExC_parse--; @@ -10076,16 +10390,24 @@ tryagain: Set_Node_Offset(ret, parse_start + 2); Set_Node_Cur_Length(ret); nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; } break; case 'N': - /* Handle \N and \N{NAME} here and not below because it can be - multicharacter. join_exact() will join them up later on. - Also this makes sure that things like /\N{BLAH}+/ and - \N{BLAH} being multi char Just Happen. dmq*/ + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL, flagp, depth); + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) { + RExC_parse--; + goto defchar; + } break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -10115,7 +10437,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -10186,7 +10508,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? REFFA : (AT_LEAST_UNI_SEMANTICS) ? REFFU @@ -10230,33 +10552,40 @@ tryagain: RExC_parse++; defchar: { - register STRLEN len; - register UV ender; - register char *p; + STRLEN len = 0; + UV ender; + char *p; char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; STRLEN foldlen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; U8 node_type; + bool next_is_quantifier; + char * oldp = NULL; - /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, - * it is folded to 'ss' even if not utf8 */ - bool is_exactfu_sharp_s; + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; ender = 0; - if (! FOLD) { - node_type = EXACT; - } - else { - node_type = get_regex_charset(RExC_flags); - if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) { - node_type--; /* /a is same as /u, and map /aa's offset to - what /a's would have been, so there is no - hole */ - } - node_type += EXACTF; - } + node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); - s = STRING(ret); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: + + /* We do the EXACTFish to EXACT node only if folding, and not if in + * locale, as whether a character folds or not isn't known until + * runtime */ + maybe_exact = FOLD && ! LOC; /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10276,11 +10605,11 @@ tryagain: * could back off to end with only a code point that isn't such a * non-final, but it is possible for there not to be any in the * entire node. */ - for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; len++) { - char * const oldp = p; + oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); @@ -10316,7 +10645,6 @@ tryagain: case 'g': case 'G': /* generic-backref, pos assertion */ case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ - case 'N': /* named char sequence */ case 'p': case 'P': /* Unicode property */ case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ @@ -10334,6 +10662,22 @@ tryagain: ender = '\n'; p++; break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE)) + { + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; case 'r': ender = '\r'; p++; @@ -10479,150 +10823,308 @@ tryagain: break; } /* End of switch on the literal */ - is_exactfu_sharp_s = (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S); + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if ((UTF && FOLD) || is_exactfu_sharp_s) { - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *tmpbuf = (U8) ender; - foldlen = 1; - } else { - *tmpbuf = UTF8_TWO_BYTE_HI(ender); - *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } - } - else if (isASCII(ender)) { /* Note: Here can't also be LOC - */ - ender = toLOWER(ender); - *tmpbuf = (U8) ender; - foldlen = 1; - } - else if (! MORE_ASCII_RESTRICTED && ! LOC) { - /* Locale and /aa require more selectivity about the - * fold, so are handled below. Otherwise, here, just - * use the fold */ - ender = toFOLD_uni(ender, tmpbuf, &foldlen); - } - else { - /* Under locale rules or /aa we are not to mix, - * respectively, ords < 256 or ASCII with non-. So - * reject folds that mix them, using only the - * non-folded code point. So do the fold to a - * temporary, and inspect each character in it. */ - U8 trialbuf[UTF8_MAXBYTES_CASE+1]; - U8* s = trialbuf; - UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); - U8* e = s + foldlen; - bool fold_ok = TRUE; - - while (s < e) { - if (isASCII(*s) - || (LOC && (UTF8_IS_INVARIANT(*s) - || UTF8_IS_DOWNGRADEABLE_START(*s)))) - { - fold_ok = FALSE; - break; - } - s += UTF8SKIP(s); - } - if (fold_ok) { - Copy(trialbuf, tmpbuf, foldlen, U8); - ender = tmpender; - } - else { - uvuni_to_utf8(tmpbuf, ender); - foldlen = UNISKIP(ender); - } - } - } - if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ - if (len) - p = oldp; - else if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - - /* tmpbuf has been constructed by us, so we - * know it is valid utf8 */ - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } - else { - len++; - REGC((char)ender, s++); - } - break; + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (FOLD) { + if (UTF + /* See comments for join_exact() as to why we fold + * this non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S)) + { + + + /* Prime the casefolded buffer. Locale rules, which + * apply only to code points < 256, aren't known until + * execution, so for them, just output the original + * character using utf8. If we start to fold non-UTF + * patterns, be sure to update join_exact() */ + if (LOC && ender < 256) { + if (UNI_IS_INVARIANT(ender)) { + *s = (U8) ender; + foldlen = 1; + } else { + *s = UTF8_TWO_BYTE_HI(ender); + *(s + 1) = UTF8_TWO_BYTE_LO(ender); + foldlen = 2; + } + } + else { + UV folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* If this node only contains non-folding code + * points so far, see if this new one is also + * non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have + * to check further to see if anything + * folds to it */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", + "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = + _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; + } + s += foldlen; + + /* The loop increments each time, as all but this + * path (and the one just below for UTF) through it add + * a single byte to the EXACTish node. But this one + * has changed len to be the correct final value, so + * subtract one to cancel out the increment that + * follows */ + len += foldlen - 1; + } + else { + *(s++) = ender; + maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); + } } - if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - len += unilen; - s += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - len--; + else if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* See comment just above for - 1 */ + len--; } else { REGC((char)ender, s++); + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; } - } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* These two have no multi-char folds to non-UTF characters + */ + if (ASCII_FOLD_RESTRICTED || LOC) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* No Latin1 characters participate in multi-char + * folds under /l */ + if (LOC + || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + loopdone: /* Jumped to when encounters something that shouldn't be in the node */ + + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds */ + if (FOLD && maybe_exact) { + OP(ret) = EXACT; + } + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else{ + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + } + RExC_parse = p - 1; Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); @@ -10632,20 +11134,10 @@ tryagain: if (iv < 0) vFAIL("Internal disaster"); } - if (len > 0) - *flagp |= HASWIDTH; - if (len == 1 && UNI_IS_INVARIANT(ender)) - *flagp |= SIMPLE; - if (SIZE_ONLY) - RExC_size += STR_SZ(len); - else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } - } + } /* End of label 'defchar:' */ break; - } + } /* End of giant switch on input character */ return(ret); } @@ -10687,8 +11179,8 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) -STATIC I32 -S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me) { dVAR; I32 namedclass = OOB_NAMEDCLASS; @@ -10722,7 +11214,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = ANYOF_ALNUM; + namedclass = ANYOF_WORDCHAR; break; case 5: /* Names all of length 5. */ @@ -10798,7 +11290,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) the class closes */ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') RExC_parse++; - Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + SvREFCNT_dec(free_me); + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); } } else { /* Maternal grandfather: @@ -10811,36 +11304,6 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) return namedclass; } -STATIC void -S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) -{ - dVAR; - - PERL_ARGS_ASSERT_CHECKPOSIXCC; - - if (POSIXCC(UCHARAT(RExC_parse))) { - const char *s = RExC_parse; - const char c = *s++; - - while (isALNUM(*s)) - s++; - if (*s && c == *s && s[1] == ']') { - ckWARN3reg(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); - - /* [[=foo=]] and [[.foo.]] are still future. */ - if (POSIXCC_NOTYET(c)) { - /* adjust RExC_parse so the error shows after - the class closes */ - while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') - NOOP; - Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); - } - } - } -} - /* Generate the code to add a full posix character to the bracketed * character class given by . ( is needed only under locale rules) * destlist is the inversion list for non-locale rules that this class is @@ -10974,50 +11437,49 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } \ } -STATIC void -S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) -{ - /* Adds input 'string' with length 'len' to the ANYOF node's unicode - * alternate list, pointed to by 'alternate_ptr'. This is an array of - * the multi-character folds of characters in the node */ - SV *sv; - - PERL_ARGS_ASSERT_ADD_ALTERNATE; - - if (! *alternate_ptr) { - *alternate_ptr = newAV(); - } - sv = newSVpvn_utf8((char*)string, len, TRUE); - av_push(*alternate_ptr, sv); - return; -} - /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) -/* - parse a class specification and produce either an ANYOF node that - matches the pattern or perhaps will be optimized into an EXACTish node - instead. The node contains a bit map for the first 256 characters, with the - corresponding bit set if that character is in the list. For characters - above 255, a range list is used */ +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((class) / 2) STATIC regnode * -S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { + /* parse a bracketed class specification. Most of these will produce an ANYOF node; + * but something like [a] will produce an EXACT node; [aA], an EXACTFish + * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with + * multi-character folds: it will be rewritten following the paradigm of + * this example, where the s are characters which fold to + * multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time */ + dVAR; - register UV nextvalue; - register IV prevvalue = OOB_UNICODE; - register IV range = 0; - UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ - register regnode *ret; + UV nextvalue; + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; + IV range = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; + regnode *ret; STRLEN numlen; IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; - bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ SV *listsv = NULL; STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ @@ -11026,6 +11488,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) extended beyond the Latin1 range */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ UV n; /* Unicode properties are stored in a swash; this holds the current one @@ -11049,14 +11513,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) * of the target string */ SV* cp_list = NULL; - /* List of multi-character folds that are matched by this node */ - AV* unicode_alternate = NULL; #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; #endif - UV stored = 0; /* how many chars stored in the bitmap */ bool invert = FALSE; /* Is this class to be complemented */ /* Is there any thing like \W or [:^digit:] that matches above the legal @@ -11066,6 +11527,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; + const I32 orig_size = RExC_size; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -11078,25 +11540,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, ANYOF, 0); - if (!SIZE_ONLY) { ANYOF_FLAGS(ret) = 0; } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ - RExC_naughty++; RExC_parse++; invert = TRUE; - - /* We have decided to not allow multi-char folds in inverted character - * classes, due to the confusion that can happen, especially with - * classes that are designed for a non-Unicode world: You have the - * peculiar case that: - "s s" =~ /^[^\xDF]+$/i => Y - "ss" =~ /^[^\xDF]+$/i => N - * - * See [perl #89750] */ - allow_full_fold = FALSE; + RExC_naughty++; } if (SIZE_ONLY) { @@ -11108,7 +11559,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; } - ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); initial_listsv_len = SvCUR(listsv); } @@ -11116,7 +11566,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; if (!SIZE_ONLY && POSIXCC(nextvalue)) - checkposixcc(pRExC_state); + { + const char *s = RExC_parse; + const char c = *s++; + + while (isALNUM(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + SAVEFREESV(listsv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); + } + } /* allow 1st char to be ] (allowing it to be - is dealt with later) */ if (UCHARAT(RExC_parse) == ']') @@ -11128,6 +11593,8 @@ parseit: charclassloop: namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; if (!range) { rangebegin = RExC_parse; @@ -11144,7 +11611,7 @@ parseit: nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; if (value == '[' && POSIXCC(nextvalue)) - namedclass = regpposixcc(pRExC_state, value); + namedclass = regpposixcc(pRExC_state, value, listsv); else if (value == '\\') { if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, @@ -11160,8 +11627,8 @@ parseit: * A similar issue a little bit later when switching on * namedclass. --jhi */ switch ((I32)value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; @@ -11177,17 +11644,21 @@ parseit: if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. */ - UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v, NULL, depth)) { + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE /* => charclass */)) + { goto parseit; } - value= v; } break; case 'p': case 'P': { char *e; + + /* This routine will handle any undefined properties */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { @@ -11208,7 +11679,6 @@ parseit: n = 1; } if (!SIZE_ONLY) { - SV** invlistsvp; SV* invlist; char* name; @@ -11243,18 +11713,10 @@ parseit: swash = _core_swash_init("utf8", name, &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ - TRUE, /* this routine will handle - undefined properties */ - NULL, FALSE /* No inversion list */ + NULL, /* No inversion list */ + &swash_init_flags ); - if ( ! swash - || ! SvROK(swash) - || ! SvTYPE(SvRV(swash)) == SVt_PVHV - || ! (invlistsvp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "INVLIST", FALSE)) - || ! (invlist = *invlistsvp)) - { + if (! swash || ! (invlist = _get_swash_invlist(swash))) { if (swash) { SvREFCNT_dec(swash); swash = NULL; @@ -11283,7 +11745,8 @@ parseit: * the swash is from a user-defined property, then this * whole character class should be regarded as such */ has_user_defined_property = - _is_swash_user_defined(swash); + (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); /* Invert if asking for the complement */ if (value == 'P') { @@ -11304,7 +11767,7 @@ parseit: Safefree(name); } RExC_parse = e + 1; - namedclass = ANYOF_MAX; /* no official name, but it's named */ + namedclass = ANYOF_UNIPROP; /* no official name, but it's named */ /* \p means they want Unicode semantics */ RExC_uni_semantics = 1; @@ -11379,9 +11842,13 @@ parseit: default: /* Allow \_ to not give an error */ if (!SIZE_ONLY && isALNUM(value) && value != '_') { + SAVEFREESV(RExC_rx_sv); + SAVEFREESV(listsv); ckWARN2reg(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); } break; } @@ -11426,9 +11893,13 @@ parseit: const int w = RExC_parse >= rangebegin ? RExC_parse - rangebegin : 0; + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + SAVEFREESV(listsv); ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); + ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); cp_list = add_cp_to_invlist(cp_list, '-'); cp_list = add_cp_to_invlist(cp_list, prevvalue); } @@ -11459,32 +11930,86 @@ parseit: runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } - else { + else +#endif /* Not isascii(); just use the hard-coded definition for it */ _invlist_union(posixes, PL_ASCII, &posixes); - } break; case ANYOF_NASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { +#endif _invlist_union_complement_2nd(posixes, PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } +#ifdef HAS_ISASCII } +#endif break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, posixes, + if (hasISBLANK || ! LOC) { + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale: We + use the ASCII range and the above-Latin1 range + code points */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 blanks */ + _invlist_intersection(PL_AboveLatin1, + PL_XPosixBlank, + &scratch_list); + /* Add it to the running total of posix classes */ + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + /* Add the ASCII-range blanks to the running total. */ + _invlist_union(posixes, PL_PosixBlank, &posixes); + } break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, posixes, - PL_PosixBlank, PL_XPosixBlank); + if (hasISBLANK || ! LOC) { + DO_N_POSIX(ret, namedclass, posixes, + PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 non-blanks */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, + &scratch_list); + + /* Add them to the running total of posix classes */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, + &scratch_list); + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + + /* Get the list of all non-ASCII-blanks in Latin 1, and + * add them to the running total */ + _invlist_subtract(PL_Latin1, PL_PosixBlank, + &scratch_list); + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } break; case ANYOF_CNTRL: DO_POSIX(ret, namedclass, posixes, @@ -11620,11 +12145,11 @@ parseit: } break; } - case ANYOF_ALNUM: /* Really is 'Word' */ + case ANYOF_WORDCHAR: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); break; - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, runtime_posix_matches_above_Unicode); @@ -11648,8 +12173,7 @@ parseit: DO_N_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; - case ANYOF_MAX: - /* this is to handle \p and \P */ + case ANYOF_UNIPROP: /* this is to handle \p and \P */ break; default: vFAIL("Invalid [::] class"); @@ -11661,7 +12185,7 @@ parseit: } /* end of namedclass \blah */ if (range) { - if (prevvalue > (IV)value) /* b-a */ { + if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); range = 0; /* not a valid range */ @@ -11685,8 +12209,10 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); } - if (!SIZE_ONLY) + if (!SIZE_ONLY) { cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ @@ -11702,8 +12228,100 @@ parseit: RExC_uni_semantics = 1; } - /* Ready to process either the single value, or the completed range */ - if (!SIZE_ONLY) { + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && ! invert && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to two + * characters; likewise for [3]. (Unicode guarantees a + * maximum of 3 characters in any fold.) When we + * rewrite the character class below, we will do so + * such that the longest folds are written first, so + * that it prefers the longest matching strings first. + * This is done even if it turns out that any + * quantifier is non-greedy, out of programmer + * laziness. Tom Christiansen has agreed that this is + * ok. This makes the test for the ligature 'ffi' come + * before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { #ifndef EBCDIC cp_list = _add_range_to_invlist(cp_list, prevvalue, value); #else @@ -11733,11 +12351,90 @@ parseit: range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec(multi_char_matches); + SvREFCNT_dec(listsv); + return ret; + } + /* If the character class contains only a single element, it may be * optimizable into another node type which is smaller and runs faster. * Check if this is the case for this class */ if (element_count == 1) { U8 op = END; + U8 arg = 0; if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or [:digit:] or \p{foo} */ @@ -11752,10 +12449,10 @@ parseit: * modifier to the regex. We first calculate the base node * type, and if it should be inverted */ - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: invert = ! invert; /* FALLTHROUGH */ - case ANYOF_ALNUM: + case ANYOF_WORDCHAR: op = ALNUM; goto join_charset_classes; @@ -11797,6 +12494,7 @@ parseit: if (invert) { op += NALNUM - ALNUM; } + *flagp |= HASWIDTH|SIMPLE; break; /* The second group doesn't depend of the charset modifiers. @@ -11805,7 +12503,9 @@ parseit: invert = ! invert; /* FALLTHROUGH */ case ANYOF_HORIZWS: + is_horizws: op = (invert) ? NHORIZWS : HORIZWS; + *flagp |= HASWIDTH|SIMPLE; break; case ANYOF_NVERTWS: @@ -11813,17 +12513,63 @@ parseit: /* FALLTHROUGH */ case ANYOF_VERTWS: op = (invert) ? NVERTWS : VERTWS; + *flagp |= HASWIDTH|SIMPLE; break; + case ANYOF_UNIPROP: + break; + case ANYOF_NBLANK: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_BLANK: + if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) { + goto is_horizws; + } + /* FALLTHROUGH */ + default: + /* A generic posix class. All the /a ones can be handled + * by the POSIXA opcode. And all are closed under folding + * in the ASCII range, so FOLD doesn't matter */ + if (AT_LEAST_ASCII_RESTRICTED + || (! LOC && namedclass == ANYOF_ASCII)) + { + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + op = (invert) ? NPOSIXA : POSIXA; + } + break; } } - else if (! LOC) { - if (invert && prevvalue == '\n' && value == '\n') { - op = REG_ANY; /* Optimize [^\n] */ + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); } - else if (prevvalue == '0' && value == '9') { - op = (invert) ? NDIGITA : DIGITA; + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + op = (invert) ? NDIGITA : DIGITA; + *flagp |= HASWIDTH|SIMPLE; + } } } @@ -11836,20 +12582,44 @@ parseit: * the parse */ const char * cur_parse = RExC_parse; RExC_parse = (char *)orig_parse; - RExC_emit = (regnode *)orig_emit; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_CLASS_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + } ret = reg_node(pRExC_state, op); + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + RExC_parse = (char *) cur_parse; + SvREFCNT_dec(posixes); SvREFCNT_dec(listsv); + SvREFCNT_dec(cp_list); return ret; } } if (SIZE_ONLY) return ret; - /****** !SIZE_ONLY AFTER HERE *********/ + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ @@ -11858,10 +12628,11 @@ parseit: SV* fold_intersection = NULL; - /* In the Latin1 range, the characters that can be folded-to or -from - * are precisely the alphabetic characters. If the highest code point - * is within Latin1, we can use the compiled-in list, and not have to - * go out to disk. */ + /* If the highest code point is within Latin1, we can use the + * compiled-in Alphas list, and not have to go out to disk. This + * yields two false positives, the masculine and feminine ordinal + * indicators, which are weeded out below using the + * IS_IN_SOME_FOLD_L1() macro */ if (invlist_highest(cp_list) < 256) { _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection); } @@ -11873,7 +12644,7 @@ parseit: if (! PL_utf8_foldable) { SV* swash = swash_init("utf8", "_Perl_Any_Folds", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); + PL_utf8_foldable = _get_swash_invlist(swash); SvREFCNT_dec(swash); } @@ -11887,7 +12658,7 @@ parseit: * rules hard-coded into Perl. (This case happens legitimately * during compilation of Perl itself before the Unicode tables * are generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { + if (_invlist_len(PL_utf8_foldable) == 0) { PL_utf8_foldclosures = newHV(); } else { @@ -11895,15 +12666,13 @@ parseit: * to force that */ if (! PL_utf8_tofold) { U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - /* This particular string is above \xff in both UTF-8 - * and UTFEBCDIC */ - to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + _swash_inversion_hash(PL_utf8_tofold); } } @@ -11930,7 +12699,7 @@ parseit: U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - UV f; + SV** listp; if (j < 256) { @@ -11942,7 +12711,7 @@ parseit: * mappings, though that is not really likely, and may be * caught by the default: case of the switch below. */ - if (PL_fold_latin1[j] != j) { + if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched only * under Unicode rules */ @@ -11957,33 +12726,16 @@ parseit: } if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! MORE_ASCII_RESTRICTED)) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { /* Certain Latin1 characters have matches outside - * Latin1, or are multi-character. To get here, 'j' is - * one of those characters. None of these matches is - * valid for ASCII characters under /aa, which is why - * the 'if' just above excludes those. The matches - * fall into three categories: - * 1) They are singly folded-to or -from an above 255 - * character, e.g., LATIN SMALL LETTER Y WITH - * DIAERESIS and LATIN CAPITAL LETTER Y WITH - * DIAERESIS; - * 2) They are part of a multi-char fold with another - * latin1 character; only LATIN SMALL LETTER - * SHARP S => "ss" fits this; - * 3) They are part of a multi-char fold with a - * character outside of Latin1, such as various - * ligatures. - * We aren't dealing fully with multi-char folds, except - * we do deal with the pattern containing a character - * that has a multi-char fold (not so much the inverse). - * For types 1) and 3), the matches only happen when the - * target string is utf8; that's not true for 2), and we - * set a flag for it. - * - * The code below adds the single fold closures for 'j' - * to the inversion list. */ + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ switch (j) { case 'k': case 'K': @@ -11997,9 +12749,9 @@ parseit: break; case MICRO_SIGN: cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, GREEK_CAPITAL_LETTER_MU); + cp_list = add_cp_to_invlist(cp_list, + GREEK_SMALL_LETTER_MU); break; case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: @@ -12013,20 +12765,6 @@ parseit: case LATIN_SMALL_LETTER_SHARP_S: cp_list = add_cp_to_invlist(cp_list, LATIN_CAPITAL_LETTER_SHARP_S); - - /* Under /a, /d, and /u, this can match the two - * chars "ss" */ - if (! MORE_ASCII_RESTRICTED) { - add_alternate(&unicode_alternate, - (U8 *) "ss", 2); - - /* And under /u or /a, it can match even if - * the target is not utf8 */ - if (AT_LEAST_UNI_SEMANTICS) { - ANYOF_FLAGS(ret) |= - ANYOF_NONBITMAP_NON_UTF8; - } - } break; case 'F': case 'f': case 'I': case 'i': @@ -12043,7 +12781,8 @@ parseit: * express, so they can't match unless the * target string is in UTF-8, so no action here * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching */ + * the general case for UTF-8 matching and + * multi-char folds */ break; default: /* Use deprecated warning to increase the @@ -12056,89 +12795,56 @@ parseit: } /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold */ - f = _to_uni_fold_flags(j, foldbuf, &foldlen, - ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) - | ((LOC) - ? FOLD_FLAGS_LOCALE - : (MORE_ASCII_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - if (foldlen > (STRLEN)UNISKIP(f)) { - - /* Any multicharacter foldings (disallowed in lookbehind - * patterns) require the following transform: [ABCDEF] -> - * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F - * folds into "rst", all other characters fold to single - * characters. We save away these multicharacter foldings, - * to be later saved as part of the additional "s" data. */ - if (! RExC_in_lookbehind) { - U8* loc = foldbuf; - U8* e = foldbuf + foldlen; - - /* If any of the folded characters of this are in the - * Latin1 range, tell the regex engine that this can - * match a non-utf8 target string. */ - while (loc < e) { - if (UTF8_IS_INVARIANT(*loc) - || UTF8_IS_DOWNGRADEABLE_START(*loc)) - { - ANYOF_FLAGS(ret) - |= ANYOF_NONBITMAP_NON_UTF8; - break; - } - loc += UTF8SKIP(loc); + * hard-coded for it. First, get its fold. This is the simple + * fold, as the multi-character folds have been handled earlier + * and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + ((LOC) + ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); } + c = SvUV(*c_p); - add_alternate(&unicode_alternate, foldbuf, foldlen); - } - } - else { - /* Single character fold of above Latin1. Add everything - * in its fold closure to the list that this node should - * match */ - SV** listp; - - /* The fold closures data structure is a hash with the keys - * being every character that is folded to, like 'k', and - * the values each an array of everything that folds to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; - * /l doesn't allow them between above and below - * 256 */ - if ((MORE_ASCII_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && ((c < 256) != (j < 256)))) - { - continue; - } + /* /aa doesn't allow folds between ASCII and non-; /l + * doesn't allow them between above and below 256 */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) + { + continue; + } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); - } - } - } - } + /* Folds involving non-ascii Latin1 characters + * under /d are added to a separate list */ + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + depends_list = add_cp_to_invlist(depends_list, c); + } + } + } } } SvREFCNT_dec(fold_intersection); @@ -12149,7 +12855,7 @@ parseit: * fold the classes (folding of those is automatically handled by the swash * fetching code) */ if (posixes) { - if (AT_LEAST_UNI_SEMANTICS) { + if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); SvREFCNT_dec(posixes); @@ -12159,7 +12865,6 @@ parseit: } } else { - /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; @@ -12239,17 +12944,16 @@ parseit: * Now we can see about various optimizations. Fold calculation (which we * did above) needs to take place before inversion. Otherwise /[^k]/i * would invert to include K, which under /i would match k, which it - * shouldn't. */ + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ - /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this does optimize those. It doesn't - * optimize locale. Doing so perhaps could be done as long as there is - * nothing like \w in it; some thought also would have to be given to the - * interaction with above 0x100 chars */ + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ if (invert - && ! LOC + && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) && ! depends_list - && ! unicode_alternate && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { _invlist_invert(cp_list); @@ -12264,10 +12968,147 @@ parseit: invert = FALSE; } + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching) */ + if (FOLD && LOC) + { + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACT node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. */ + if (value < 256) { + if (IS_IN_SOME_FOLD_L1(value)) { + op = EXACT; + } + } + else { + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + + SvREFCNT_dec(cp_list); + SvREFCNT_dec(listsv); + return ret; + } + } + /* Here, contains all the code points we can determine at * compile time that match under all conditions. Go through it, and * for things that belong in the bitmap, put them there, and delete from - * */ + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + ANYOF_BITMAP_ZERO(ret); if (cp_list) { /* This gets set if we actually need to modify things */ @@ -12281,6 +13122,10 @@ parseit: UV high; int i; + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + /* Quit if are above what we should change */ if (start > 255) { break; @@ -12293,7 +13138,6 @@ parseit: for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - stored++; prevvalue = value; value = i; } @@ -12307,7 +13151,7 @@ parseit: } /* If have completely emptied it, remove it completely */ - if (invlist_len(cp_list) == 0) { + if (_invlist_len(cp_list) == 0) { SvREFCNT_dec(cp_list); cp_list = NULL; } @@ -12317,7 +13161,9 @@ parseit: ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - /* Combine the two lists into one. */ + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ if (depends_list) { if (cp_list) { _invlist_union(cp_list, depends_list, &cp_list); @@ -12328,128 +13174,18 @@ parseit: } } - /* Folding in the bitmap is taken care of above, but not for locale (for - * which we have to wait to see what folding is in effect at runtime), and - * for some things not in the bitmap (only the upper latin folds in this - * case, as all other single-char folding has been set above). Set - * run-time fold flag for these */ - if (FOLD && (LOC - || (DEPENDS_SEMANTICS - && cp_list - && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - || unicode_alternate)) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; - } - - /* A single character class can be "optimized" into an EXACTish node. - * Note that since we don't currently count how many characters there are - * outside the bitmap, we are XXX missing optimization possibilities for - * them. This optimization can't happen unless this is a truly single - * character class, which means that it can't be an inversion into a - * many-character class, and there must be no possibility of there being - * things outside the bitmap. 'stored' (only) for locales doesn't include - * \w, etc, so have to make a special test that they aren't present - * - * Similarly A 2-character class of the very special form like [bB] can be - * optimized into an EXACTFish node, but only for non-locales, and for - * characters which only have the two folds; so things like 'fF' and 'Ii' - * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE - * FI'. */ - if (! cp_list - && ! unicode_alternate - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) - && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) - || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) - /* If the latest code point has a fold whose - * bit is set, it must be the only other one */ - && ((prevvalue = PL_fold_latin1[value]) != (IV)value) - && ANYOF_BITMAP_TEST(ret, prevvalue))))) - { - /* Note that the information needed to decide to do this optimization - * is not currently available until the 2nd pass, and that the actually - * used EXACTish node takes less space than the calculated ANYOF node, - * and hence the amount of space calculated in the first pass is larger - * than actually used, so this optimization doesn't gain us any space. - * But an EXACT node is faster than an ANYOF node, and can be combined - * with any adjacent EXACT nodes later by the optimizer for further - * gains. The speed of executing an EXACTF is similar to an ANYOF - * node, so the optimization advantage comes from the ability to join - * it to adjacent EXACT nodes */ - - const char * cur_parse= RExC_parse; - U8 op; - RExC_emit = (regnode *)orig_emit; - RExC_parse = (char *)orig_parse; - - if (stored == 1) { - - /* A locale node with one point can be folded; all the other cases - * with folding will have two points, since we calculate them above - */ - if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { - op = EXACTFL; - } - else { - op = EXACT; - } - } - else { /* else 2 chars in the bit map: the folds of each other */ - - /* Use the folded value, which for the cases where we get here, - * is just the lower case of the current one (which may resolve to - * itself, or to the other one */ - value = toLOWER_LATIN1(value); - - /* To join adjacent nodes, they must be the exact EXACTish type. - * Try to use the most likely type, by using EXACTFA if possible, - * then EXACTFU if the regex calls for it, or is required because - * the character is non-ASCII. (If is ASCII, its fold is - * also ASCII for the cases where we get here.) */ - if (MORE_ASCII_RESTRICTED && isASCII(value)) { - op = EXACTFA; - } - else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { - op = EXACTFU; - } - else { /* Otherwise, more likely to be EXACTF type */ - op = EXACTF; - } - } - - ret = reg_node(pRExC_state, op); - RExC_parse = (char *)cur_parse; - if (UTF && ! NATIVE_IS_INVARIANT(value)) { - *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); - *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); - STR_LEN(ret)= 2; - RExC_emit += STR_SZ(2); - } - else { - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); - } - SvREFCNT_dec(listsv); - return ret; - } - /* If there is a swash and more than one element, we can't use the swash in * the optimization below. */ if (swash && element_count > 1) { SvREFCNT_dec(swash); swash = NULL; } + if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - && ! unicode_alternate) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); SvREFCNT_dec(listsv); - SvREFCNT_dec(unicode_alternate); } else { /* av[0] stores the character class description in its textual form: @@ -12458,18 +13194,16 @@ parseit: * av[1] if NULL, is a placeholder to later contain the swash computed * from av[0]. But if no further computation need be done, the * swash is stored there now. - * av[2] stores the multicharacter foldings, used later in - * regexec.c:S_reginclass(). - * av[3] stores the cp_list inversion list for use in addition or + * av[2] stores the cp_list inversion list for use in addition or * instead of av[0]; used only if av[1] is NULL - * av[4] is set if any component of the class is from a user-defined + * av[3] is set if any component of the class is from a user-defined * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv - : &PL_sv_undef); + : (SvREFCNT_dec(listsv), &PL_sv_undef)); if (swash) { av_store(av, 1, swash); SvREFCNT_dec(cp_list); @@ -12477,27 +13211,18 @@ parseit: else { av_store(av, 1, NULL); if (cp_list) { - av_store(av, 3, cp_list); - av_store(av, 4, newSVuv(has_user_defined_property)); + av_store(av, 2, cp_list); + av_store(av, 3, newSVuv(has_user_defined_property)); } } - /* Store any computed multi-char folds only if we are allowing - * them */ - if (allow_full_fold) { - av_store(av, 2, MUTABLE_SV(unicode_alternate)); - if (unicode_alternate) { /* This node is variable length */ - OP(ret) = ANYOFV; - } - } - else { - av_store(av, 2, NULL); - } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); } + + *flagp |= HASWIDTH|SIMPLE; return ret; } #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION @@ -12591,7 +13316,7 @@ STATIC regnode * /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12633,7 +13358,7 @@ STATIC regnode * /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12705,9 +13430,9 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) { dVAR; - register regnode *src; - register regnode *dst; - register regnode *place; + regnode *src; + regnode *dst; + regnode *place; const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; @@ -12793,7 +13518,7 @@ STATIC void S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGTAIL; @@ -12852,7 +13577,7 @@ STATIC U8 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; U8 exact = PSEUDO; #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; @@ -13093,7 +13818,43 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { #ifdef DEBUGGING dVAR; - register int k; + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "[\\w]", + "[\\W]", + "[\\s]", + "[\\S]", + "[\\d]", + "[\\D]", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]", + "[\\v]", + "[\\V]" + }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -13214,43 +13975,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", - "[:alpha:]", - "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", - "[:lower:]", - "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", - "[:upper:]", - "[:^upper:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:blank:]", - "[:^blank:]" - }; if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); - if (flags & ANYOF_LOC_NONBITMAP_FOLD) + if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) @@ -13300,7 +14028,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (ANYOF_NONBITMAP(o)) { SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); + SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); bool byte_output = FALSE; /* If something in the bitmap has been output */ @@ -13384,6 +14112,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } + else if (k == POSIXD || k == NPOSIXD) { + U8 index = FLAGS(o) * 2; + if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + else { + sv_catpv(sv, anyofs[index]); + } + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); #else @@ -13398,7 +14135,7 @@ SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; - struct regexp *const prog = (struct regexp *)SvANY(r); + struct regexp *const prog = ReANY(r); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_STRING; @@ -13446,7 +14183,7 @@ void Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_PREGFREE2; @@ -13456,6 +14193,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) } else { CALLREGFREE_PVT(rx); /* free the private data */ SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); } if (r->substrs) { SvREFCNT_dec(r->anchored_substr); @@ -13465,11 +14203,12 @@ Perl_pregfree2(pTHX_ REGEXP *rx) Safefree(r->substrs); } RX_MATCH_COPY_FREE(rx); -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW SvREFCNT_dec(r->saved_copy); #endif Safefree(r->offs); SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; } /* reg_temp_copy() @@ -13493,26 +14232,42 @@ REGEXP * Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { struct regexp *ret; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; PERL_ARGS_ASSERT_REG_TEMP_COPY; if (!ret_x) ret_x = (REGEXP*) newSV_type(SVt_REGEXP); - ret = (struct regexp *)SvANY(ret_x); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); - (void)ReREFCNT_inc(rx); - /* We can take advantage of the existing "copied buffer" mechanism in SVs - by pointing directly at the buffer, but flagging that the allocated - space in the copy is zero. As we've just done a struct copy, it's now - a case of zero-ing that, rather than copying the current length. */ - SvPV_set(ret_x, RX_WRAPPED(rx)); - SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ memcpy(&(ret->xpv_cur), &(r->xpv_cur), sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); - SvLEN_set(ret_x, 0); - SvSTASH_set(ret_x, NULL); - SvMAGIC_set(ret_x, NULL); if (r->offs) { const I32 npar = r->nparens+1; Newx(ret->offs, npar, regexp_paren_pair); @@ -13531,10 +14286,10 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) anchored or float namesakes, and don't hold a second reference. */ } RX_MATCH_COPIED_off(ret_x); -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW ret->saved_copy = NULL; #endif - ret->mother_re = rx; + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); return ret_x; @@ -13557,7 +14312,7 @@ void Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -13678,8 +14433,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; I32 npar; - const struct regexp *r = (const struct regexp *)SvANY(sstr); - struct regexp *ret = (struct regexp *)SvANY(dstr); + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); PERL_ARGS_ASSERT_RE_DUP_GUTS; @@ -13739,25 +14494,18 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW ret->saved_copy = NULL; #endif - if (ret->mother_re) { - if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { - /* Our storage points directly to our mother regexp, but that's + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's 1: a buffer in a different thread 2: something we no longer hold a reference on so we need to copy it locally. */ - /* Note we need to use SvCUR(), rather than - SvLEN(), on our mother_re, because it, in - turn, may well be pointing to its own mother_re. */ - SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), - SvCUR(ret->mother_re)+1)); - SvLEN_set(dstr, SvCUR(ret->mother_re)+1); - } - ret->mother_re = NULL; - } + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -13780,7 +14528,7 @@ void * Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); regexp_internal *reti; int len; RXi_GET_DECL(r,ri); @@ -13884,10 +14632,10 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) - regnext - dig the "next" pointer out of a node */ regnode * -Perl_regnext(pTHX_ register regnode *p) +Perl_regnext(pTHX_ regnode *p) { dVAR; - register I32 offset; + I32 offset; if (!p) return(NULL); @@ -13961,11 +14709,13 @@ Perl_save_re_context(pTHX) PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; + PL_reg_oldsavedoffset = 0; + PL_reg_oldsavedcoffset = 0; PL_reg_maxiter = 0; PL_reg_leftiter = 0; PL_reg_poscache = NULL; PL_reg_poscache_size = 0; -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW PL_nrs = NULL; #endif @@ -13991,13 +14741,6 @@ Perl_save_re_context(pTHX) } #endif -static void -clear_re(pTHX_ void *r) -{ - dVAR; - ReREFCNT_dec((REGEXP *)r); -} - #ifdef DEBUGGING STATIC void @@ -14046,8 +14789,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, SV* sv, I32 indent, U32 depth) { dVAR; - register U8 op = PSEUDO; /* Arbitrary non-END op. */ - register const regnode *next; + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; const regnode *optstart= NULL; RXi_GET_DECL(r,ri); @@ -14098,9 +14841,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); { - register const regnode *nnode = (OP(next) == LONGJMP - ? regnext((regnode *)next) - : next); + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); if (last && nnode > last) nnode = last; DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);