X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/61dad979a56eaefa315dbe8b01c52f0cb2723105..93b74b47eb5e224de232c87de1cafcd5a43fd945:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 921c0e9..d7a289c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -89,10 +89,17 @@ extern const struct regexp_engine my_reg_engine; #include "dquote_static.c" #include "charclass_invlists.h" #include "inline_invlist.c" -#include "utf8_strings.h" +#include "unicode_constants.h" + +#ifdef HAS_ISBLANK +# define hasISBLANK 1 +#else +# define hasISBLANK 0 +#endif #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) +#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #ifdef op #undef op @@ -152,6 +159,7 @@ typedef struct RExC_state_t { I32 in_lookbehind; I32 contains_locale; I32 override_recoding; + I32 in_multi_char_class; struct reg_code_block *code_blocks; /* positions of literal (?{}) within pattern */ int num_code_blocks; /* size of code_blocks[] */ @@ -206,7 +214,8 @@ typedef struct RExC_state_t { #define RExC_recurse_count (pRExC_state->recurse_count) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_override_recoding (pRExC_state->override_recoding) +#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') @@ -222,11 +231,12 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single - * character, and if utf8, must be invariant. Note that this is not the same - * thing as REGNODE_SIMPLE */ +/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single + * character. (There needs to be a case: in the switch statement in regexec.c + * for any node marked SIMPLE.) Note that this is not the same thing as + * REGNODE_SIMPLE */ #define SIMPLE 0x02 -#define SPSTART 0x04 /* Starts with * or +. */ +#define SPSTART 0x04 /* Starts with * or + */ #define TRYAGAIN 0x08 /* Weeded out a declaration. */ #define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ @@ -291,8 +301,8 @@ typedef struct RExC_state_t { string can occur infinitely far to the right. - minlenp - A pointer to the minimum length of the pattern that the string - was found inside. This is important as in the case of positive + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive lookahead or positive lookbehind we can have multiple patterns involved. Consider @@ -439,7 +449,7 @@ static const scan_data_t zero_scan_data = IV len = RExC_end - RExC_precomp; \ \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ if (len > RegexLengthToShowInErrorMessages) { \ /* chop 10 shorter than the max, to ensure meaning of "..." */ \ len = RegexLengthToShowInErrorMessages - 10; \ @@ -470,7 +480,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL(m) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL(m); \ } STMT_END @@ -488,7 +498,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL2(m,a1) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL2(m, a1); \ } STMT_END @@ -507,7 +517,7 @@ static const scan_data_t zero_scan_data = */ #define vFAIL3(m,a1,a2) STMT_START { \ if (!SIZE_ONLY) \ - SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \ + SAVEFREESV(RExC_rx_sv); \ Simple_vFAIL3(m, a1, a2); \ } STMT_END @@ -685,8 +695,6 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ PerlIO_printf(Perl_debug_log,"\n"); \ }); -static void clear_re(pTHX_ void *r); - /* Mark that we cannot extend a found fixed substring at this point. Update the longest found anchored substring and the longest found floating substrings if needed. */ @@ -750,7 +758,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c ANYOF_BITMAP_SETALL(cl); cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL - |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; + |ANYOF_NON_UTF8_LATIN1_ALL; /* If any portion of the regex is to operate under locale rules, * initialization includes it. The reason this isn't done for all regexes @@ -761,7 +769,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c * necessary. */ if (RExC_contains_locale) { ANYOF_CLASS_SETALL(cl); /* /l uses class */ - cl->flags |= ANYOF_LOCALE; + cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; } else { ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ @@ -816,8 +824,8 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) { + && !(and_with->flags & ANYOF_LOC_FOLD) + && !(cl->flags & ANYOF_LOC_FOLD)) { int i; if (and_with->flags & ANYOF_INVERT) @@ -950,8 +958,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { + && !(or_with->flags & ANYOF_LOC_FOLD) + && !(cl->flags & ANYOF_LOC_FOLD) ) { int i; for (i = 0; i < ANYOF_BITMAP_SIZE; i++) @@ -977,18 +985,14 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con } else { /* 'or_with' is not inverted */ /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) - && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) { + && (!(or_with->flags & ANYOF_LOC_FOLD) + || (cl->flags & ANYOF_LOC_FOLD)) ) { int i; /* OR char bitmap and class bitmap separately */ for (i = 0; i < ANYOF_BITMAP_SIZE; i++) cl->bitmap[i] |= or_with->bitmap[i]; - if (ANYOF_CLASS_TEST_ANY_SET(or_with)) { - for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++) - cl->classflags[i] |= or_with->classflags[i]; - cl->flags |= ANYOF_CLASS; - } + ANYOF_CLASS_OR(or_with, cl); } else { /* XXXX: logic is complicated, leave it along for a moment. */ cl_anything(pRExC_state, cl); @@ -2581,10 +2585,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode /* The below joins as many adjacent EXACTish nodes as possible into a single - * one, and looks for problematic sequences of characters whose folds vs. - * non-folds have sufficiently different lengths, that the optimizer would be - * fooled into rejecting legitimate matches of them, and the trie construction - * code needs to handle specially. The joining is only done if: + * one. The regop may be changed if the node(s) contain certain sequences that + * require special handling. The joining is only done if: * 1) there is room in the current conglomerated node to entirely contain the * next one. * 2) they are the exact same node type @@ -2592,73 +2594,58 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * - * If there are problematic code sequences, *min_subtract is set to the delta - * that the minimum size of the node can be less than its actual size. And, - * the node type of the result is changed to reflect that it contains these - * sequences. + * If a node is to match under /i (folded), the number of characters it matches + * can be different than its character length if it contains a multi-character + * fold. *min_subtract is set to the total delta of the input nodes. * * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF * and contains LATIN SMALL LETTER SHARP S * * This is as good a place as any to discuss the design of handling these - * problematic sequences. It's been wrong in Perl for a very long time. There - * are three code points currently in Unicode whose folded lengths differ so - * much from the un-folded lengths that it causes problems for the optimizer - * and trie construction. Why only these are problematic, and not others where - * lengths also differ is something I (khw) do not understand. New versions of - * Unicode might add more such code points. Hopefully the logic in - * fold_grind.t that figures out what to test (in part by verifying that each - * size-combination gets tested) will catch any that do come along, so they can - * be added to the special handling below. The chances of new ones are - * actually rather small, as most, if not all, of the world's scripts that have - * casefolding have already been encoded by Unicode. Also, a number of - * Unicode's decisions were made to allow compatibility with pre-existing - * standards, and almost all of those have already been dealt with. These - * would otherwise be the most likely candidates for generating further tricky - * sequences. In other words, Unicode by itself is unlikely to add new ones - * unless it is for compatibility with pre-existing standards, and there aren't - * many of those left. - * - * The previous designs for dealing with these involved assigning a special - * node for them. This approach doesn't work, as evidenced by this example: + * multi-character fold sequences. It's been wrong in Perl for a very long + * time. There are three code points in Unicode whose multi-character folds + * were long ago discovered to mess things up. The previous designs for + * dealing with these involved assigning a special node for them. This + * approach doesn't work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node - * that would match just the \xDF, it won't be able to handle the case where a + * Both these fold to "sss", but if the pattern is parsed to create a node that + * would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node * that is "sss". * - * There are a number of components to the approach (a lot of work for just - * three code points!): - * 1) This routine examines each EXACTFish node that could contain the - * problematic sequences. It returns in *min_subtract how much to + * It turns out that there are problems with all multi-character folds, and not + * just these three. Now the code is general, for all such cases, but the + * three still have some special handling. The approach taken is: + * 1) This routine examines each EXACTFish node that could contain multi- + * character fold sequences. It returns in *min_subtract how much to * subtract from the the actual length of the string to get a real minimum - * for one that could match it. This number is usually 0 except for the - * problematic sequences. This delta is used by the caller to adjust the - * min length of the match, and the delta between min and max, so that the - * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences require special handling by the trie code, so this code - * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and - * EXACTFU_SS. - * 3) This is sufficient for the two Greek sequences (described below), but - * the one involving the Sharp s (\xDF) needs more. The node type - * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" - * sequence in it. For non-UTF-8 patterns and strings, this is the only - * case where there is a possible fold length change. That means that a - * regular EXACTFU node without UTF-8 involvement doesn't have to concern - * itself with length changes, and so can be processed faster. regexec.c - * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 - * is pre-folded by regcomp.c. This saves effort in regex matching. + * match length; it is 0 if there are no multi-char folds. This delta is + * used by the caller to adjust the min length of the match, and the delta + * between min and max, so that the optimizer doesn't reject these + * possibilities based on size constraints. + * 2) Certain of these sequences require special handling by the trie code, + * so, if found, this code changes the joined node type to special ops: + * EXACTFU_TRICKYFOLD and EXACTFU_SS. + * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS + * is used for an EXACTFU node that contains at least one "ss" sequence in + * it. For non-UTF-8 patterns and strings, this is the only case where + * there is a possible fold length change. That means that a regular + * EXACTFU node without UTF-8 involvement doesn't have to concern itself + * with length changes, and so can be processed faster. regexec.c takes + * advantage of this. Generally, an EXACTFish node that is in UTF-8 is + * pre-folded by regcomp.c. This saves effort in regex matching. * However, the pre-folding isn't done for non-UTF8 patterns because the - * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL - * nodes fold to isn't known until runtime. The fold possibilities for - * the non-UTF8 patterns are quite simple, except for the sharp s. All - * the ones that don't involve a UTF-8 target string are members of a - * fold-pair, and arrays are set up for all of them so that the other - * member of the pair can be found quickly. Code elsewhere in this file - * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even - * if the pattern isn't UTF-8. This avoids the issues described in the - * next item. + * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things + * down by forcing the pattern into UTF8 unless necessary. Also what + * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string are + * members of a fold-pair, and arrays are set up for all of them so that + * the other member of the pair can be found quickly. Code elsewhere in + * this file makes sure that in EXACTFU nodes, the sharp s gets folded to + * 'ss', even if the pattern isn't UTF-8. This avoids the issues + * described in the next item. * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches * 'ss' or not is not knowable at compile time. It will match iff the * target string is in UTF-8, unlike the EXACTFU nodes, where it always @@ -2778,19 +2765,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ if (OP(scan) != EXACT) { - U8 *s; - U8 * s0 = (U8*) STRING(scan); - U8 * const s_end = s0 + STR_LEN(scan); - - /* The below is perhaps overboard, but this allows us to save a test - * each time through the loop at the expense of a mask. This is - * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a - * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. - * This uses an exclusive 'or' to find that bit and then inverts it to - * form a mask, with just a single 0, in the bit position where 'S' and - * 's' differ. */ - const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); - const U8 s_masked = 's' & S_or_s_mask; + const U8 * const s0 = (U8*) STRING(scan); + const U8 * s = s0; + const U8 * const s_end = s0 + STR_LEN(scan); /* One pass is made over the node's string looking for all the * possibilities. to avoid some tests in the loop, there are two main @@ -2798,143 +2775,137 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * non-UTF-8 */ if (UTF) { - /* There are two problematic Greek code points in Unicode - * casefolding - * - * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - * - * which casefold to - * - * Unicode UTF-8 - * - * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - * - * This means that in case-insensitive matching (or "loose - * matching", as Unicode calls it), an EXACTF of length six (the - * UTF-8 encoded byte length of the above casefolded versions) can - * match a target string of length two (the byte length of UTF-8 - * encoded U+0390 or U+03B0). This would rather mess up the - * minimum length computation. (there are other code points that - * also fold to these two sequences, but the delta is smaller) - * - * If these sequences are found, the minimum length is decreased by - * four (six minus two). - * - * Similarly, 'ss' may match the single char and byte LATIN SMALL - * LETTER SHARP S. We decrease the min length by 1 for each - * occurrence of 'ss' found */ - -#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE -#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE - const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8; - const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL - COMBINING_DIAERESIS_UTF8 - COMBINING_ACUTE_ACCENT_UTF8; - const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; - yields a net of 0 */ - /* Examine the string for one of the problematic sequences */ - for (s = s0; - s < s_end - 1; /* Can stop 1 before the end, as minimum length - * sequence we are looking for is 2 */ - s += UTF8SKIP(s)) + /* Examine the string for a multi-character fold sequence. UTF-8 + * patterns have all characters pre-folded by the time this code is + * executed */ + while (s < s_end - 1) /* Can stop 1 before the end, as minimum + length sequence we are looking for is 2 */ { + int count = 0; + int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end); + if (! len) { /* Not a multi-char fold: get next char */ + s += UTF8SKIP(s); + continue; + } - /* Look for the first byte in each problematic sequence */ - switch (*s) { - /* We don't have to worry about other things that fold to - * 's' (such as the long s, U+017F), as all above-latin1 - * code points have been pre-folded */ - case 's': - case 'S': - - /* Current character is an 's' or 'S'. If next one is - * as well, we have the dreaded sequence */ - if (((*(s+1) & S_or_s_mask) == s_masked) - /* These two node types don't have special handling - * for 'ss' */ - && OP(scan) != EXACTFL && OP(scan) != EXACTFA) - { - *min_subtract += 1; - OP(scan) = EXACTFU_SS; - s++; /* No need to look at this character again */ - } - break; - - case U390_FIRST_BYTE: - if (s_end - s >= len - - /* The 1's are because are skipping comparing the - * first byte */ - && memEQ(s + 1, U390_tail, len - 1)) - { - goto greek_sequence; - } - break; + /* Nodes with 'ss' require special handling, except for EXACTFL + * and EXACTFA for which there is no multi-char fold to this */ + if (len == 2 && *s == 's' && *(s+1) == 's' + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + count = 2; + OP(scan) = EXACTFU_SS; + s += 2; + } + else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */ + && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8 + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8, + 6) + || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8 + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8, + 6))) + { + count = 3; + + /* These two folds require special handling by trie's, so + * change the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this would + * have to be changed. If this node has already been + * changed to EXACTFU_SS in this loop, leave it as is. (I + * (khw) think it doesn't matter in regexec.c for UTF + * patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_TRICKYFOLD; + } + s += 6; + } + else { /* Here is a generic multi-char fold. */ + const U8* multi_end = s + len; + + /* Count how many characters in it. In the case of /l and + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. (In + * EXACTFL, no folds are allowed to any Latin1 code point, + * not just ASCII. But there aren't any of these + * currently, nor ever likely, so don't take the time to + * test for them. The code that generates the + * is_MULTI_foo() macros croaks should one actually get put + * into Unicode .) */ + if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + count = utf8_length(s, multi_end); + s = multi_end; + } + else { + while (s < multi_end) { + if (isASCII(*s)) { + s++; + goto next_iteration; + } + else { + s += UTF8SKIP(s); + } + count++; + } + } + } - case U3B0_FIRST_BYTE: - if (! (s_end - s >= len - && memEQ(s + 1, U3B0_tail, len - 1))) - { - break; - } - greek_sequence: - *min_subtract += 4; - - /* This requires special handling by trie's, so change - * the node type to indicate this. If EXACTFA and - * EXACTFL were ever to be handled by trie's, this - * would have to be changed. If this node has already - * been changed to EXACTFU_SS in this loop, leave it as - * is. (I (khw) think it doesn't matter in regexec.c - * for UTF patterns, but no need to change it */ - if (OP(scan) == EXACTFU) { - OP(scan) = EXACTFU_TRICKYFOLD; - } - s += 6; /* We already know what this sequence is. Skip - the rest of it */ - break; - } + /* The delta is how long the sequence is minus 1 (1 is how long + * the character that folds to the sequence is) */ + *min_subtract += count - 1; + next_iteration: ; } } else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { - /* Here, the pattern is not UTF-8. We need to look only for the - * 'ss' sequence, and in the EXACTF case, the sharp s, which can be - * in the final position. Otherwise we can stop looking 1 byte - * earlier because have to find both the first and second 's' */ + /* Here, the pattern is not UTF-8. Look for the multi-char folds + * that are all ASCII. As in the above case, EXACTFL and EXACTFA + * nodes can't have multi-char folds to this range (and there are + * no existing ones in the upper latin1 range). In the EXACTF + * case we look also for the sharp s, which can be in the final + * position. Otherwise we can stop looking 1 byte earlier because + * have to find at least two characters for a multi-fold */ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; - for (s = s0; s < upper; s++) { - switch (*s) { - case 'S': - case 's': - if (s_end - s > 1 - && ((*(s+1) & S_or_s_mask) == s_masked)) - { - *min_subtract += 1; - - /* EXACTF nodes need to know that the minimum - * length changed so that a sharp s in the string - * can match this ss in the pattern, but they - * remain EXACTF nodes, as they won't match this - * unless the target string is is UTF-8, which we - * don't know until runtime */ - if (OP(scan) != EXACTF) { - OP(scan) = EXACTFU_SS; - } - s++; - } - break; - case LATIN_SMALL_LETTER_SHARP_S: - if (OP(scan) == EXACTF) { - *has_exactf_sharp_s = TRUE; - } - break; + /* The below is perhaps overboard, but this allows us to save a + * test each time through the loop at the expense of a mask. This + * is because on both EBCDIC and ASCII machines, 'S' and 's' differ + * by a single bit. On ASCII they are 32 apart; on EBCDIC, they + * are 64. This uses an exclusive 'or' to find that bit and then + * inverts it to form a mask, with just a single 0, in the bit + * position where 'S' and 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + while (s < upper) { + int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end); + if (! len) { /* Not a multi-char fold. */ + if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF) + { + *has_exactf_sharp_s = TRUE; + } + s++; + continue; + } + + if (len == 2 + && ((*s & S_or_s_mask) == s_masked) + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + + /* EXACTF nodes need to know that the minimum length + * changed so that a sharp s in the string can match this + * ss in the pattern, but they remain EXACTF nodes, as they + * won't match this unless the target string is is UTF-8, + * which we don't know until runtime */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } } + + *min_subtract += len - 1; + s += len; } } } @@ -3025,7 +2996,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0, pars = 0, code; + I32 min = 0; /* There must be at least this number of characters to match */ + I32 pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); @@ -3052,9 +3024,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How much to subtract from the minimum node - length to get a real minimum (because the - folded version may be shorter) */ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); @@ -3419,7 +3391,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * trietype so we can turn them into a trie. If/when we * allow NOTHING to start a trie sequence this condition will be * required, and it isn't expensive so we leave it in for now. */ - if ( trietype != NOTHING ) + if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, startbranch, first, cur, tail, count, trietype, depth+1 ); @@ -3450,7 +3422,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + if ( last && trietype ) { if ( trietype != NOTHING ) { /* the last branch of the sequence was part of a trie, * so we have to construct it here outside of the loop @@ -3607,7 +3579,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (uc >= 0x100 || (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) && !ANYOF_BITMAP_TEST(data->start_class, uc) - && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) + && (!(data->start_class->flags & ANYOF_LOC_FOLD) || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) ) { @@ -3666,9 +3638,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, RExC_seen |= REG_SEEN_EXACTF_SHARP_S; } min += l - min_subtract; - if (min < 0) { - min = 0; - } + assert (min >= 0); delta += min_subtract; if (flags & SCF_DO_SUBSTR) { data->pos_min += l - min_subtract; @@ -3695,12 +3665,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (compat) { ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; - data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { /* XXX This set is probably no longer necessary, and * probably wrong as LOCALE now is on in the initial * state */ - data->start_class->flags |= ANYOF_LOCALE; + data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD; } else { @@ -3735,7 +3704,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else if (flags & SCF_DO_STCLASS_OR) { - if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) { + if (data->start_class->flags & ANYOF_LOC_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... */ if (uc < 0x100) { @@ -3898,8 +3867,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ { + /* Fatal warnings may leak the regexp without this: */ + SAVEFREESV(RExC_rx_sv); ckWARNreg(RExC_parse, "Quantifier unexpected on zero-length expression"); + (void)ReREFCNT_inc(RExC_rx_sv); } min += minnext * mincount; @@ -4157,8 +4129,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; } - min += 1; - delta += 1; + min++; + delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ data->pos_min += 1; @@ -4209,7 +4181,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case ALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR); if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (!isWORDCHAR_L1(value)) { @@ -4227,7 +4199,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR); /* Even if under locale, set the bits for non-locale * in case it isn't a true locale-node. This will @@ -4250,7 +4222,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case NALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR); if (OP(scan) == NALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { @@ -4268,7 +4240,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR); /* Even if under locale, set the bits for non-locale in * case it isn't a true locale-node. This will create @@ -4897,13 +4869,18 @@ Perl_reginitcolors(pTHX) #ifdef TRIE_STUDY_OPT -#define CHECK_RESTUDY_GOTO \ +#define CHECK_RESTUDY_GOTO_butfirst(dOsomething) \ + STMT_START { \ if ( \ (data.flags & SCF_TRIE_RESTUDY) \ && ! restudied++ \ - ) goto reStudy + ) { \ + dOsomething; \ + goto reStudy; \ + } \ + } STMT_END #else -#define CHECK_RESTUDY_GOTO +#define CHECK_RESTUDY_GOTO_butfirst #endif /* @@ -5030,7 +5007,7 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, * * becomes * - * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * qr'a\\bc_______________________def\'ghi\\\\jkl(?{"this is runtime"})mno' * * After eval_sv()-ing that, grab any new code blocks from the returned qr * and merge them with any code blocks of the original regexp. @@ -5083,7 +5060,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, /* blank out literal code block */ assert(pat[s] == '('); while (s <= pRExC_state->code_blocks[n].end) { - *p++ = ' '; + *p++ = '_'; s++; } s--; @@ -5120,8 +5097,15 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, SPAGAIN; qr_ref = POPs; PUTBACK; - if (SvTRUE(ERRSV)) - Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + { + SV * const errsv = ERRSV; + if (SvTRUE_NN(errsv)) + { + Safefree(pRExC_state->code_blocks); + /* use croak_sv ? */ + Perl_croak_nocontext("%s", SvPV_nolen_const(errsv)); + } + } assert(SvROK(qr_ref)); qr = SvRV(qr_ref); assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); @@ -5148,13 +5132,16 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, /* merge the main (r1) and run-time (r2) code blocks into one */ { - RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + RXi_GET_DECL(ReANY((REGEXP *)qr), r2); struct reg_code_block *new_block, *dst; RExC_state_t * const r1 = pRExC_state; /* convenient alias */ int i1 = 0, i2 = 0; if (!r2->num_code_blocks) /* we guessed wrong */ + { + SvREFCNT_dec(qr); return 1; + } Newx(new_block, r1->num_code_blocks + r2->num_code_blocks, @@ -5311,6 +5298,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, I32 minlen = 0; U32 rx_flags; SV * VOL pat; + SV * VOL code_blocksv = NULL; /* these are all flags - maybe they should be turned * into a single int with different bit masks */ @@ -5389,6 +5377,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + + PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist); } #endif @@ -5449,7 +5439,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (pRExC_state->num_code_blocks) { o = cLISTOPx(expr)->op_first; - assert(o->op_type == OP_PUSHMARK); + assert( o->op_type == OP_PUSHMARK + || (o->op_type == OP_NULL && o->op_targ == OP_PUSHMARK) + || o->op_type == OP_PADRANGE); o = o->op_sibling; } @@ -5473,6 +5465,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SV *sv, *msv = *svp; SV *rx; bool code = 0; + /* we make the assumption here that each op in the list of + * op_siblings maps to one SV pushed onto the stack, + * except for code blocks, with have both an OP_NULL and + * and OP_CONST. + * This allows us to match up the list of SVs against the + * list of OPs to find the next code block. + * + * Note that PUSHMARK PADSV PADSV .. + * is optimised to + * PADRANGE NULL NULL .. + * so the alignment still works. */ if (o) { if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { assert(n < pRExC_state->num_code_blocks); @@ -5501,8 +5504,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, else { while (SvAMAGIC(msv) && (sv = AMG_CALLunary(msv, string_amg)) - && sv != msv) - { + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { msv = sv; SvGETMAGIC(msv); } @@ -5520,7 +5526,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && RX_ENGINE((REGEXP*)rx)->op_comp) { - RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); + RXi_GET_DECL(ReANY((REGEXP *)rx), ri); if (ri->num_code_blocks) { int i; /* the presence of an embedded qr// with code means @@ -5535,7 +5541,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (i=0; i < ri->num_code_blocks; i++) { struct reg_code_block *src, *dst; STRLEN offset = orig_patlen - + ((struct regexp *)SvANY(rx))->pre_prefix; + + ReANY((REGEXP *)rx)->pre_prefix; assert(n < pRExC_state->num_code_blocks); src = &ri->code_blocks[i]; dst = &pRExC_state->code_blocks[n]; @@ -5775,7 +5781,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_pm_flags = pm_flags; if (runtime_code) { - if (PL_tainting && PL_tainted) + if (TAINTING_get && TAINT_get) Perl_croak(aTHX_ "Eval-group in insecure regular expression"); if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { @@ -5793,6 +5799,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; RExC_override_recoding = 0; + RExC_in_multi_char_class = 0; /* First pass: determine size, legality. */ RExC_parse = exp; @@ -5824,11 +5831,23 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_lastnum=0; RExC_lastparse=NULL; ); + /* reg may croak on us, not giving us a chance to free + pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may + need it to survive as long as the regexp (qr/(?{})/). + We must check that code_blocksv is not already set, because we may + have longjmped back. */ + if (pRExC_state->code_blocks && !code_blocksv) { + code_blocksv = newSV_type(SVt_PV); + SAVEFREESV(code_blocksv); + SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); + SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ + } if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; - Safefree(pRExC_state->code_blocks); return(NULL); } + if (code_blocksv) + SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ /* Here, finished first pass. Get rid of any added setjmp */ if (used_setjump) { @@ -5864,7 +5883,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, of zeroing when in debug mode, thus anything assigned has to happen after that */ rx = (REGEXP*) newSV_type(SVt_REGEXP); - r = (struct regexp*)SvANY(rx); + r = ReANY(rx); Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char, regexp_internal); if ( r == NULL || ri == NULL ) @@ -5886,7 +5905,13 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, ri->num_code_blocks = pRExC_state->num_code_blocks; } else + { + int n; + for (n = 0; n < pRExC_state->num_code_blocks; n++) + if (pRExC_state->code_blocks[n].src_regex) + SAVEFREESV(pRExC_state->code_blocks[n].src_regex); SAVEFREEPV(pRExC_state->code_blocks); + } { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); @@ -5917,8 +5942,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ - SvPOK_on(rx); + Newx(p, wraplen + 1, char); /* +1 for the ending NUL */ + r->xpv_len_u.xpvlenu_pv = p; if (RExC_utf8) SvFLAGS(rx) |= SVf_UTF8; *p++='('; *p++='?'; @@ -5953,7 +5978,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, *p++ = '\n'; *p++ = ')'; *p = 0; - SvCUR_set(rx, p - SvPVX_const(rx)); + SvCUR_set(rx, p - RX_WRAPPED(rx)); } r->intflags = 0; @@ -6021,11 +6046,6 @@ reStudy: RExC_seen |= REG_TOP_LEVEL_BRANCHES; else RExC_seen &= ~REG_TOP_LEVEL_BRANCHES; - if (data.last_found) { - SvREFCNT_dec(data.longest_fixed); - SvREFCNT_dec(data.longest_float); - SvREFCNT_dec(data.last_found); - } StructCopy(&zero_scan_data, &data, scan_data_t); } #else @@ -6191,6 +6211,10 @@ reStudy: data.longest_float = newSVpvs(""); data.last_found = newSVpvs(""); data.longest = &(data.longest_fixed); + ENTER_with_name("study_chunk"); + SAVEFREESV(data.longest_fixed); + SAVEFREESV(data.longest_float); + SAVEFREESV(data.last_found); first = scan; if (!ri->regstclass) { cl_init(pRExC_state, &ch_class); @@ -6205,7 +6229,7 @@ reStudy: SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); - CHECK_RESTUDY_GOTO; + CHECK_RESTUDY_GOTO_butfirst(LEAVE_with_name("study_chunk")); if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) @@ -6215,7 +6239,6 @@ reStudy: && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS))) r->extflags |= RXf_CHECK_ALL; scan_commit(pRExC_state, &data,&minlen,0); - SvREFCNT_dec(data.last_found); longest_float_length = CHR_SVLEN(data.longest_float); @@ -6238,10 +6261,10 @@ reStudy: r->float_max_offset = data.offset_float_max; if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; + SvREFCNT_inc_simple_void_NN(data.longest_float); } else { r->float_substr = r->float_utf8 = NULL; - SvREFCNT_dec(data.longest_float); longest_float_length = 0; } @@ -6260,12 +6283,13 @@ reStudy: data.flags & SF_FIX_BEFORE_MEOL)) { r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; + SvREFCNT_inc_simple_void_NN(data.longest_fixed); } else { r->anchored_substr = r->anchored_utf8 = NULL; - SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } + LEAVE_with_name("study_chunk"); if (ri->regstclass && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) @@ -6340,7 +6364,7 @@ reStudy: minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size, &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0); - CHECK_RESTUDY_GOTO; + CHECK_RESTUDY_GOTO_butfirst(NOOP); r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; @@ -6385,7 +6409,10 @@ reStudy: if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; if (RExC_seen & REG_SEEN_VERBARG) + { r->intflags |= PREGf_VERBARG_SEEN; + r->extflags |= RXf_MODIFIES_VARS; + } if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; if (pm_flags & PMf_USE_RE_EVAL) @@ -6398,18 +6425,12 @@ reStudy: #ifdef STUPID_PATTERN_CHECKS if (RX_PRELEN(rx) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) r->extflags |= RXf_WHITE; else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else { + { regnode *first = ri->program + 1; U8 fop = OP(first); @@ -6472,7 +6493,7 @@ Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, if (flags & RXapif_FETCH) { return reg_named_buff_fetch(rx, key, flags); } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { - Perl_croak_no_modify(aTHX); + Perl_croak_no_modify(); return NULL; } else if (flags & RXapif_EXISTS) { return reg_named_buff_exists(rx, key, flags) @@ -6511,7 +6532,7 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, { AV *retarray = NULL; SV *ret; - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH; @@ -6551,7 +6572,7 @@ bool Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS; @@ -6575,7 +6596,7 @@ Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key, SV* Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY; @@ -6591,7 +6612,7 @@ Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags) SV* Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY; @@ -6627,7 +6648,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) SV *ret; AV *av; I32 length; - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR; @@ -6651,7 +6672,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags) SV* Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); AV *av = newAV(); PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL; @@ -6687,47 +6708,67 @@ void Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, SV * const sv) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); char *s = NULL; I32 i = 0; I32 s1, t1; + I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if (!rx->subbeg) { - sv_setsv(sv,&PL_sv_undef); - return; - } - else - if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { - /* $` */ + if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + && !(rx->extflags & RXf_PMf_KEEPCOPY) + ) + goto ret_undef; + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { - /* $' */ - s = rx->subbeg + rx->offs[0].end; - i = rx->sublen - rx->offs[0].end; + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; } else - if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->offs[paren].start) != -1 && - (t1 = rx->offs[paren].end) != -1) + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) { - /* $& $1 ... */ + /* $&, ${^MATCH}, $1 ... */ i = t1 - s1; - s = rx->subbeg + s1; + s = rx->subbeg + s1 - rx->suboffset; } else { - sv_setsv(sv,&PL_sv_undef); - return; + goto ret_undef; } + + assert(s >= rx->subbeg); assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { - const int oldtainted = PL_tainted; +#if NO_TAINT_SUPPORT + sv_setpvn(sv, s, i); +#else + const int oldtainted = TAINT_get; TAINT_NOT; sv_setpvn(sv, s, i); - PL_tainted = oldtainted; + TAINT_set(oldtainted); +#endif if ( (rx->extflags & RXf_CANY_SEEN) ? (RXp_MATCH_UTF8(rx) && (!i || is_utf8_string((U8*)s, i))) @@ -6737,12 +6778,12 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } else SvUTF8_off(sv); - if (PL_tainting) { + if (TAINTING_get) { if (RXp_MATCH_TAINTED(rx)) { if (SvTYPE(sv) >= SVt_PVMG) { MAGIC* const mg = SvMAGIC(sv); MAGIC* mgt; - PL_tainted = 1; + TAINT; SvMAGIC_set(sv, mg->mg_moremagic); SvTAINT(sv); if ((mgt = SvMAGIC(sv))) { @@ -6750,13 +6791,14 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, SvMAGIC_set(sv, mg); } } else { - PL_tainted = 1; + TAINT; SvTAINT(sv); } } else SvTAINTED_off(sv); } } else { + ret_undef: sv_setsv(sv,&PL_sv_undef); return; } @@ -6773,23 +6815,27 @@ Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, PERL_UNUSED_ARG(value); if (!PL_localizing) - Perl_croak_no_modify(aTHX); + Perl_croak_no_modify(); } I32 Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, const I32 paren) { - struct regexp *const rx = (struct regexp *)SvANY(r); + struct regexp *const rx = ReANY(r); I32 i; I32 s1, t1; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; /* Some of this code was originally in C in F */ - switch (paren) { - /* $` / ${^PREMATCH} */ - case RX_BUFF_IDX_PREMATCH: + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + + case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; if (i > 0) { @@ -6799,8 +6845,11 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; - /* $' / ${^POSTMATCH} */ - case RX_BUFF_IDX_POSTMATCH: + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; if (i > 0) { @@ -6810,6 +6859,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; + + case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + /* $& / ${^MATCH}, $1, $2, ... */ default: if (paren <= (I32)rx->nparens && @@ -6819,6 +6874,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, i = t1 - s1; goto getlen; } else { + warn_undef: if (ckWARN(WARN_UNINITIALIZED)) report_uninit((const SV *)sv); return 0; @@ -6826,7 +6882,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } getlen: if (i > 0 && RXp_MATCH_UTF8(rx)) { - const char * const s = rx->subbeg + s1; + const char * const s = rx->subbeg - rx->suboffset + s1; const U8 *ep; STRLEN el; @@ -6979,9 +7035,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * list.) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element at the beginning of each inversion list - * to contain 0 when the list contains 0, and contains 1 otherwise. The actual - * beginning of the list is either that element if 0, or the next one if 1. + * This implementation reserves an element at the beginning of each inversion + * list to contain 0 when the list contains 0, and contains 1 otherwise. The + * actual beginning of the list is either that element if 0, or the next one if + * 1. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -7331,8 +7388,8 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) * And benchmarks show that caching gives better results. We also test * here if the code point is within the bounds of the list. These tests * replace others that would have had to be made anyway to make sure that - * the array bounds were not exceeded, and give us extra information at the - * same time */ + * the array bounds were not exceeded, and these give us extra information + * at the same time */ if (cp >= array[mid]) { if (cp >= array[highest_element]) { return highest_element; @@ -8200,15 +8257,17 @@ Perl__invlist_contents(pTHX_ SV* const invlist) } #endif -#if 0 +#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP void -S_invlist_dump(pTHX_ SV* const invlist, const char * const header) +Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header) { /* Dumps out the ranges in an inversion list. The string 'header' * if present is output on a line before the first range */ UV start, end; + PERL_ARGS_ASSERT__INVLIST_DUMP; + if (header && strlen(header)) { PerlIO_printf(Perl_debug_log, "%s\n", header); } @@ -8217,8 +8276,12 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) if (end == UV_MAX) { PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); } + else if (end != start) { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", + start, end); + } else { - PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end); + PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start); } } } @@ -9421,6 +9484,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; #endif const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGPIECE; @@ -9467,6 +9534,23 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); RExC_parse = next; nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } do_curly: if ((flags&SIMPLE)) { @@ -9504,8 +9588,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max < min) - vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); @@ -9567,10 +9649,12 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } nest_check: if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ ckWARN3reg(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), origparse); + (void)ReREFCNT_inc(RExC_rx_sv); } if (RExC_parse < RExC_end && *RExC_parse == '?') { @@ -9773,6 +9857,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I if (in_char_class && has_multiple_chars) { ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); } + RExC_parse = endbrace + 1; } else if (! node_p || ! has_multiple_chars) { @@ -9897,13 +9982,13 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 * it (by setting <*flagp>, and potentially populating it with a single * character. * - * If is non-zero, this function assumes that the node has already - * been populated, and just does the sizing. In this case - * should be the final code point that has already been placed into the - * node. This value will be ignored except that under some circumstances - * <*flagp> is set based on it. + * If (the length in bytes) is non-zero, this function assumes that + * the node has already been populated, and just does the sizing. In this + * case should be the final code point that has already been + * placed into the node. This value will be ignored except that under some + * circumstances <*flagp> is set based on it. * - * If is zero, the function assumes that the node is to contain only * the single character given by and calculates what * should be. In pass 1, it sizes the node appropriately. In pass 2, it * additionally will populate the node's STRING with , if @@ -9954,8 +10039,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32 } *flagp |= HASWIDTH; - if (len == 1 && UNI_IS_INVARIANT(code_point)) + + /* A single character node is SIMPLE, except for the special-cased SHARP S + * under /di. */ + if ((len == 1 || (UTF && len == UNISKIP(code_point))) + && (code_point != LATIN_SMALL_LETTER_SHARP_S + || ! FOLD || ! DEPENDS_SEMANTICS)) + { *flagp |= SIMPLE; + } } /* @@ -10469,6 +10561,11 @@ tryagain: bool next_is_quantifier; char * oldp = NULL; + /* If a folding node contains only code points that don't + * participate in folds, it can be changed into an EXACT node, + * which allows the optimizer more things to look for */ + bool maybe_exact; + ender = 0; node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); @@ -10481,6 +10578,11 @@ tryagain: reparse: + /* We do the EXACTFish to EXACT node only if folding, and not if in + * locale, as whether a character folds or not isn't known until + * runtime */ + maybe_exact = FOLD && ! LOC; + /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than * 255 allows us to not have to worry about overflow due to @@ -10762,13 +10864,44 @@ tryagain: } } else { - ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen, - FOLD_FLAGS_FULL - | ((LOC) ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0) - ); + UV folded = _to_uni_fold_flags( + ender, + (U8 *) s, + &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* If this node only contains non-folding code + * points so far, see if this new one is also + * non-folding */ + if (maybe_exact) { + if (folded != ender) { + maybe_exact = FALSE; + } + else { + /* Here the fold is the original; we have + * to check further to see if anything + * folds to it */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", + "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = + _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, + ender)) + { + maybe_exact = FALSE; + } + } + } + ender = folded; } s += foldlen; @@ -10782,6 +10915,7 @@ tryagain: } else { *(s++) = ender; + maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender); } } else if (UTF) { @@ -10971,6 +11105,12 @@ tryagain: loopdone: /* Jumped to when encounters something that shouldn't be in the node */ + /* If 'maybe_exact' is still set here, means there are no + * code points in the node that participate in folds */ + if (FOLD && maybe_exact) { + OP(ret) = EXACT; + } + /* I (khw) don't know if you can get here with zero length, but the * old code handled this situation by creating a zero-length EXACT * node. Might as well be NOTHING instead */ @@ -11035,8 +11175,8 @@ S_regwhite( RExC_state_t *pRExC_state, char *p ) #define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.') #define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c)) -STATIC I32 -S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) +PERL_STATIC_INLINE I32 +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, SV *free_me) { dVAR; I32 namedclass = OOB_NAMEDCLASS; @@ -11070,7 +11210,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = ANYOF_ALNUM; + namedclass = ANYOF_WORDCHAR; break; case 5: /* Names all of length 5. */ @@ -11146,7 +11286,8 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) the class closes */ while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']') RExC_parse++; - Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + SvREFCNT_dec(free_me); + vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); } } else { /* Maternal grandfather: @@ -11159,36 +11300,6 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) return namedclass; } -STATIC void -S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) -{ - dVAR; - - PERL_ARGS_ASSERT_CHECKPOSIXCC; - - if (POSIXCC(UCHARAT(RExC_parse))) { - const char *s = RExC_parse; - const char c = *s++; - - while (isALNUM(*s)) - s++; - if (*s && c == *s && s[1] == ']') { - ckWARN3reg(s+2, - "POSIX syntax [%c %c] belongs inside character classes", - c, c); - - /* [[=foo=]] and [[.foo.]] are still future. */ - if (POSIXCC_NOTYET(c)) { - /* adjust RExC_parse so the error shows after - the class closes */ - while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']') - NOOP; - Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); - } - } - } -} - /* Generate the code to add a full posix character to the bracketed * character class given by . ( is needed only under locale rules) * destlist is the inversion list for non-locale rules that this class is @@ -11322,24 +11433,6 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } \ } -STATIC void -S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) -{ - /* Adds input 'string' with length 'len' to the ANYOF node's unicode - * alternate list, pointed to by 'alternate_ptr'. This is an array of - * the multi-character folds of characters in the node */ - SV *sv; - - PERL_ARGS_ASSERT_ADD_ALTERNATE; - - if (! *alternate_ptr) { - *alternate_ptr = newAV(); - } - sv = newSVpvn_utf8((char*)string, len, TRUE); - av_push(*alternate_ptr, sv); - return; -} - /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ @@ -11349,27 +11442,40 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) * number defined in handy.h. */ #define namedclass_to_classnum(class) ((class) / 2) -/* - parse a class specification and produce either an ANYOF node that - matches the pattern or perhaps will be optimized into an EXACTish node - instead. The node contains a bit map for the first 256 characters, with the - corresponding bit set if that character is in the list. For characters - above 255, a range list is used */ - STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { + /* parse a bracketed class specification. Most of these will produce an ANYOF node; + * but something like [a] will produce an EXACT node; [aA], an EXACTFish + * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with + * multi-character folds: it will be rewritten following the paradigm of + * this example, where the s are characters which fold to + * multiple character sequences: + * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i + * gets effectively rewritten as: + * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i + * reg() gets called (recursively) on the rewritten version, and this + * function will return what it constructs. (Actually the s + * aren't physically removed from the [abcdefghi], it's just that they are + * ignored in the recursion by means of a flag: + * .) + * + * ANYOF nodes contain a bit map for the first 256 characters, with the + * corresponding bit set if that character is in the list. For characters + * above 255, a range list or swash is used. There are extra bits for \w, + * etc. in locale ANYOFs, as what these match is not determinable at + * compile time */ + dVAR; UV nextvalue; - UV prevvalue = OOB_UNICODE; + UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; IV range = 0; - UV value = 0; + UV value = OOB_UNICODE, save_value = OOB_UNICODE; regnode *ret; STRLEN numlen; IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; - bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ SV *listsv = NULL; STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ @@ -11378,6 +11484,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) extended beyond the Latin1 range */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ + AV * multi_char_matches = NULL; /* Code points that fold to more than one + character; used under /i */ UV n; /* Unicode properties are stored in a swash; this holds the current one @@ -11401,8 +11509,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * of the target string */ SV* cp_list = NULL; - /* List of multi-character folds that are matched by this node */ - AV* unicode_alternate = NULL; #ifdef EBCDIC /* In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -11430,25 +11536,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, ANYOF, 0); - if (!SIZE_ONLY) { ANYOF_FLAGS(ret) = 0; } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ - RExC_naughty++; RExC_parse++; invert = TRUE; - - /* We have decided to not allow multi-char folds in inverted character - * classes, due to the confusion that can happen, especially with - * classes that are designed for a non-Unicode world: You have the - * peculiar case that: - "s s" =~ /^[^\xDF]+$/i => Y - "ss" =~ /^[^\xDF]+$/i => N - * - * See [perl #89750] */ - allow_full_fold = FALSE; + RExC_naughty++; } if (SIZE_ONLY) { @@ -11467,7 +11562,22 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; if (!SIZE_ONLY && POSIXCC(nextvalue)) - checkposixcc(pRExC_state); + { + const char *s = RExC_parse; + const char c = *s++; + + while (isALNUM(*s)) + s++; + if (*s && c == *s && s[1] == ']') { + SAVEFREESV(RExC_rx_sv); + SAVEFREESV(listsv); + ckWARN3reg(s+2, + "POSIX syntax [%c %c] belongs inside character classes", + c, c); + (void)ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); + } + } /* allow 1st char to be ] (allowing it to be - is dealt with later) */ if (UCHARAT(RExC_parse) == ']') @@ -11479,6 +11589,8 @@ parseit: charclassloop: namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ + save_value = value; + save_prevvalue = prevvalue; if (!range) { rangebegin = RExC_parse; @@ -11495,7 +11607,7 @@ parseit: nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; if (value == '[' && POSIXCC(nextvalue)) - namedclass = regpposixcc(pRExC_state, value); + namedclass = regpposixcc(pRExC_state, value, listsv); else if (value == '\\') { if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, @@ -11511,8 +11623,8 @@ parseit: * A similar issue a little bit later when switching on * namedclass. --jhi */ switch ((I32)value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; @@ -11651,7 +11763,7 @@ parseit: Safefree(name); } RExC_parse = e + 1; - namedclass = ANYOF_MAX; /* no official name, but it's named */ + namedclass = ANYOF_UNIPROP; /* no official name, but it's named */ /* \p means they want Unicode semantics */ RExC_uni_semantics = 1; @@ -11726,9 +11838,13 @@ parseit: default: /* Allow \_ to not give an error */ if (!SIZE_ONLY && isALNUM(value) && value != '_') { + SAVEFREESV(RExC_rx_sv); + SAVEFREESV(listsv); ckWARN2reg(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + (void)ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); } break; } @@ -11773,9 +11889,13 @@ parseit: const int w = RExC_parse >= rangebegin ? RExC_parse - rangebegin : 0; + SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ + SAVEFREESV(listsv); ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); + (void)ReREFCNT_inc(RExC_rx_sv); + SvREFCNT_inc_simple_void_NN(listsv); cp_list = add_cp_to_invlist(cp_list, '-'); cp_list = add_cp_to_invlist(cp_list, prevvalue); } @@ -11806,32 +11926,86 @@ parseit: runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } - else { + else +#endif /* Not isascii(); just use the hard-coded definition for it */ _invlist_union(posixes, PL_ASCII, &posixes); - } break; case ANYOF_NASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { +#endif _invlist_union_complement_2nd(posixes, PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } +#ifdef HAS_ISASCII } +#endif break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, posixes, + if (hasISBLANK || ! LOC) { + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale: We + use the ASCII range and the above-Latin1 range + code points */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 blanks */ + _invlist_intersection(PL_AboveLatin1, + PL_XPosixBlank, + &scratch_list); + /* Add it to the running total of posix classes */ + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + /* Add the ASCII-range blanks to the running total. */ + _invlist_union(posixes, PL_PosixBlank, &posixes); + } break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, posixes, - PL_PosixBlank, PL_XPosixBlank); + if (hasISBLANK || ! LOC) { + DO_N_POSIX(ret, namedclass, posixes, + PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 non-blanks */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, + &scratch_list); + + /* Add them to the running total of posix classes */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, + &scratch_list); + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + + /* Get the list of all non-ASCII-blanks in Latin 1, and + * add them to the running total */ + _invlist_subtract(PL_Latin1, PL_PosixBlank, + &scratch_list); + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } break; case ANYOF_CNTRL: DO_POSIX(ret, namedclass, posixes, @@ -11967,11 +12141,11 @@ parseit: } break; } - case ANYOF_ALNUM: /* Really is 'Word' */ + case ANYOF_WORDCHAR: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); break; - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, runtime_posix_matches_above_Unicode); @@ -11995,8 +12169,7 @@ parseit: DO_N_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; - case ANYOF_MAX: - /* this is to handle \p and \P */ + case ANYOF_UNIPROP: /* this is to handle \p and \P */ break; default: vFAIL("Invalid [::] class"); @@ -12051,8 +12224,100 @@ parseit: RExC_uni_semantics = 1; } - /* Ready to process either the single value, or the completed range */ - if (!SIZE_ONLY) { + /* Ready to process either the single value, or the completed range. + * For single-valued non-inverted ranges, we consider the possibility + * of multi-char folds. (We made a conscious decision to not do this + * for the other cases because it can often lead to non-intuitive + * results. For example, you have the peculiar case that: + * "s s" =~ /^[^\xDF]+$/i => Y + * "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + if (FOLD && ! invert && value == prevvalue) { + if (value == LATIN_SMALL_LETTER_SHARP_S + || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, + value))) + { + /* Here is indeed a multi-char fold. Get what it is */ + + U8 foldbuf[UTF8_MAXBYTES_CASE]; + STRLEN foldlen; + + UV folded = _to_uni_fold_flags( + value, + foldbuf, + &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + + /* Here, should be the first character of the + * multi-char fold of , with containing the + * whole thing. But, if this fold is not allowed (because of + * the flags), will be the same as , and should + * be processed like any other character, so skip the special + * handling */ + if (folded != value) { + + /* Skip if we are recursed, currently parsing the class + * again. Otherwise add this character to the list of + * multi-char folds. */ + if (! RExC_in_multi_char_class) { + AV** this_array_ptr; + AV* this_array; + STRLEN cp_count = utf8_length(foldbuf, + foldbuf + foldlen); + SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + + + if (! multi_char_matches) { + multi_char_matches = newAV(); + } + + /* is actually an array of arrays. + * There will be one or two top-level elements: [2], + * and/or [3]. The [2] element is an array, each + * element thereof is a character which folds to two + * characters; likewise for [3]. (Unicode guarantees a + * maximum of 3 characters in any fold.) When we + * rewrite the character class below, we will do so + * such that the longest folds are written first, so + * that it prefers the longest matching strings first. + * This is done even if it turns out that any + * quantifier is non-greedy, out of programmer + * laziness. Tom Christiansen has agreed that this is + * ok. This makes the test for the ligature 'ffi' come + * before the test for 'ff' */ + if (av_exists(multi_char_matches, cp_count)) { + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + this_array = *this_array_ptr; + } + else { + this_array = newAV(); + av_store(multi_char_matches, cp_count, + (SV*) this_array); + } + av_push(this_array, multi_fold); + } + + /* This element should not be processed further in this + * class */ + element_count--; + value = save_value; + prevvalue = save_prevvalue; + continue; + } + } + } + + /* Deal with this element of the class */ + if (! SIZE_ONLY) { #ifndef EBCDIC cp_list = _add_range_to_invlist(cp_list, prevvalue, value); #else @@ -12082,6 +12347,84 @@ parseit: range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ + /* If anything in the class expands to more than one character, we have to + * deal with them by building up a substitute parse string, and recursively + * calling reg() on it, instead of proceeding */ + if (multi_char_matches) { + SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP); + I32 cp_count; + STRLEN len; + char *save_end = RExC_end; + char *save_parse = RExC_parse; + bool first_time = TRUE; /* First multi-char occurrence doesn't get + a "|" */ + I32 reg_flags; + + assert(! invert); +#if 0 /* Have decided not to deal with multi-char folds in inverted classes, + because too confusing */ + if (invert) { + sv_catpv(substitute_parse, "(?:"); + } +#endif + + /* Look at the longest folds first */ + for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) { + + if (av_exists(multi_char_matches, cp_count)) { + AV** this_array_ptr; + SV* this_sequence; + + this_array_ptr = (AV**) av_fetch(multi_char_matches, + cp_count, FALSE); + while ((this_sequence = av_pop(*this_array_ptr)) != + &PL_sv_undef) + { + if (! first_time) { + sv_catpv(substitute_parse, "|"); + } + first_time = FALSE; + + sv_catpv(substitute_parse, SvPVX(this_sequence)); + } + } + } + + /* If the character class contains anything else besides these + * multi-character folds, have to include it in recursive parsing */ + if (element_count) { + sv_catpv(substitute_parse, "|["); + sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse); + sv_catpv(substitute_parse, "]"); + } + + sv_catpv(substitute_parse, ")"); +#if 0 + if (invert) { + /* This is a way to get the parse to skip forward a whole named + * sequence instead of matching the 2nd character when it fails the + * first */ + sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)"); + } +#endif + + RExC_parse = SvPV(substitute_parse, len); + RExC_end = RExC_parse + len; + RExC_in_multi_char_class = 1; + RExC_emit = (regnode *)orig_emit; + + ret = reg(pRExC_state, 1, ®_flags, depth+1); + + *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED); + + RExC_parse = save_parse; + RExC_end = save_end; + RExC_in_multi_char_class = 0; + SvREFCNT_dec(multi_char_matches); + SvREFCNT_dec(listsv); + return ret; + } + /* If the character class contains only a single element, it may be * optimizable into another node type which is smaller and runs faster. * Check if this is the case for this class */ @@ -12102,10 +12445,10 @@ parseit: * modifier to the regex. We first calculate the base node * type, and if it should be inverted */ - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: invert = ! invert; /* FALLTHROUGH */ - case ANYOF_ALNUM: + case ANYOF_WORDCHAR: op = ALNUM; goto join_charset_classes; @@ -12169,7 +12512,7 @@ parseit: *flagp |= HASWIDTH|SIMPLE; break; - case ANYOF_MAX: + case ANYOF_UNIPROP: break; case ANYOF_NBLANK: @@ -12251,7 +12594,7 @@ parseit: ret = reg_node(pRExC_state, op); - if (PL_regkind[op] == POSIXD) { + if (PL_regkind[op] == POSIXD || PL_regkind[op] == NPOSIXD) { if (! SIZE_ONLY) { FLAGS(ret) = arg; } @@ -12263,7 +12606,9 @@ parseit: RExC_parse = (char *) cur_parse; + SvREFCNT_dec(posixes); SvREFCNT_dec(listsv); + SvREFCNT_dec(cp_list); return ret; } } @@ -12279,10 +12624,11 @@ parseit: SV* fold_intersection = NULL; - /* In the Latin1 range, the characters that can be folded-to or -from - * are precisely the alphabetic characters. If the highest code point - * is within Latin1, we can use the compiled-in list, and not have to - * go out to disk. */ + /* If the highest code point is within Latin1, we can use the + * compiled-in Alphas list, and not have to go out to disk. This + * yields two false positives, the masculine and feminine ordinal + * indicators, which are weeded out below using the + * IS_IN_SOME_FOLD_L1() macro */ if (invlist_highest(cp_list) < 256) { _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection); } @@ -12316,14 +12662,13 @@ parseit: * to force that */ if (! PL_utf8_tofold) { U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len); + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); + _swash_inversion_hash(PL_utf8_tofold); } } @@ -12350,7 +12695,7 @@ parseit: U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - UV f; + SV** listp; if (j < 256) { @@ -12362,7 +12707,7 @@ parseit: * mappings, though that is not really likely, and may be * caught by the default: case of the switch below. */ - if (PL_fold_latin1[j] != j) { + if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched only * under Unicode rules */ @@ -12380,30 +12725,13 @@ parseit: && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { /* Certain Latin1 characters have matches outside - * Latin1, or are multi-character. To get here, 'j' is - * one of those characters. None of these matches is - * valid for ASCII characters under /aa, which is why - * the 'if' just above excludes those. The matches - * fall into three categories: - * 1) They are singly folded-to or -from an above 255 - * character, e.g., LATIN SMALL LETTER Y WITH - * DIAERESIS and LATIN CAPITAL LETTER Y WITH - * DIAERESIS; - * 2) They are part of a multi-char fold with another - * latin1 character; only LATIN SMALL LETTER - * SHARP S => "ss" fits this; - * 3) They are part of a multi-char fold with a - * character outside of Latin1, such as various - * ligatures. - * We aren't dealing fully with multi-char folds, except - * we do deal with the pattern containing a character - * that has a multi-char fold (not so much the inverse). - * For types 1) and 3), the matches only happen when the - * target string is utf8; that's not true for 2), and we - * set a flag for it. - * - * The code below adds the single fold closures for 'j' - * to the inversion list. */ + * Latin1. To get here, is one of those + * characters. None of these matches is valid for + * ASCII characters under /aa, which is why the 'if' + * just above excludes those. These matches only + * happen when the target string is utf8. The code + * below adds the single fold closures for to the + * inversion list. */ switch (j) { case 'k': case 'K': @@ -12433,20 +12761,6 @@ parseit: case LATIN_SMALL_LETTER_SHARP_S: cp_list = add_cp_to_invlist(cp_list, LATIN_CAPITAL_LETTER_SHARP_S); - - /* Under /a, /d, and /u, this can match the two - * chars "ss" */ - if (! ASCII_FOLD_RESTRICTED) { - add_alternate(&unicode_alternate, - (U8 *) "ss", 2); - - /* And under /u or /a, it can match even if - * the target is not utf8 */ - if (AT_LEAST_UNI_SEMANTICS) { - ANYOF_FLAGS(ret) |= - ANYOF_NONBITMAP_NON_UTF8; - } - } break; case 'F': case 'f': case 'I': case 'i': @@ -12463,7 +12777,8 @@ parseit: * express, so they can't match unless the * target string is in UTF-8, so no action here * is necessary, as regexec.c properly handles - * the general case for UTF-8 matching */ + * the general case for UTF-8 matching and + * multi-char folds */ break; default: /* Use deprecated warning to increase the @@ -12476,89 +12791,56 @@ parseit: } /* Here is an above Latin1 character. We don't have the rules - * hard-coded for it. First, get its fold */ - f = _to_uni_fold_flags(j, foldbuf, &foldlen, - ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) - | ((LOC) - ? FOLD_FLAGS_LOCALE - : (ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - - if (foldlen > (STRLEN)UNISKIP(f)) { - - /* Any multicharacter foldings (disallowed in lookbehind - * patterns) require the following transform: [ABCDEF] -> - * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F - * folds into "rst", all other characters fold to single - * characters. We save away these multicharacter foldings, - * to be later saved as part of the additional "s" data. */ - if (! RExC_in_lookbehind) { - U8* loc = foldbuf; - U8* e = foldbuf + foldlen; - - /* If any of the folded characters of this are in the - * Latin1 range, tell the regex engine that this can - * match a non-utf8 target string. */ - while (loc < e) { - if (UTF8_IS_INVARIANT(*loc) - || UTF8_IS_DOWNGRADEABLE_START(*loc)) - { - ANYOF_FLAGS(ret) - |= ANYOF_NONBITMAP_NON_UTF8; - break; - } - loc += UTF8SKIP(loc); + * hard-coded for it. First, get its fold. This is the simple + * fold, as the multi-character folds have been handled earlier + * and separated out */ + _to_uni_fold_flags(j, foldbuf, &foldlen, + ((LOC) + ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); + + /* Single character fold of above Latin1. Add everything in + * its fold closure to the list that this node should match. + * The fold closures data structure is a hash with the keys + * being the UTF-8 of every character that is folded to, like + * 'k', and the values each an array of all code points that + * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ]. + * Multi-character folds are not included */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); } + c = SvUV(*c_p); - add_alternate(&unicode_alternate, foldbuf, foldlen); - } - } - else { - /* Single character fold of above Latin1. Add everything - * in its fold closure to the list that this node should - * match */ - SV** listp; - - /* The fold closures data structure is a hash with the keys - * being every character that is folded to, like 'k', and - * the values each an array of everything that folds to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - - /* /aa doesn't allow folds between ASCII and non-; - * /l doesn't allow them between above and below - * 256 */ - if ((ASCII_FOLD_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && ((c < 256) != (j < 256)))) - { - continue; - } + /* /aa doesn't allow folds between ASCII and non-; /l + * doesn't allow them between above and below 256 */ + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) + { + continue; + } - /* Folds involving non-ascii Latin1 characters - * under /d are added to a separate list */ - if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) - { - cp_list = add_cp_to_invlist(cp_list, c); - } - else { - depends_list = add_cp_to_invlist(depends_list, c); - } - } - } - } + /* Folds involving non-ascii Latin1 characters + * under /d are added to a separate list */ + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + depends_list = add_cp_to_invlist(depends_list, c); + } + } + } } } SvREFCNT_dec(fold_intersection); @@ -12662,12 +12944,12 @@ parseit: * folded until runtime */ /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known - * at compile time. Besides not inverting folded locale now, we can't invert - * if there are things such as \w, which aren't known until runtime */ + * at compile time. Besides not inverting folded locale now, we can't + * invert if there are things such as \w, which aren't known until runtime + * */ if (invert && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) && ! depends_list - && ! unicode_alternate && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { _invlist_invert(cp_list); @@ -12686,9 +12968,9 @@ parseit: * until runtime; set the run-time fold flag for these. (We don't have to * worry about properties folding, as that is taken care of by the swash * fetching) */ - if (FOLD && (LOC || unicode_alternate)) + if (FOLD && LOC) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } /* Some character classes are equivalent to other nodes. Such nodes take @@ -12707,7 +12989,6 @@ parseit: * node types they could possibly match using _invlistEQ(). */ if (cp_list - && ! unicode_alternate && ! invert && ! depends_list && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) @@ -12758,13 +13039,9 @@ parseit: * EXACTFish node that any such are likely to be. We can * do this iff the code point doesn't participate in any * folds. For example, an EXACTF of a colon is the same as - * an EXACT one, since nothing folds to or from a colon. - * In the Latin1 range, being an alpha means that the - * character participates in a fold (except for the - * feminine and masculine ordinals, which I (khw) don't - * think are worrying about optimizing for). */ + * an EXACT one, since nothing folds to or from a colon. */ if (value < 256) { - if (isALPHA_L1(value)) { + if (IS_IN_SOME_FOLD_L1(value)) { op = EXACT; } } @@ -12816,6 +13093,7 @@ parseit: alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); } + SvREFCNT_dec(cp_list); SvREFCNT_dec(listsv); return ret; } @@ -12900,12 +13178,10 @@ parseit: } if (! cp_list - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - && ! unicode_alternate) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); SvREFCNT_dec(listsv); - SvREFCNT_dec(unicode_alternate); } else { /* av[0] stores the character class description in its textual form: @@ -12914,18 +13190,16 @@ parseit: * av[1] if NULL, is a placeholder to later contain the swash computed * from av[0]. But if no further computation need be done, the * swash is stored there now. - * av[2] stores the multicharacter foldings, used later in - * regexec.c:S_reginclass(). - * av[3] stores the cp_list inversion list for use in addition or + * av[2] stores the cp_list inversion list for use in addition or * instead of av[0]; used only if av[1] is NULL - * av[4] is set if any component of the class is from a user-defined + * av[3] is set if any component of the class is from a user-defined * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv - : &PL_sv_undef); + : (SvREFCNT_dec(listsv), &PL_sv_undef)); if (swash) { av_store(av, 1, swash); SvREFCNT_dec(cp_list); @@ -12933,22 +13207,11 @@ parseit: else { av_store(av, 1, NULL); if (cp_list) { - av_store(av, 3, cp_list); - av_store(av, 4, newSVuv(has_user_defined_property)); + av_store(av, 2, cp_list); + av_store(av, 3, newSVuv(has_user_defined_property)); } } - /* Store any computed multi-char folds only if we are allowing - * them */ - if (allow_full_fold) { - av_store(av, 2, MUTABLE_SV(unicode_alternate)); - if (unicode_alternate) { /* This node is variable length */ - OP(ret) = ANYOFV; - } - } - else { - av_store(av, 2, NULL); - } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; @@ -13555,36 +13818,45 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", +#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ + || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 || _CC_ALNUMC != 7 \ + || _CC_GRAPH != 8 || _CC_SPACE != 9 || _CC_BLANK != 10 \ + || _CC_XDIGIT != 11 || _CC_PSXSPC != 12 || _CC_CNTRL != 13 \ + || _CC_ASCII != 14 || _CC_VERTSPACE != 15 + #error Need to adjust order of anyofs[] +#endif + "[\\w]", + "[\\W]", + "[\\d]", + "[\\D]", "[:alpha:]", "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", "[:lower:]", "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", "[:upper:]", "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[\\s]", + "[\\S]", + "[:blank:]", + "[:^blank:]", "[:xdigit:]", "[:^xdigit:]", "[:space:]", "[:^space:]", - "[:blank:]", - "[:^blank:]" + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "[\\v]", + "[\\V]" }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -13709,7 +13981,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); - if (flags & ANYOF_LOC_NONBITMAP_FOLD) + if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) @@ -13759,7 +14031,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (ANYOF_NONBITMAP(o)) { SV *lv; /* Set if there is something outside the bit map */ - SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); + SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL); bool byte_output = FALSE; /* If something in the bitmap has been output */ @@ -13843,7 +14115,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } - else if (k == POSIXD) { + else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); @@ -13866,7 +14138,7 @@ SV * Perl_re_intuit_string(pTHX_ REGEXP * const r) { /* Assume that RE_INTUIT is set */ dVAR; - struct regexp *const prog = (struct regexp *)SvANY(r); + struct regexp *const prog = ReANY(r); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_STRING; @@ -13914,7 +14186,7 @@ void Perl_pregfree2(pTHX_ REGEXP *rx) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_PREGFREE2; @@ -13924,6 +14196,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) } else { CALLREGFREE_PVT(rx); /* free the private data */ SvREFCNT_dec(RXp_PAREN_NAMES(r)); + Safefree(r->xpv_len_u.xpvlenu_pv); } if (r->substrs) { SvREFCNT_dec(r->anchored_substr); @@ -13933,11 +14206,12 @@ Perl_pregfree2(pTHX_ REGEXP *rx) Safefree(r->substrs); } RX_MATCH_COPY_FREE(rx); -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW SvREFCNT_dec(r->saved_copy); #endif Safefree(r->offs); SvREFCNT_dec(r->qr_anoncv); + rx->sv_u.svu_rx = 0; } /* reg_temp_copy() @@ -13961,26 +14235,42 @@ REGEXP * Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { struct regexp *ret; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); + const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV; PERL_ARGS_ASSERT_REG_TEMP_COPY; if (!ret_x) ret_x = (REGEXP*) newSV_type(SVt_REGEXP); - ret = (struct regexp *)SvANY(ret_x); + else { + SvOK_off((SV *)ret_x); + if (islv) { + /* For PVLVs, SvANY points to the xpvlv body while sv_u points + to the regexp. (For SVt_REGEXPs, sv_upgrade has already + made both spots point to the same regexp body.) */ + REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP); + assert(!SvPVX(ret_x)); + ret_x->sv_u.svu_rx = temp->sv_any; + temp->sv_any = NULL; + SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL; + SvREFCNT_dec(temp); + /* SvCUR still resides in the xpvlv struct, so the regexp copy- + ing below will not set it. */ + SvCUR_set(ret_x, SvCUR(rx)); + } + } + /* This ensures that SvTHINKFIRST(sv) is true, and hence that + sv_force_normal(sv) is called. */ + SvFAKE_on(ret_x); + ret = ReANY(ret_x); - (void)ReREFCNT_inc(rx); - /* We can take advantage of the existing "copied buffer" mechanism in SVs - by pointing directly at the buffer, but flagging that the allocated - space in the copy is zero. As we've just done a struct copy, it's now - a case of zero-ing that, rather than copying the current length. */ - SvPV_set(ret_x, RX_WRAPPED(rx)); - SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8); + SvFLAGS(ret_x) |= SvUTF8(rx); + /* We share the same string buffer as the original regexp, on which we + hold a reference count, incremented when mother_re is set below. + The string pointer is copied here, being part of the regexp struct. + */ memcpy(&(ret->xpv_cur), &(r->xpv_cur), sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur)); - SvLEN_set(ret_x, 0); - SvSTASH_set(ret_x, NULL); - SvMAGIC_set(ret_x, NULL); if (r->offs) { const I32 npar = r->nparens+1; Newx(ret->offs, npar, regexp_paren_pair); @@ -13999,10 +14289,10 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) anchored or float namesakes, and don't hold a second reference. */ } RX_MATCH_COPIED_off(ret_x); -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW ret->saved_copy = NULL; #endif - ret->mother_re = rx; + ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); return ret_x; @@ -14025,7 +14315,7 @@ void Perl_regfree_internal(pTHX_ REGEXP * const rx) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); RXi_GET_DECL(r,ri); GET_RE_DEBUG_FLAGS_DECL; @@ -14146,8 +14436,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) { dVAR; I32 npar; - const struct regexp *r = (const struct regexp *)SvANY(sstr); - struct regexp *ret = (struct regexp *)SvANY(dstr); + const struct regexp *r = ReANY(sstr); + struct regexp *ret = ReANY(dstr); PERL_ARGS_ASSERT_RE_DUP_GUTS; @@ -14207,25 +14497,18 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW ret->saved_copy = NULL; #endif - if (ret->mother_re) { - if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) { - /* Our storage points directly to our mother regexp, but that's + /* Whether mother_re be set or no, we need to copy the string. We + cannot refrain from copying it when the storage points directly to + our mother regexp, because that's 1: a buffer in a different thread 2: something we no longer hold a reference on so we need to copy it locally. */ - /* Note we need to use SvCUR(), rather than - SvLEN(), on our mother_re, because it, in - turn, may well be pointing to its own mother_re. */ - SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), - SvCUR(ret->mother_re)+1)); - SvLEN_set(dstr, SvCUR(ret->mother_re)+1); - } - ret->mother_re = NULL; - } + RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1); + ret->mother_re = NULL; ret->gofs = 0; } #endif /* PERL_IN_XSUB_RE */ @@ -14248,7 +14531,7 @@ void * Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) { dVAR; - struct regexp *const r = (struct regexp *)SvANY(rx); + struct regexp *const r = ReANY(rx); regexp_internal *reti; int len; RXi_GET_DECL(r,ri); @@ -14352,7 +14635,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) - regnext - dig the "next" pointer out of a node */ regnode * -Perl_regnext(pTHX_ register regnode *p) +Perl_regnext(pTHX_ regnode *p) { dVAR; I32 offset; @@ -14429,11 +14712,13 @@ Perl_save_re_context(pTHX) PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; + PL_reg_oldsavedoffset = 0; + PL_reg_oldsavedcoffset = 0; PL_reg_maxiter = 0; PL_reg_leftiter = 0; PL_reg_poscache = NULL; PL_reg_poscache_size = 0; -#ifdef PERL_OLD_COPY_ON_WRITE +#ifdef PERL_ANY_COW PL_nrs = NULL; #endif @@ -14459,13 +14744,6 @@ Perl_save_re_context(pTHX) } #endif -static void -clear_re(pTHX_ void *r) -{ - dVAR; - ReREFCNT_dec((REGEXP *)r); -} - #ifdef DEBUGGING STATIC void