X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/18890cbd431f4bda6e2dfe4162b73042ac79d6ab..486ffce2e3218c92f8211b19f0be2fc5532f8b57:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 3ca7a41..b5ed584 100644 --- a/regcomp.c +++ b/regcomp.c @@ -81,16 +81,16 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" +extern const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif #include "dquote_static.c" -#ifndef PERL_IN_XSUB_RE -# include "charclass_invlists.h" -#endif +#include "charclass_invlists.h" #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #ifdef op #undef op @@ -398,8 +398,8 @@ static const scan_data_t zero_scan_data = #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) @@ -2582,12 +2582,12 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * one, and looks for problematic sequences of characters whose folds vs. * non-folds have sufficiently different lengths, that the optimizer would be * fooled into rejecting legitimate matches of them, and the trie construction - * code can't cope with them. The joining is only done if: + * code needs to handle specially. The joining is only done if: * 1) there is room in the current conglomerated node to entirely contain the * next one. * 2) they are the exact same node type * - * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * * If there are problematic code sequences, *min_subtract is set to the delta @@ -2600,26 +2600,27 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * * This is as good a place as any to discuss the design of handling these * problematic sequences. It's been wrong in Perl for a very long time. There - * are three code points in Unicode whose folded lengths differ so much from - * the un-folded lengths that it causes problems for the optimizer and trie - * construction. Why only these are problematic, and not others where lengths - * also differ is something I (khw) do not understand. New versions of Unicode - * might add more such code points. Hopefully the logic in fold_grind.t that - * figures out what to test (in part by verifying that each size-combination - * gets tested) will catch any that do come along, so they can be added to the - * special handling below. The chances of new ones are actually rather small, - * as most, if not all, of the world's scripts that have casefolding have - * already been encoded by Unicode. Also, a number of Unicode's decisions were - * made to allow compatibility with pre-existing standards, and almost all of - * those have already been dealt with. These would otherwise be the most - * likely candidates for generating further tricky sequences. In other words, - * Unicode by itself is unlikely to add new ones unless it is for compatibility - * with pre-existing standards, and there aren't many of those left. + * are three code points currently in Unicode whose folded lengths differ so + * much from the un-folded lengths that it causes problems for the optimizer + * and trie construction. Why only these are problematic, and not others where + * lengths also differ is something I (khw) do not understand. New versions of + * Unicode might add more such code points. Hopefully the logic in + * fold_grind.t that figures out what to test (in part by verifying that each + * size-combination gets tested) will catch any that do come along, so they can + * be added to the special handling below. The chances of new ones are + * actually rather small, as most, if not all, of the world's scripts that have + * casefolding have already been encoded by Unicode. Also, a number of + * Unicode's decisions were made to allow compatibility with pre-existing + * standards, and almost all of those have already been dealt with. These + * would otherwise be the most likely candidates for generating further tricky + * sequences. In other words, Unicode by itself is unlikely to add new ones + * unless it is for compatibility with pre-existing standards, and there aren't + * many of those left. * * The previous designs for dealing with these involved assigning a special * node for them. This approach doesn't work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node of + * Both these fold to "sss", but if the pattern is parsed to create a node * that would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node @@ -2634,9 +2635,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * problematic sequences. This delta is used by the caller to adjust the * min length of the match, and the delta between min and max, so that the * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences require special handling by the trie code, so it - * changes the joined node type to ops for the trie's benefit, those new - * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 2) These sequences require special handling by the trie code, so this code + * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and + * EXACTFU_SS. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -2646,22 +2647,21 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * itself with length changes, and so can be processed faster. regexec.c * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 * is pre-folded by regcomp.c. This saves effort in regex matching. - * However, probably mostly for historical reasons, the pre-folding isn't - * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL - * nodes, as what they fold to isn't known until runtime.) The fold - * possibilities for the non-UTF8 patterns are quite simple, except for - * the sharp s. All the ones that don't involve a UTF-8 target string - * are members of a fold-pair, and arrays are set up for all of them - * that quickly find the other member of the pair. It might actually - * be faster to pre-fold these, but it isn't currently done, except for - * the sharp s. Code elsewhere in this file makes sure that it gets - * folded to 'ss', even if the pattern isn't UTF-8. This avoids the - * issues described in the next item. + * However, the pre-folding isn't done for non-UTF8 patterns because the + * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL + * nodes fold to isn't known until runtime. The fold possibilities for + * the non-UTF8 patterns are quite simple, except for the sharp s. All + * the ones that don't involve a UTF-8 target string are members of a + * fold-pair, and arrays are set up for all of them so that the other + * member of the pair can be found quickly. Code elsewhere in this file + * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even + * if the pattern isn't UTF-8. This avoids the issues described in the + * next item. * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches * 'ss' or not is not knowable at compile time. It will match iff the * target string is in UTF-8, unlike the EXACTFU nodes, where it always * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus - * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * it can't be folded to "ss" at compile time, unlike EXACTFU does (as * described in item 3). An assumption that the optimizer part of * regexec.c (probably unwittingly) makes is that a character in the * pattern corresponds to at most a single character in the target string. @@ -2733,6 +2733,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); + /* XXX I (khw) kind of doubt that this works on platforms where + * U8_MAX is above 255 because of lots of other assumptions */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2883,7 +2885,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b greek_sequence: *min_subtract += 4; - /* This can't currently be handled by trie's, so change + /* This requires special handling by trie's, so change * the node type to indicate this. If EXACTFA and * EXACTFL were ever to be handled by trie's, this * would have to be changed. If this node has already @@ -2919,9 +2921,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* EXACTF nodes need to know that the minimum * length changed so that a sharp s in the string * can match this ss in the pattern, but they - * remain EXACTF nodes, as they are not trie'able, - * so don't have to invent a new node type to - * exclude them from the trie code */ + * remain EXACTF nodes, as they won't match this + * unless the target string is is UTF-8, which we + * don't know until runtime */ if (OP(scan) != EXACTF) { OP(scan) = EXACTFU_SS; } @@ -3661,7 +3663,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - else if (has_exactf_sharp_s) { + if (has_exactf_sharp_s) { RExC_seen |= REG_SEEN_EXACTF_SHARP_S; } min += l - min_subtract; @@ -3958,6 +3960,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ + && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4961,16 +4964,23 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) } #endif -/* public(ish) wrapper for Perl_re_op_compile that only takes an SV - * pattern rather than a list of OPs */ +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ REGEXP * Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) { SV *pat = pattern; /* defeat constness! */ PERL_ARGS_ASSERT_RE_COMPILE; - return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), - NULL, NULL, rx_flags, 0); + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); } /* see if there are any run-time code blocks in the pattern. @@ -5197,6 +5207,50 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, } +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perlre_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t,ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (I32)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + /* * Perl_re_op_compile - the perl internal RE engine's function to compile a * regular expression into internal code. @@ -5249,7 +5303,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, dVAR; REGEXP *rx; struct regexp *r; - register regexp_internal *ri; + regexp_internal *ri; STRLEN plen; char * VOL exp; char* xend; @@ -5311,9 +5365,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); - PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); - PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); @@ -5688,7 +5739,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, exp, plen); if (!runtime_code) { - ReREFCNT_inc(old_re); if (used_setjump) { JMPENV_POP; } @@ -6168,105 +6218,56 @@ reStudy: scan_commit(pRExC_state, &data,&minlen,0); SvREFCNT_dec(data.last_found); - /* Note that code very similar to this but for anchored string - follows immediately below, changes may need to be made to both. - Be careful. - */ longest_float_length = CHR_SVLEN(data.longest_float); - if (longest_float_length - || (data.flags & SF_FL_BEFORE_EOL - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) - { - I32 t,ml; - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) - || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) - goto remove_float; /* As in (a)+. */ - - /* copy the information about the longest float from the reg_scan_data - over to the program. */ - if (SvUTF8(data.longest_float)) { - r->float_utf8 = data.longest_float; - r->float_substr = NULL; - } else { - r->float_substr = data.longest_float; - r->float_utf8 = NULL; - } - /* float_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_float ? *(data.minlen_float) - : (I32)longest_float_length; - r->float_end_shift = ml - data.offset_float_min - - longest_float_length + (SvTAIL(data.longest_float) != 0) - + data.lookbehind_float; + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + data.flags & SF_FL_BEFORE_EOL, + data.flags & SF_FL_BEFORE_MEOL)) + { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; - - t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { - remove_float: r->float_substr = r->float_utf8 = NULL; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } - /* Note that code very similar to this but for floating string - is immediately above, changes may need to be made to both. - Be careful. - */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) - && (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) ) + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + data.flags & SF_FIX_BEFORE_EOL, + data.flags & SF_FIX_BEFORE_MEOL)) { - I32 t,ml; - - /* copy the information about the longest fixed - from the reg_scan_data over to the program. */ - if (SvUTF8(data.longest_fixed)) { - r->anchored_utf8 = data.longest_fixed; - r->anchored_substr = NULL; - } else { - r->anchored_substr = data.longest_fixed; - r->anchored_utf8 = NULL; - } - /* fixed_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_fixed ? *(data.minlen_fixed) - : (I32)longest_fixed_length; - r->anchored_end_shift = ml - data.offset_fixed - - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) - + data.lookbehind_fixed; r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; - - t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { r->anchored_substr = r->anchored_utf8 = NULL; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } + if (ri->regstclass && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; @@ -7312,8 +7313,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end #ifndef PERL_IN_XSUB_RE -STATIC IV -S_invlist_search(pTHX_ SV* const invlist, const UV cp) +IV +Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the @@ -7324,7 +7325,7 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) IV high = invlist_len(invlist); const UV * const array = invlist_array(invlist); - PERL_ARGS_ASSERT_INVLIST_SEARCH; + PERL_ARGS_ASSERT__INVLIST_SEARCH; /* If list is empty or the code point is before the first element, return * failure. */ @@ -7378,7 +7379,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV array = invlist_array(invlist); /* Find which element it is */ - i = invlist_search(invlist, start); + i = _invlist_search(invlist, start); /* We populate from to */ while (current < end) { @@ -7941,6 +7942,18 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) #endif +PERL_STATIC_INLINE bool +S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = _invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { return _add_range_to_invlist(invlist, cp, cp); @@ -8183,6 +8196,77 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) } #endif +#if 0 +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + UV* array_a = invlist_array(a); + UV* array_b = invlist_array(b); + UV len_a = invlist_len(a); + UV len_b = invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later, and clear the flag as we don't have to do anything + * else later */ + + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + if (complement_b) { + array_b[0] = 1; + } + return retval; +} +#endif + #undef HEADER_LENGTH #undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE @@ -8215,11 +8299,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { dVAR; - register regnode *ret; /* Will be the head of the group. */ - register regnode *br; - register regnode *lastbr; - register regnode *ender = NULL; - register I32 parno = 0; + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; I32 flags; U32 oregflags = RExC_flags; bool have_branch = 0; @@ -8390,7 +8474,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -9225,9 +9309,9 @@ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) { dVAR; - register regnode *ret; - register regnode *chain = NULL; - register regnode *latest; + regnode *ret; + regnode *chain = NULL; + regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; @@ -9298,9 +9382,9 @@ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret; - register char op; - register char *next; + regnode *ret; + char op; + char *next; I32 flags; const char * const origparse = RExC_parse; I32 min; @@ -9489,86 +9573,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } - -/* reg_namedseq(pRExC_state,UVp, UV depth) +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +{ - This is expected to be called by a parser routine that has - recognized '\N' and needs to handle the rest. RExC_parse is - expected to point at the first char following the N at the time - of the call. + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. - The \N may be inside (indicated by valuep not being NULL) or outside a + The \N may be inside (indicated by the boolean ) or outside a character class. \N may begin either a named sequence, or if outside a character class, mean to match a non-newline. For non single-quoted regexes, the tokenizer has - attempted to decide which, and in the case of a named sequence converted it + attempted to decide which, and in the case of a named sequence, converted it into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, where c1... are the characters in the sequence. For single-quoted regexes, the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those. The net effect is that if the - beginning of the passed-in pattern isn't '{U+' or there is no '}', it - signals that this \N occurrence means to match a non-newline. - + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + Only the \N{U+...} form should occur in a character class, for the same reason that '.' inside a character class means to just match a period: it just doesn't make sense. - - If valuep is non-null then it is assumed that we are parsing inside - of a charclass definition and the first codepoint in the resolved - string is returned via *valuep and the routine will return NULL. - In this mode if a multichar string is returned from the charnames - handler, a warning will be issued, and only the first char in the - sequence will be examined. If the string returned is zero length - then the value of *valuep is undefined and NON-NULL will - be returned to indicate failure. (This will NOT be a valid pointer - to a regnode.) - - If valuep is null then it is assumed that we are parsing normal text and a - new EXACT node is inserted into the program containing the resolved string, - and a pointer to the new node is returned. But if the string is zero length - a NOTHING node is emitted instead. - On success RExC_parse is set to the char following the endbrace. - Parsing failures will generate a fatal error via vFAIL(...) + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. */ -STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) -{ + char * endbrace; /* '}' following the name */ - regnode *ret = NULL; char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_REG_NAMEDSEQ; + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + /* The [^\n] meaning of \N ignores spaces and comments under the /x * modifier. The other meaning does not */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; - + /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ if (*p != '{' || regcurly(p)) { RExC_parse = p; - if (valuep) { + if (! node_p) { /* no bare \N in a charclass */ - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } nextchar(pRExC_state); - ret = reg_node(pRExC_state, REG_ANY); + *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; RExC_parse--; - Set_Node_Length(ret, 1); /* MJD */ - return ret; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; } - /* Here, we have decided it should be a named sequence */ + /* Here, we have decided it should be a named character or sequence */ /* The test above made sure that the next real character is a '{', but * under the /x modifier, it could be separated by space (or a comment and @@ -9590,44 +9683,48 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept } if (endbrace == RExC_parse) { /* empty: \N{} */ - if (! valuep) { - RExC_parse = endbrace + 1; - return reg_node(pRExC_state,NOTHING); - } - - if (SIZE_ONLY) { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class" - ); - RExC_parse = endbrace + 1; + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + } + ret = FALSE; } - *valuep = 0; - return (regnode *) &RExC_parse; /* Invalid regnode pointer */ + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; } - REQUIRE_UTF8; /* named sequences imply Unicode semantics */ + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ - if (valuep) { /* In a bracketed char class */ - /* We only pay attention to the first char of - multichar strings being returned. I kinda wonder + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. XXX Solution is to recharacterize as [rest-of-class]|multi1|multi2... */ - STRLEN length_of_hex; - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - - char * endchar = RExC_parse + strcspn(RExC_parse, ".}"); - if (endchar < endbrace) { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); - } - length_of_hex = (STRLEN)(endchar - RExC_parse); - *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); /* The tokenizer should have guaranteed validity, but it's possible to * bypass it by using single quoting, so check */ @@ -9639,16 +9736,26 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept ? UTF8SKIP(RExC_parse) : 1; /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) RExC_parse = endchar; + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } + } - RExC_parse = endbrace + 1; - if (endchar == endbrace) return NULL; + if (in_char_class && has_multiple_chars) { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { - ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; } - else { /* Not a char class */ + else { /* What is done here is to convert this to a sub-pattern of the form * (?:\x{char1}\x{char2}...) @@ -9661,16 +9768,11 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); STRLEN len; - char *endchar; /* Points to '.' or '}' ending cur char in the input - stream */ char *orig_end = RExC_end; + I32 flags; while (RExC_parse < endbrace) { - /* Code points are separated by dots. If none, there is only one - * code point, and is terminated by the brace */ - endchar = RExC_parse + strcspn(RExC_parse, ".}"); - /* Convert to notation the rest of the code understands */ sv_catpv(substitute_parse, "\\x{"); sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); @@ -9678,6 +9780,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); } sv_catpv(substitute_parse, ")"); @@ -9692,16 +9795,17 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* The values are Unicode, and therefore not subject to recoding */ RExC_override_recoding = 1; - ret = reg(pRExC_state, 1, flagp, depth+1); + *node_p = reg(pRExC_state, 1, &flags, depth+1); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; - nextchar(pRExC_state); + nextchar(pRExC_state); } - return ret; + return TRUE; } @@ -9738,6 +9842,93 @@ S_reg_recode(pTHX_ const char value, SV **encp) return uv; } +PERL_STATIC_INLINE U8 +S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If is non-zero, this function assumes that the node has already + * been populated, and just does the sizing. In this case + * should be the final code point that has already been placed into the + * node. This value will be ignored except that under some circumstances + * <*flagp> is set based on it. + * + * If and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with , if + * is 0. In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, UTF characters and the Latin Sharp S must be + * folded (the latter only when the rules indicate it can match 'ss') */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + if (! len_passed_in) { + if (UTF) { + if (FOLD) { + to_uni_fold(NATIVE_TO_UNI(code_point), character, &len); + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } + else if (! FOLD + || code_point != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED + || ! AT_LEAST_UNI_SEMANTICS) + { + *character = (U8) code_point; + len = 1; + } + else { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + if (len == 1 && UNI_IS_INVARIANT(code_point)) + *flagp |= SIMPLE; +} /* - regatom - the lowest level @@ -9803,7 +9994,7 @@ STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret = NULL; + regnode *ret = NULL; I32 flags; char *parse_start = RExC_parse; U8 op; @@ -9851,13 +10042,12 @@ tryagain: case '[': { char * const oregcomp_parse = ++RExC_parse; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); } nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; } @@ -10068,7 +10258,7 @@ tryagain: } RExC_parse--; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); RExC_end = oldregxend; RExC_parse--; @@ -10076,16 +10266,24 @@ tryagain: Set_Node_Offset(ret, parse_start + 2); Set_Node_Cur_Length(ret); nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; } break; case 'N': - /* Handle \N and \N{NAME} here and not below because it can be - multicharacter. join_exact() will join them up later on. - Also this makes sure that things like /\N{BLAH}+/ and - \N{BLAH} being multi char Just Happen. dmq*/ + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL, flagp, depth); + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) { + RExC_parse--; + goto defchar; + } break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -10115,7 +10313,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -10186,7 +10384,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? REFFA : (AT_LEAST_UNI_SEMANTICS) ? REFFU @@ -10230,33 +10428,30 @@ tryagain: RExC_parse++; defchar: { - register STRLEN len; - register UV ender; - register char *p; + STRLEN len = 0; + UV ender; + char *p; char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; STRLEN foldlen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; U8 node_type; - - /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, - * it is folded to 'ss' even if not utf8 */ - bool is_exactfu_sharp_s; + bool next_is_quantifier; + char * oldp; ender = 0; - if (! FOLD) { - node_type = EXACT; - } - else { - node_type = get_regex_charset(RExC_flags); - if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) { - node_type--; /* /a is same as /u, and map /aa's offset to - what /a's would have been, so there is no - hole */ - } - node_type += EXACTF; - } + node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); - s = STRING(ret); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10276,11 +10471,11 @@ tryagain: * could back off to end with only a code point that isn't such a * non-final, but it is possible for there not to be any in the * entire node. */ - for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; len++) { - char * const oldp = p; + oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); @@ -10316,7 +10511,6 @@ tryagain: case 'g': case 'G': /* generic-backref, pos assertion */ case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ - case 'N': /* named char sequence */ case 'p': case 'P': /* Unicode property */ case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ @@ -10334,6 +10528,22 @@ tryagain: ender = '\n'; p++; break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE)) + { + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; case 'r': ender = '\r'; p++; @@ -10479,150 +10689,270 @@ tryagain: break; } /* End of switch on the literal */ - is_exactfu_sharp_s = (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S); + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if ((UTF && FOLD) || is_exactfu_sharp_s) { - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *tmpbuf = (U8) ender; - foldlen = 1; - } else { - *tmpbuf = UTF8_TWO_BYTE_HI(ender); - *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } - } - else if (isASCII(ender)) { /* Note: Here can't also be LOC - */ - ender = toLOWER(ender); - *tmpbuf = (U8) ender; - foldlen = 1; - } - else if (! MORE_ASCII_RESTRICTED && ! LOC) { - /* Locale and /aa require more selectivity about the - * fold, so are handled below. Otherwise, here, just - * use the fold */ - ender = toFOLD_uni(ender, tmpbuf, &foldlen); - } - else { - /* Under locale rules or /aa we are not to mix, - * respectively, ords < 256 or ASCII with non-. So - * reject folds that mix them, using only the - * non-folded code point. So do the fold to a - * temporary, and inspect each character in it. */ - U8 trialbuf[UTF8_MAXBYTES_CASE+1]; - U8* s = trialbuf; - UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); - U8* e = s + foldlen; - bool fold_ok = TRUE; - - while (s < e) { - if (isASCII(*s) - || (LOC && (UTF8_IS_INVARIANT(*s) - || UTF8_IS_DOWNGRADEABLE_START(*s)))) - { - fold_ok = FALSE; - break; - } - s += UTF8SKIP(s); - } - if (fold_ok) { - Copy(trialbuf, tmpbuf, foldlen, U8); - ender = tmpender; - } - else { - uvuni_to_utf8(tmpbuf, ender); - foldlen = UNISKIP(ender); - } - } - } - if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ - if (len) - p = oldp; - else if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - - /* tmpbuf has been constructed by us, so we - * know it is valid utf8 */ - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } - else { - len++; - REGC((char)ender, s++); - } - break; + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (FOLD) { + if (UTF + /* See comments for join_exact() as to why we fold + * this non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S)) + { + + + /* Prime the casefolded buffer. Locale rules, which + * apply only to code points < 256, aren't known until + * execution, so for them, just output the original + * character using utf8. If we start to fold non-UTF + * patterns, be sure to update join_exact() */ + if (LOC && ender < 256) { + if (UNI_IS_INVARIANT(ender)) { + *s = (U8) ender; + foldlen = 1; + } else { + *s = UTF8_TWO_BYTE_HI(ender); + *(s + 1) = UTF8_TWO_BYTE_LO(ender); + foldlen = 2; + } + } + else { + ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + } + s += foldlen; + + /* The loop increments each time, as all but this + * path (and the one just below for UTF) through it add + * a single byte to the EXACTish node. But this one + * has changed len to be the correct final value, so + * subtract one to cancel out the increment that + * follows */ + len += foldlen - 1; + } + else { + *(s++) = ender; + } } - if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - len += unilen; - s += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - len--; + else if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* See comment just above for - 1 */ + len--; } else { REGC((char)ender, s++); + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; } - } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* These two have no multi-char folds to non-UTF characters + */ + if (ASCII_FOLD_RESTRICTED || LOC) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* No Latin1 characters participate in multi-char + * folds under /l */ + if (LOC + || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + loopdone: /* Jumped to when encounters something that shouldn't be in the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else{ + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + } + RExC_parse = p - 1; Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); @@ -10632,20 +10962,10 @@ tryagain: if (iv < 0) vFAIL("Internal disaster"); } - if (len > 0) - *flagp |= HASWIDTH; - if (len == 1 && UNI_IS_INVARIANT(ender)) - *flagp |= SIMPLE; - if (SIZE_ONLY) - RExC_size += STR_SZ(len); - else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } - } + } /* End of label 'defchar:' */ break; - } + } /* End of giant switch on input character */ return(ret); } @@ -10997,6 +11317,10 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) * changed since initialization, then there is a run-time definition. */ #define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((class) / 2) + /* parse a class specification and produce either an ANYOF node that matches the pattern or perhaps will be optimized into an EXACTish node @@ -11005,14 +11329,14 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) above 255, a range list is used */ STATIC regnode * -S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register UV nextvalue; - register IV prevvalue = OOB_UNICODE; - register IV range = 0; - UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ - register regnode *ret; + UV nextvalue; + UV prevvalue = OOB_UNICODE; + IV range = 0; + UV value = 0; + regnode *ret; STRLEN numlen; IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; @@ -11056,7 +11380,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; #endif - UV stored = 0; /* how many chars stored in the bitmap */ bool invert = FALSE; /* Is this class to be complemented */ /* Is there any thing like \W or [:^digit:] that matches above the legal @@ -11066,6 +11389,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; + const I32 orig_size = RExC_size; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -11108,7 +11432,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; } - ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); initial_listsv_len = SvCUR(listsv); } @@ -11177,11 +11500,11 @@ parseit: if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. */ - UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v, NULL, depth)) { + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE /* => charclass */)) + { goto parseit; } - value= v; } break; case 'p': @@ -11208,7 +11531,6 @@ parseit: n = 1; } if (!SIZE_ONLY) { - SV** invlistsvp; SV* invlist; char* name; @@ -11250,10 +11572,7 @@ parseit: if ( ! swash || ! SvROK(swash) || ! SvTYPE(SvRV(swash)) == SVt_PVHV - || ! (invlistsvp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "INVLIST", FALSE)) - || ! (invlist = *invlistsvp)) + || ! (invlist = _get_swash_invlist(swash))) { if (swash) { SvREFCNT_dec(swash); @@ -11661,7 +11980,7 @@ parseit: } /* end of namedclass \blah */ if (range) { - if (prevvalue > (IV)value) /* b-a */ { + if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); range = 0; /* not a valid range */ @@ -11685,8 +12004,10 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); } - if (!SIZE_ONLY) + if (!SIZE_ONLY) { cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ @@ -11738,6 +12059,7 @@ parseit: * Check if this is the case for this class */ if (element_count == 1) { U8 op = END; + U8 arg = 0; if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or [:digit:] or \p{foo} */ @@ -11797,6 +12119,7 @@ parseit: if (invert) { op += NALNUM - ALNUM; } + *flagp |= HASWIDTH|SIMPLE; break; /* The second group doesn't depend of the charset modifiers. @@ -11805,7 +12128,9 @@ parseit: invert = ! invert; /* FALLTHROUGH */ case ANYOF_HORIZWS: + is_horizws: op = (invert) ? NHORIZWS : HORIZWS; + *flagp |= HASWIDTH|SIMPLE; break; case ANYOF_NVERTWS: @@ -11813,14 +12138,63 @@ parseit: /* FALLTHROUGH */ case ANYOF_VERTWS: op = (invert) ? NVERTWS : VERTWS; + *flagp |= HASWIDTH|SIMPLE; break; + case ANYOF_MAX: + break; + case ANYOF_NBLANK: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_BLANK: + if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) { + goto is_horizws; + } + /* FALLTHROUGH */ + default: + /* A generic posix class. All the /a ones can be handled + * by the POSIXA opcode. And all are closed under folding + * in the ASCII range, so FOLD doesn't matter */ + if (AT_LEAST_ASCII_RESTRICTED + || (! LOC && namedclass == ANYOF_ASCII)) + { + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + op = (invert) ? NPOSIXA : POSIXA; + } + break; } } - else if (! LOC) { - if (prevvalue == '0' && value == '9') { - op = (invert) ? NDIGITA : DIGITA; + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + op = (invert) ? NDIGITA : DIGITA; + *flagp |= HASWIDTH|SIMPLE; + } } } @@ -11833,10 +12207,32 @@ parseit: * the parse */ const char * cur_parse = RExC_parse; RExC_parse = (char *)orig_parse; - RExC_emit = (regnode *)orig_emit; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_CLASS_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + } ret = reg_node(pRExC_state, op); + if (PL_regkind[op] == POSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + RExC_parse = (char *) cur_parse; SvREFCNT_dec(listsv); @@ -11846,7 +12242,7 @@ parseit: if (SIZE_ONLY) return ret; - /****** !SIZE_ONLY AFTER HERE *********/ + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ @@ -11870,7 +12266,7 @@ parseit: if (! PL_utf8_foldable) { SV* swash = swash_init("utf8", "_Perl_Any_Folds", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); + PL_utf8_foldable = _get_swash_invlist(swash); SvREFCNT_dec(swash); } @@ -11954,7 +12350,7 @@ parseit: } if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! MORE_ASCII_RESTRICTED)) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { /* Certain Latin1 characters have matches outside * Latin1, or are multi-character. To get here, 'j' is @@ -11994,9 +12390,9 @@ parseit: break; case MICRO_SIGN: cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, GREEK_CAPITAL_LETTER_MU); + cp_list = add_cp_to_invlist(cp_list, + GREEK_SMALL_LETTER_MU); break; case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: @@ -12013,7 +12409,7 @@ parseit: /* Under /a, /d, and /u, this can match the two * chars "ss" */ - if (! MORE_ASCII_RESTRICTED) { + if (! ASCII_FOLD_RESTRICTED) { add_alternate(&unicode_alternate, (U8 *) "ss", 2); @@ -12058,7 +12454,7 @@ parseit: ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) | ((LOC) ? FOLD_FLAGS_LOCALE - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); @@ -12117,7 +12513,7 @@ parseit: /* /aa doesn't allow folds between ASCII and non-; * /l doesn't allow them between above and below * 256 */ - if ((MORE_ASCII_RESTRICTED + if ((ASCII_FOLD_RESTRICTED && (isASCII(c) != isASCII(j))) || (LOC && ((c < 256) != (j < 256)))) { @@ -12146,7 +12542,7 @@ parseit: * fold the classes (folding of those is automatically handled by the swash * fetching code) */ if (posixes) { - if (AT_LEAST_UNI_SEMANTICS) { + if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); SvREFCNT_dec(posixes); @@ -12156,7 +12552,6 @@ parseit: } } else { - /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; @@ -12236,15 +12631,14 @@ parseit: * Now we can see about various optimizations. Fold calculation (which we * did above) needs to take place before inversion. Otherwise /[^k]/i * would invert to include K, which under /i would match k, which it - * shouldn't. */ + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ - /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this does optimize those. It doesn't - * optimize locale. Doing so perhaps could be done as long as there is - * nothing like \w in it; some thought also would have to be given to the - * interaction with above 0x100 chars */ + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't invert + * if there are things such as \w, which aren't known until runtime */ if (invert - && ! LOC + && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) && ! depends_list && ! unicode_alternate && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) @@ -12261,10 +12655,151 @@ parseit: invert = FALSE; } + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching) */ + if (FOLD && (LOC || unicode_alternate)) + { + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! unicode_alternate + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACT node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. + * In the Latin1 range, being an alpha means that the + * character participates in a fold (except for the + * feminine and masculine ordinals, which I (khw) don't + * think are worrying about optimizing for). */ + if (value < 256) { + if (isALPHA_L1(value)) { + op = EXACT; + } + } + else { + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + + SvREFCNT_dec(listsv); + return ret; + } + } + /* Here, contains all the code points we can determine at * compile time that match under all conditions. Go through it, and * for things that belong in the bitmap, put them there, and delete from - * */ + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + ANYOF_BITMAP_ZERO(ret); if (cp_list) { /* This gets set if we actually need to modify things */ @@ -12278,6 +12813,10 @@ parseit: UV high; int i; + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + /* Quit if are above what we should change */ if (start > 255) { break; @@ -12290,7 +12829,6 @@ parseit: for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - stored++; prevvalue = value; value = i; } @@ -12314,7 +12852,9 @@ parseit: ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - /* Combine the two lists into one. */ + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ if (depends_list) { if (cp_list) { _invlist_union(cp_list, depends_list, &cp_list); @@ -12325,121 +12865,13 @@ parseit: } } - /* Folding in the bitmap is taken care of above, but not for locale (for - * which we have to wait to see what folding is in effect at runtime), and - * for some things not in the bitmap (only the upper latin folds in this - * case, as all other single-char folding has been set above). Set - * run-time fold flag for these */ - if (FOLD && (LOC - || (DEPENDS_SEMANTICS - && cp_list - && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - || unicode_alternate)) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; - } - - /* A single character class can be "optimized" into an EXACTish node. - * Note that since we don't currently count how many characters there are - * outside the bitmap, we are XXX missing optimization possibilities for - * them. This optimization can't happen unless this is a truly single - * character class, which means that it can't be an inversion into a - * many-character class, and there must be no possibility of there being - * things outside the bitmap. 'stored' (only) for locales doesn't include - * \w, etc, so have to make a special test that they aren't present - * - * Similarly A 2-character class of the very special form like [bB] can be - * optimized into an EXACTFish node, but only for non-locales, and for - * characters which only have the two folds; so things like 'fF' and 'Ii' - * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE - * FI'. */ - if (! cp_list - && ! unicode_alternate - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION - && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) - && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) - || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) - /* If the latest code point has a fold whose - * bit is set, it must be the only other one */ - && ((prevvalue = PL_fold_latin1[value]) != (IV)value) - && ANYOF_BITMAP_TEST(ret, prevvalue))))) - { - /* Note that the information needed to decide to do this optimization - * is not currently available until the 2nd pass, and that the actually - * used EXACTish node takes less space than the calculated ANYOF node, - * and hence the amount of space calculated in the first pass is larger - * than actually used, so this optimization doesn't gain us any space. - * But an EXACT node is faster than an ANYOF node, and can be combined - * with any adjacent EXACT nodes later by the optimizer for further - * gains. The speed of executing an EXACTF is similar to an ANYOF - * node, so the optimization advantage comes from the ability to join - * it to adjacent EXACT nodes */ - - const char * cur_parse= RExC_parse; - U8 op; - RExC_emit = (regnode *)orig_emit; - RExC_parse = (char *)orig_parse; - - if (stored == 1) { - - /* A locale node with one point can be folded; all the other cases - * with folding will have two points, since we calculate them above - */ - if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { - op = EXACTFL; - } - else { - op = EXACT; - } - } - else { /* else 2 chars in the bit map: the folds of each other */ - - /* Use the folded value, which for the cases where we get here, - * is just the lower case of the current one (which may resolve to - * itself, or to the other one */ - value = toLOWER_LATIN1(value); - - /* To join adjacent nodes, they must be the exact EXACTish type. - * Try to use the most likely type, by using EXACTFA if possible, - * then EXACTFU if the regex calls for it, or is required because - * the character is non-ASCII. (If is ASCII, its fold is - * also ASCII for the cases where we get here.) */ - if (MORE_ASCII_RESTRICTED && isASCII(value)) { - op = EXACTFA; - } - else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { - op = EXACTFU; - } - else { /* Otherwise, more likely to be EXACTF type */ - op = EXACTF; - } - } - - ret = reg_node(pRExC_state, op); - RExC_parse = (char *)cur_parse; - if (UTF && ! NATIVE_IS_INVARIANT(value)) { - *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); - *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); - STR_LEN(ret)= 2; - RExC_emit += STR_SZ(2); - } - else { - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); - } - SvREFCNT_dec(listsv); - return ret; - } - /* If there is a swash and more than one element, we can't use the swash in * the optimization below. */ if (swash && element_count > 1) { SvREFCNT_dec(swash); swash = NULL; } + if (! cp_list && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION && ! unicode_alternate) @@ -12495,6 +12927,8 @@ parseit: RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); } + + *flagp |= HASWIDTH|SIMPLE; return ret; } #undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION @@ -12588,7 +13022,7 @@ STATIC regnode * /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12630,7 +13064,7 @@ STATIC regnode * /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12702,9 +13136,9 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) { dVAR; - register regnode *src; - register regnode *dst; - register regnode *place; + regnode *src; + regnode *dst; + regnode *place; const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; @@ -12790,7 +13224,7 @@ STATIC void S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGTAIL; @@ -12849,7 +13283,7 @@ STATIC U8 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; U8 exact = PSEUDO; #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; @@ -13090,7 +13524,41 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { #ifdef DEBUGGING dVAR; - register int k; + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -13211,39 +13679,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", - "[:alpha:]", - "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", - "[:lower:]", - "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", - "[:upper:]", - "[:^upper:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:blank:]", - "[:^blank:]" - }; if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); @@ -13381,6 +13816,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } + else if (k == POSIXD) { + U8 index = FLAGS(o) * 2; + if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + else { + sv_catpv(sv, anyofs[index]); + } + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); #else @@ -13884,7 +14328,7 @@ regnode * Perl_regnext(pTHX_ register regnode *p) { dVAR; - register I32 offset; + I32 offset; if (!p) return(NULL); @@ -14043,8 +14487,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, SV* sv, I32 indent, U32 depth) { dVAR; - register U8 op = PSEUDO; /* Arbitrary non-END op. */ - register const regnode *next; + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; const regnode *optstart= NULL; RXi_GET_DECL(r,ri); @@ -14095,9 +14539,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); { - register const regnode *nnode = (OP(next) == LONGJMP - ? regnext((regnode *)next) - : next); + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); if (last && nnode > last) nnode = last; DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);