X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/5073ffbd0df5f82154fd580e53686ef82b68748d..8769f413b9f20a6cc3b610fc5fa588e3593b5295:/regcomp.c diff --git a/regcomp.c b/regcomp.c index a05b853..8cef832 100644 --- a/regcomp.c +++ b/regcomp.c @@ -81,16 +81,24 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" +extern const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif #include "dquote_static.c" -#ifndef PERL_IN_XSUB_RE -# include "charclass_invlists.h" +#include "charclass_invlists.h" +#include "inline_invlist.c" +#include "unicode_constants.h" + +#ifdef HAS_ISBLANK +# define hasISBLANK 1 +#else +# define hasISBLANK 0 #endif #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #ifdef op #undef op @@ -220,7 +228,7 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single +/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single * character, and if utf8, must be invariant. Note that this is not the same * thing as REGNODE_SIMPLE */ #define SIMPLE 0x02 @@ -289,8 +297,8 @@ typedef struct RExC_state_t { string can occur infinitely far to the right. - minlenp - A pointer to the minimum length of the pattern that the string - was found inside. This is important as in the case of positive + A pointer to the minimum number of characters of the pattern that the + string was found inside. This is important as in the case of positive lookahead or positive lookbehind we can have multiple patterns involved. Consider @@ -398,14 +406,18 @@ static const scan_data_t zero_scan_data = #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) -#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -2578,44 +2590,45 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * one, and looks for problematic sequences of characters whose folds vs. * non-folds have sufficiently different lengths, that the optimizer would be * fooled into rejecting legitimate matches of them, and the trie construction - * code can't cope with them. The joining is only done if: + * code needs to handle specially. The joining is only done if: * 1) there is room in the current conglomerated node to entirely contain the * next one. * 2) they are the exact same node type * - * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * The adjacent nodes actually may be separated by NOTHING-kind nodes, and * these get optimized out * * If there are problematic code sequences, *min_subtract is set to the delta - * that the minimum size of the node can be less than its actual size. And, - * the node type of the result is changed to reflect that it contains these - * sequences. + * number of characters that the minimum size of the node can be less than its + * actual size. And, the node type of the result is changed to reflect that it + * contains these sequences. * * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF * and contains LATIN SMALL LETTER SHARP S * * This is as good a place as any to discuss the design of handling these * problematic sequences. It's been wrong in Perl for a very long time. There - * are three code points in Unicode whose folded lengths differ so much from - * the un-folded lengths that it causes problems for the optimizer and trie - * construction. Why only these are problematic, and not others where lengths - * also differ is something I (khw) do not understand. New versions of Unicode - * might add more such code points. Hopefully the logic in fold_grind.t that - * figures out what to test (in part by verifying that each size-combination - * gets tested) will catch any that do come along, so they can be added to the - * special handling below. The chances of new ones are actually rather small, - * as most, if not all, of the world's scripts that have casefolding have - * already been encoded by Unicode. Also, a number of Unicode's decisions were - * made to allow compatibility with pre-existing standards, and almost all of - * those have already been dealt with. These would otherwise be the most - * likely candidates for generating further tricky sequences. In other words, - * Unicode by itself is unlikely to add new ones unless it is for compatibility - * with pre-existing standards, and there aren't many of those left. + * are three code points currently in Unicode whose folded lengths differ so + * much from the un-folded lengths that it causes problems for the optimizer + * and trie construction. Why only these are problematic, and not others where + * lengths also differ is something I (khw) do not understand. New versions of + * Unicode might add more such code points. Hopefully the logic in + * fold_grind.t that figures out what to test (in part by verifying that each + * size-combination gets tested) will catch any that do come along, so they can + * be added to the special handling below. The chances of new ones are + * actually rather small, as most, if not all, of the world's scripts that have + * casefolding have already been encoded by Unicode. Also, a number of + * Unicode's decisions were made to allow compatibility with pre-existing + * standards, and almost all of those have already been dealt with. These + * would otherwise be the most likely candidates for generating further tricky + * sequences. In other words, Unicode by itself is unlikely to add new ones + * unless it is for compatibility with pre-existing standards, and there aren't + * many of those left. * * The previous designs for dealing with these involved assigning a special * node for them. This approach doesn't work, as evidenced by this example: * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches - * Both these fold to "sss", but if the pattern is parsed to create a node of + * Both these fold to "sss", but if the pattern is parsed to create a node * that would match just the \xDF, it won't be able to handle the case where a * successful match would have to cross the node's boundary. The new approach * that hopefully generally solves the problem generates an EXACTFU_SS node @@ -2630,9 +2643,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * problematic sequences. This delta is used by the caller to adjust the * min length of the match, and the delta between min and max, so that the * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences are not currently correctly handled by the trie code - * either, so it changes the joined node type to ops that are not handled - * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 2) These sequences require special handling by the trie code, so this code + * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and + * EXACTFU_SS. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -2642,22 +2655,21 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * itself with length changes, and so can be processed faster. regexec.c * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 * is pre-folded by regcomp.c. This saves effort in regex matching. - * However, probably mostly for historical reasons, the pre-folding isn't - * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL - * nodes, as what they fold to isn't known until runtime.) The fold - * possibilities for the non-UTF8 patterns are quite simple, except for - * the sharp s. All the ones that don't involve a UTF-8 target string - * are members of a fold-pair, and arrays are set up for all of them - * that quickly find the other member of the pair. It might actually - * be faster to pre-fold these, but it isn't currently done, except for - * the sharp s. Code elsewhere in this file makes sure that it gets - * folded to 'ss', even if the pattern isn't UTF-8. This avoids the - * issues described in the next item. + * However, the pre-folding isn't done for non-UTF8 patterns because the + * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL + * nodes fold to isn't known until runtime. The fold possibilities for + * the non-UTF8 patterns are quite simple, except for the sharp s. All + * the ones that don't involve a UTF-8 target string are members of a + * fold-pair, and arrays are set up for all of them so that the other + * member of the pair can be found quickly. Code elsewhere in this file + * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even + * if the pattern isn't UTF-8. This avoids the issues described in the + * next item. * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches * 'ss' or not is not knowable at compile time. It will match iff the * target string is in UTF-8, unlike the EXACTFU nodes, where it always * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus - * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * it can't be folded to "ss" at compile time, unlike EXACTFU does (as * described in item 3). An assumption that the optimizer part of * regexec.c (probably unwittingly) makes is that a character in the * pattern corresponds to at most a single character in the target string. @@ -2729,6 +2741,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); + /* XXX I (khw) kind of doubt that this works on platforms where + * U8_MAX is above 255 because of lots of other assumptions */ if (oldl + STR_LEN(n) > U8_MAX) break; @@ -2804,32 +2818,26 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 * * This means that in case-insensitive matching (or "loose - * matching", as Unicode calls it), an EXACTF of length six (the - * UTF-8 encoded byte length of the above casefolded versions) can - * match a target string of length two (the byte length of UTF-8 - * encoded U+0390 or U+03B0). This would rather mess up the - * minimum length computation. (there are other code points that - * also fold to these two sequences, but the delta is smaller) + * matching", as Unicode calls it), an EXACTF of length 3 chars can + * match a target string of length 1 char. This would rather mess + * up the minimum length computation. * * If these sequences are found, the minimum length is decreased by - * four (six minus two). + * two. * * Similarly, 'ss' may match the single char and byte LATIN SMALL * LETTER SHARP S. We decrease the min length by 1 for each * occurrence of 'ss' found */ -#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ -# define U390_first_byte 0xb4 - const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; -# define U3B0_first_byte 0xb5 - const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; -#else -# define U390_first_byte 0xce - const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; -# define U3B0_first_byte 0xcf - const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; -#endif - const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; +#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE +#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE + const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8; + const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL + COMBINING_DIAERESIS_UTF8 + COMBINING_ACUTE_ACCENT_UTF8; + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; yields a net of 0 */ /* Examine the string for one of the problematic sequences */ for (s = s0; @@ -2859,7 +2867,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } break; - case U390_first_byte: + case U390_FIRST_BYTE: if (s_end - s >= len /* The 1's are because are skipping comparing the @@ -2870,16 +2878,16 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b } break; - case U3B0_first_byte: + case U3B0_FIRST_BYTE: if (! (s_end - s >= len && memEQ(s + 1, U3B0_tail, len - 1))) { break; } greek_sequence: - *min_subtract += 4; + *min_subtract += 2; - /* This can't currently be handled by trie's, so change + /* This requires special handling by trie's, so change * the node type to indicate this. If EXACTFA and * EXACTFL were ever to be handled by trie's, this * would have to be changed. If this node has already @@ -2915,9 +2923,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b /* EXACTF nodes need to know that the minimum * length changed so that a sharp s in the string * can match this ss in the pattern, but they - * remain EXACTF nodes, as they are not trie'able, - * so don't have to invent a new node type to - * exclude them from the trie code */ + * remain EXACTF nodes, as they won't match this + * unless the target string is is UTF-8, which we + * don't know until runtime */ if (OP(scan) != EXACTF) { OP(scan) = EXACTFU_SS; } @@ -3020,7 +3028,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { dVAR; - I32 min = 0, pars = 0, code; + I32 min = 0; /* There must be at least this number of characters to match */ + I32 pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); @@ -3047,9 +3056,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ - UV min_subtract = 0; /* How much to subtract from the minimum node - length to get a real minimum (because the - folded version may be shorter) */ + UV min_subtract = 0; /* How mmany chars to subtract from the minimum + node length to get a real minimum (because + the folded version may be shorter) */ bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); @@ -3414,7 +3423,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * trietype so we can turn them into a trie. If/when we * allow NOTHING to start a trie sequence this condition will be * required, and it isn't expensive so we leave it in for now. */ - if ( trietype != NOTHING ) + if ( trietype && trietype != NOTHING ) make_trie( pRExC_state, startbranch, first, cur, tail, count, trietype, depth+1 ); @@ -3445,7 +3454,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last ) { + if ( last && trietype ) { if ( trietype != NOTHING ) { /* the last branch of the sequence was part of a trie, * so we have to construct it here outside of the loop @@ -3657,13 +3666,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } - else if (has_exactf_sharp_s) { + if (has_exactf_sharp_s) { RExC_seen |= REG_SEEN_EXACTF_SHARP_S; } min += l - min_subtract; - if (min < 0) { - min = 0; - } + assert (min >= 0); delta += min_subtract; if (flags & SCF_DO_SUBSTR) { data->pos_min += l - min_subtract; @@ -3954,6 +3961,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && !(data->flags & SF_HAS_EVAL) && !deltanext /* atom is fixed width */ && minnext != 0 /* CURLYM can't handle zero width */ + && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */ ) { /* XXXX How to optimize if data == 0? */ /* Optimize to a simpler form. */ @@ -4203,7 +4211,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case ALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR); if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (!isWORDCHAR_L1(value)) { @@ -4221,7 +4229,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR); /* Even if under locale, set the bits for non-locale * in case it isn't a true locale-node. This will @@ -4244,7 +4252,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case NALNUM: if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR); if (OP(scan) == NALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { @@ -4262,7 +4270,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else { if (data->start_class->flags & ANYOF_LOCALE) - ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); + ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR); /* Even if under locale, set the bits for non-locale in * case it isn't a true locale-node. This will create @@ -4957,16 +4965,23 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) } #endif -/* public(ish) wrapper for Perl_re_op_compile that only takes an SV - * pattern rather than a list of OPs */ +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ REGEXP * Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) { SV *pat = pattern; /* defeat constness! */ PERL_ARGS_ASSERT_RE_COMPILE; - return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), - NULL, NULL, rx_flags, 0); + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); } /* see if there are any run-time code blocks in the pattern. @@ -5193,6 +5208,50 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, } +STATIC bool +S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol) +{ + /* This is the common code for setting up the floating and fixed length + * string data extracted from Perlre_op_compile() below. Returns a boolean + * as to whether succeeded or not */ + + I32 t,ml; + + if (! (longest_length + || (eol /* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))) + ) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + || (RExC_seen & REG_SEEN_EXACTF_SHARP_S)) + { + return FALSE; + } + + /* copy the information about the longest from the reg_scan_data + over to the program. */ + if (SvUTF8(sv_longest)) { + *rx_utf8 = sv_longest; + *rx_substr = NULL; + } else { + *rx_substr = sv_longest; + *rx_utf8 = NULL; + } + /* end_shift is how many chars that must be matched that + follow this item. We calculate it ahead of time as once the + lookbehind offset is added in we lose the ability to correctly + calculate it.*/ + ml = minlen ? *(minlen) : (I32)longest_length; + *rx_end_shift = ml - offset + - longest_length + (SvTAIL(sv_longest) != 0) + + lookbehind; + + t = (eol/* Can't have SEOL and MULTI */ + && (! meol || (RExC_flags & RXf_PMf_MULTILINE))); + fbm_compile(sv_longest, t ? FBMcf_TAIL : 0); + + return TRUE; +} + /* * Perl_re_op_compile - the perl internal RE engine's function to compile a * regular expression into internal code. @@ -5245,7 +5304,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, dVAR; REGEXP *rx; struct regexp *r; - register regexp_internal *ri; + regexp_internal *ri; STRLEN plen; char * VOL exp; char* xend; @@ -5307,9 +5366,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); - PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); - PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); @@ -5447,8 +5503,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, else { while (SvAMAGIC(msv) && (sv = AMG_CALLunary(msv, string_amg)) - && sv != msv) - { + && sv != msv + && !( SvROK(msv) + && SvROK(sv) + && SvRV(msv) == SvRV(sv)) + ) { msv = sv; SvGETMAGIC(msv); } @@ -5684,7 +5743,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, exp, plen); if (!runtime_code) { - ReREFCNT_inc(old_re); if (used_setjump) { JMPENV_POP; } @@ -6164,105 +6222,56 @@ reStudy: scan_commit(pRExC_state, &data,&minlen,0); SvREFCNT_dec(data.last_found); - /* Note that code very similar to this but for anchored string - follows immediately below, changes may need to be made to both. - Be careful. - */ longest_float_length = CHR_SVLEN(data.longest_float); - if (longest_float_length - || (data.flags & SF_FL_BEFORE_EOL - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) - { - I32 t,ml; - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) - || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) - goto remove_float; /* As in (a)+. */ - - /* copy the information about the longest float from the reg_scan_data - over to the program. */ - if (SvUTF8(data.longest_float)) { - r->float_utf8 = data.longest_float; - r->float_substr = NULL; - } else { - r->float_substr = data.longest_float; - r->float_utf8 = NULL; - } - /* float_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_float ? *(data.minlen_float) - : (I32)longest_float_length; - r->float_end_shift = ml - data.offset_float_min - - longest_float_length + (SvTAIL(data.longest_float) != 0) - + data.lookbehind_float; + if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) + && S_setup_longest (aTHX_ pRExC_state, + data.longest_float, + &(r->float_utf8), + &(r->float_substr), + &(r->float_end_shift), + data.lookbehind_float, + data.offset_float_min, + data.minlen_float, + longest_float_length, + data.flags & SF_FL_BEFORE_EOL, + data.flags & SF_FL_BEFORE_MEOL)) + { r->float_min_offset = data.offset_float_min - data.lookbehind_float; r->float_max_offset = data.offset_float_max; if (data.offset_float_max < I32_MAX) /* Don't offset infinity */ r->float_max_offset -= data.lookbehind_float; - - t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FL_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0); } else { - remove_float: r->float_substr = r->float_utf8 = NULL; SvREFCNT_dec(data.longest_float); longest_float_length = 0; } - /* Note that code very similar to this but for floating string - is immediately above, changes may need to be made to both. - Be careful. - */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ - if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) - && (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) ) + if (S_setup_longest (aTHX_ pRExC_state, + data.longest_fixed, + &(r->anchored_utf8), + &(r->anchored_substr), + &(r->anchored_end_shift), + data.lookbehind_fixed, + data.offset_fixed, + data.minlen_fixed, + longest_fixed_length, + data.flags & SF_FIX_BEFORE_EOL, + data.flags & SF_FIX_BEFORE_MEOL)) { - I32 t,ml; - - /* copy the information about the longest fixed - from the reg_scan_data over to the program. */ - if (SvUTF8(data.longest_fixed)) { - r->anchored_utf8 = data.longest_fixed; - r->anchored_substr = NULL; - } else { - r->anchored_substr = data.longest_fixed; - r->anchored_utf8 = NULL; - } - /* fixed_end_shift is how many chars that must be matched that - follow this item. We calculate it ahead of time as once the - lookbehind offset is added in we lose the ability to correctly - calculate it.*/ - ml = data.minlen_fixed ? *(data.minlen_fixed) - : (I32)longest_fixed_length; - r->anchored_end_shift = ml - data.offset_fixed - - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0) - + data.lookbehind_fixed; r->anchored_offset = data.offset_fixed - data.lookbehind_fixed; - - t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE))); - fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0); } else { r->anchored_substr = r->anchored_utf8 = NULL; SvREFCNT_dec(data.longest_fixed); longest_fixed_length = 0; } + if (ri->regstclass && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; @@ -6394,18 +6403,12 @@ reStudy: #ifdef STUPID_PATTERN_CHECKS if (RX_PRELEN(rx) == 0) r->extflags |= RXf_NULL; - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) + if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3)) r->extflags |= RXf_WHITE; else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^') r->extflags |= RXf_START_ONLY; #else - if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ') - /* XXX: this should happen BEFORE we compile */ - r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); - else { + { regnode *first = ri->program + 1; U8 fop = OP(first); @@ -6687,37 +6690,53 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, char *s = NULL; I32 i = 0; I32 s1, t1; + I32 n = paren; PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH; - if (!rx->subbeg) { - sv_setsv(sv,&PL_sv_undef); - return; - } - else - if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { - /* $` */ + if ( ( n == RX_BUFF_IDX_CARET_PREMATCH + || n == RX_BUFF_IDX_CARET_FULLMATCH + || n == RX_BUFF_IDX_CARET_POSTMATCH + ) + && !(rx->extflags & RXf_PMf_KEEPCOPY) + ) + goto ret_undef; + + if (!rx->subbeg) + goto ret_undef; + + if (n == RX_BUFF_IDX_CARET_FULLMATCH) + /* no need to distinguish between them any more */ + n = RX_BUFF_IDX_FULLMATCH; + + if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH) + && rx->offs[0].start != -1) + { + /* $`, ${^PREMATCH} */ i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { - /* $' */ - s = rx->subbeg + rx->offs[0].end; - i = rx->sublen - rx->offs[0].end; + if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH) + && rx->offs[0].end != -1) + { + /* $', ${^POSTMATCH} */ + s = rx->subbeg - rx->suboffset + rx->offs[0].end; + i = rx->sublen + rx->suboffset - rx->offs[0].end; } else - if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->offs[paren].start) != -1 && - (t1 = rx->offs[paren].end) != -1) + if ( 0 <= n && n <= (I32)rx->nparens && + (s1 = rx->offs[n].start) != -1 && + (t1 = rx->offs[n].end) != -1) { - /* $& $1 ... */ + /* $&, ${^MATCH}, $1 ... */ i = t1 - s1; - s = rx->subbeg + s1; + s = rx->subbeg + s1 - rx->suboffset; } else { - sv_setsv(sv,&PL_sv_undef); - return; + goto ret_undef; } + + assert(s >= rx->subbeg); assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { const int oldtainted = PL_tainted; @@ -6753,6 +6772,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, SvTAINTED_off(sv); } } else { + ret_undef: sv_setsv(sv,&PL_sv_undef); return; } @@ -6783,9 +6803,13 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH; /* Some of this code was originally in C in F */ - switch (paren) { - /* $` / ${^PREMATCH} */ - case RX_BUFF_IDX_PREMATCH: + switch (paren) { + case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + + case RX_BUFF_IDX_PREMATCH: /* $` */ if (rx->offs[0].start != -1) { i = rx->offs[0].start; if (i > 0) { @@ -6795,8 +6819,11 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; - /* $' / ${^POSTMATCH} */ - case RX_BUFF_IDX_POSTMATCH: + + case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + case RX_BUFF_IDX_POSTMATCH: /* $' */ if (rx->offs[0].end != -1) { i = rx->sublen - rx->offs[0].end; if (i > 0) { @@ -6806,6 +6833,12 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } } return 0; + + case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */ + if (!(rx->extflags & RXf_PMf_KEEPCOPY)) + goto warn_undef; + /*FALLTHROUGH*/ + /* $& / ${^MATCH}, $1, $2, ... */ default: if (paren <= (I32)rx->nparens && @@ -6815,6 +6848,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, i = t1 - s1; goto getlen; } else { + warn_undef: if (ckWARN(WARN_UNINITIALIZED)) report_uninit((const SV *)sv); return 0; @@ -6822,7 +6856,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv, } getlen: if (i > 0 && RXp_MATCH_UTF8(rx)) { - const char * const s = rx->subbeg + s1; + const char * const s = rx->subbeg - rx->suboffset + s1; const U8 *ep; STRLEN el; @@ -6975,9 +7009,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * list.) * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. - * This implementation reserves an element at the beginning of each inversion list - * to contain 0 when the list contains 0, and contains 1 otherwise. The actual - * beginning of the list is either that element if 0, or the next one if 1. + * This implementation reserves an element at the beginning of each inversion + * list to contain 0 when the list contains 0, and contains 1 otherwise. The + * actual beginning of the list is either that element if 0, or the next one if + * 1. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. @@ -6991,32 +7026,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * Some of the methods should always be private to the implementation, and some * should eventually be made public */ -#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ -#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ - -/* This is a combination of a version and data structure type, so that one - * being passed in can be validated to be an inversion list of the correct - * vintage. When the structure of the header is changed, a new random number - * in the range 2**31-1 should be generated and the new() method changed to - * insert that at this location. Then, if an auxiliary program doesn't change - * correspondingly, it will be discovered immediately */ -#define INVLIST_VERSION_ID_OFFSET 2 -#define INVLIST_VERSION_ID 1064334010 - -/* For safety, when adding new elements, remember to #undef them at the end of - * the inversion list code section */ - -#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ -/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list - * contains the code point U+00000, and begins here. If 1, the inversion list - * doesn't contain U+0000, and it begins at the next UV in the array. - * Inverting an inversion list consists of adding or removing the 0 at the - * beginning of it. By reserving a space for that 0, inversion can be made - * very fast */ - -#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) +/* The header definitions are in F */ -/* Internally things are UVs */ #define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV)) #define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH) @@ -7038,7 +7049,7 @@ S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; /* Must be empty */ - assert(! *get_invlist_len_addr(invlist)); + assert(! *_get_invlist_len_addr(invlist)); /* 1^1 = 0; 1^0 = 1 */ *zero = 1 ^ will_have_0; @@ -7056,7 +7067,7 @@ S_invlist_array(pTHX_ SV* const invlist) /* Must not be empty. If these fail, you probably didn't check for * being non-zero before trying to get the array */ - assert(*get_invlist_len_addr(invlist)); + assert(*_get_invlist_len_addr(invlist)); assert(*get_invlist_zero_addr(invlist) == 0 || *get_invlist_zero_addr(invlist) == 1); @@ -7067,28 +7078,6 @@ S_invlist_array(pTHX_ SV* const invlist) + *get_invlist_zero_addr(invlist)); } -PERL_STATIC_INLINE UV* -S_get_invlist_len_addr(pTHX_ SV* invlist) -{ - /* Return the address of the UV that contains the current number - * of used elements in the inversion list */ - - PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR; - - return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); -} - -PERL_STATIC_INLINE UV -S_invlist_len(pTHX_ SV* const invlist) -{ - /* Returns the current number of elements stored in the inversion list's - * array */ - - PERL_ARGS_ASSERT_INVLIST_LEN; - - return *get_invlist_len_addr(invlist); -} - PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len) { @@ -7096,7 +7085,7 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len) PERL_ARGS_ASSERT_INVLIST_SET_LEN; - *get_invlist_len_addr(invlist) = len; + *_get_invlist_len_addr(invlist) = len; assert(len <= SvLEN(invlist)); @@ -7116,6 +7105,39 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len) * Note that when inverting, SvCUR shouldn't change */ } +PERL_STATIC_INLINE IV* +S_get_invlist_previous_index_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that is reserved to hold the cached index + * */ + + PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR; + + return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE IV +S_invlist_previous_index(pTHX_ SV* const invlist) +{ + /* Returns cached index of previous search */ + + PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX; + + return *get_invlist_previous_index_addr(invlist); +} + +PERL_STATIC_INLINE void +S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index) +{ + /* Caches for later retrieval */ + + PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX; + + assert(index == 0 || index < (int) _invlist_len(invlist)); + + *get_invlist_previous_index_addr(invlist) = index; +} + PERL_STATIC_INLINE UV S_invlist_max(pTHX_ SV* const invlist) { @@ -7166,8 +7188,9 @@ Perl__new_invlist(pTHX_ IV initial_size) * properly */ *get_invlist_zero_addr(new_list) = UV_MAX; + *get_invlist_previous_index_addr(new_list) = 0; *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; -#if HEADER_LENGTH != 4 +#if HEADER_LENGTH != 5 # error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length #endif @@ -7190,7 +7213,7 @@ S__new_invlist_C_array(pTHX_ UV* list) SvPV_set(invlist, (char *) list); SvLEN_set(invlist, 0); /* Means we own the contents, and the system shouldn't touch it */ - SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist))); if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); @@ -7220,11 +7243,6 @@ S_invlist_trim(pTHX_ SV* const invlist) SvPV_shrink_to_cur((SV *) invlist); } -/* An element is in an inversion list iff its index is even numbered: 0, 2, 4, - * etc */ -#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) -#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) - #define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) STATIC void @@ -7236,7 +7254,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end UV* array; UV max = invlist_max(invlist); - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; @@ -7308,8 +7326,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end #ifndef PERL_IN_XSUB_RE -STATIC IV -S_invlist_search(pTHX_ SV* const invlist, const UV cp) +IV +Perl__invlist_search(pTHX_ SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code * point . If is not in the list, -1 is returned. Otherwise, the @@ -7317,23 +7335,68 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) * contains */ IV low = 0; - IV high = invlist_len(invlist); - const UV * const array = invlist_array(invlist); + IV mid; + IV high = _invlist_len(invlist); + const IV highest_element = high - 1; + const UV* array; - PERL_ARGS_ASSERT_INVLIST_SEARCH; + PERL_ARGS_ASSERT__INVLIST_SEARCH; - /* If list is empty or the code point is before the first element, return - * failure. */ - if (high == 0 || cp < array[0]) { + /* If list is empty, return failure. */ + if (high == 0) { return -1; } + /* If the code point is before the first element, return failure. (We + * can't combine this with the test above, because we can't get the array + * unless we know the list is non-empty) */ + array = invlist_array(invlist); + + mid = invlist_previous_index(invlist); + assert(mid >=0 && mid <= highest_element); + + /* contains the cache of the result of the previous call to this + * function (0 the first time). See if this call is for the same result, + * or if it is for mid-1. This is under the theory that calls to this + * function will often be for related code points that are near each other. + * And benchmarks show that caching gives better results. We also test + * here if the code point is within the bounds of the list. These tests + * replace others that would have had to be made anyway to make sure that + * the array bounds were not exceeded, and give us extra information at the + * same time */ + if (cp >= array[mid]) { + if (cp >= array[highest_element]) { + return highest_element; + } + + /* Here, array[mid] <= cp < array[highest_element]. This means that + * the final element is not the answer, so can exclude it; it also + * means that is not the final element, so can refer to 'mid + 1' + * safely */ + if (cp < array[mid + 1]) { + return mid; + } + high--; + low = mid + 1; + } + else { /* cp < aray[mid] */ + if (cp < array[0]) { /* Fail if outside the array */ + return -1; + } + high = mid; + if (cp >= array[mid - 1]) { + goto found_entry; + } + } + /* Binary search. What we are looking for is such that * array[i] <= cp < array[i+1] - * The loop below converges on the i+1. */ + * The loop below converges on the i+1. Note that there may not be an + * (i+1)th element in the array, and things work nonetheless */ while (low < high) { - IV mid = (low + high) / 2; - if (array[mid] <= cp) { + mid = (low + high) / 2; + assert(mid <= highest_element); + if (array[mid] <= cp) { /* cp >= array[mid] */ low = mid + 1; /* We could do this extra test to exit the loop early. @@ -7347,7 +7410,10 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) } } - return high - 1; + found_entry: + high--; + invlist_set_previous_index(invlist, high); + return high; } void @@ -7361,7 +7427,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV * that is all 0's on input */ UV current = start; - const IV len = invlist_len(invlist); + const IV len = _invlist_len(invlist); IV i; const UV * array; @@ -7374,7 +7440,7 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV array = invlist_array(invlist); /* Find which element it is */ - i = invlist_search(invlist, start); + i = _invlist_search(invlist, start); /* We populate from to */ while (current < end) { @@ -7393,7 +7459,15 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV current = array[i]; if (current >= end) { /* Finished if beyond the end of what we are populating */ - return; + if (LIKELY(end < UV_MAX)) { + return; + } + + /* We get here when the upper bound is the maximum + * representable on the machine, and we are looking for just + * that code point. Have to special case it */ + i = len; + goto join_end_of_list; } } assert(current >= start); @@ -7410,6 +7484,8 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV swatch[offset >> 3] |= 1 << (offset & 7); } + join_end_of_list: + /* Quit if at the end of the list */ if (i >= len) { @@ -7433,7 +7509,6 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } - void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { @@ -7482,7 +7557,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co assert(a != b); /* If either one is empty, the union is the other one */ - if (a == NULL || ((len_a = invlist_len(a)) == 0)) { + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { if (*output == a) { if (a != NULL) { SvREFCNT_dec(a); @@ -7496,7 +7571,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co } /* else *output already = b; */ return; } - else if ((len_b = invlist_len(b)) == 0) { + else if ((len_b = _invlist_len(b)) == 0) { if (*output == b) { SvREFCNT_dec(b); } @@ -7637,7 +7712,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool co /* Set result to final length, which can change the pointer to array_u, so * re-find it */ - if (len_u != invlist_len(u)) { + if (len_u != _invlist_len(u)) { invlist_set_len(u, len_u); invlist_trim(u); array_u = invlist_array(u); @@ -7716,8 +7791,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, assert(a != b); /* Special case if either one is empty */ - len_a = invlist_len(a); - if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { + len_a = _invlist_len(a); + if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { if (len_a != 0 && complement_b) { @@ -7863,7 +7938,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Set result to final length, which can change the pointer to array_r, so * re-find it */ - if (len_r != invlist_len(r)) { + if (len_r != _invlist_len(r)) { invlist_set_len(r, len_r); invlist_trim(r); array_r = invlist_array(r); @@ -7911,13 +7986,13 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) len = 0; } else { - len = invlist_len(invlist); + len = _invlist_len(invlist); } /* If comes after the final entry, can just append it to the end */ if (len == 0 || start >= invlist_array(invlist) - [invlist_len(invlist) - 1]) + [_invlist_len(invlist) - 1]) { _append_range_to_invlist(invlist, start, end); return invlist; @@ -7951,7 +8026,7 @@ Perl__invlist_invert(pTHX_ SV* const invlist) * have a zero; removes it otherwise. As described above, the data * structure is set up so that this is very efficient */ - UV* len_pos = get_invlist_len_addr(invlist); + UV* len_pos = _get_invlist_len_addr(invlist); PERL_ARGS_ASSERT__INVLIST_INVERT; @@ -7988,7 +8063,7 @@ Perl__invlist_invert_prop(pTHX_ SV* const invlist) _invlist_invert(invlist); - len = invlist_len(invlist); + len = _invlist_len(invlist); if (len != 0) { /* If empty do nothing */ array = invlist_array(invlist); @@ -8020,7 +8095,7 @@ S_invlist_clone(pTHX_ SV* const invlist) /* Need to allocate extra space to accommodate Perl's addition of a * trailing NUL to SvPV's, since it thinks they are always strings */ - SV* new_invlist = _new_invlist(invlist_len(invlist) + 1); + SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1); STRLEN length = SvCUR(invlist); PERL_ARGS_ASSERT_INVLIST_CLONE; @@ -8071,7 +8146,7 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) * will start over at the beginning of the list */ UV* pos = get_invlist_iter_addr(invlist); - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_ITERNEXT; @@ -8103,7 +8178,7 @@ S_invlist_highest(pTHX_ SV* const invlist) * 0, or if the list is empty. If this distinction matters to you, check * for emptiness before calling this function */ - UV len = invlist_len(invlist); + UV len = _invlist_len(invlist); UV *array; PERL_ARGS_ASSERT_INVLIST_HIGHEST; @@ -8180,6 +8255,77 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) } #endif +#if 0 +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + UV* array_a = invlist_array(a); + UV* array_b = invlist_array(b); + UV len_a = _invlist_len(a); + UV len_b = _invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later, and clear the flag as we don't have to do anything + * else later */ + + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + if (complement_b) { + array_b[0] = 1; + } + return retval; +} +#endif + #undef HEADER_LENGTH #undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE @@ -8212,11 +8358,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { dVAR; - register regnode *ret; /* Will be the head of the group. */ - register regnode *br; - register regnode *lastbr; - register regnode *ender = NULL; - register I32 parno = 0; + regnode *ret; /* Will be the head of the group. */ + regnode *br; + regnode *lastbr; + regnode *ender = NULL; + I32 parno = 0; I32 flags; U32 oregflags = RExC_flags; bool have_branch = 0; @@ -8387,7 +8533,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -9222,9 +9368,9 @@ STATIC regnode * S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) { dVAR; - register regnode *ret; - register regnode *chain = NULL; - register regnode *latest; + regnode *ret; + regnode *chain = NULL; + regnode *latest; I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; @@ -9295,9 +9441,9 @@ STATIC regnode * S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret; - register char op; - register char *next; + regnode *ret; + char op; + char *next; I32 flags; const char * const origparse = RExC_parse; I32 min; @@ -9306,6 +9452,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char *parse_start; #endif const char *maxpos = NULL; + + /* Save the original in case we change the emitted regop to a FAIL. */ + regnode * const orig_emit = RExC_emit; + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGPIECE; @@ -9352,6 +9502,23 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); RExC_parse = next; nextchar(pRExC_state); + if (max < min) { /* If can't match, warn and optimize to fail + unconditionally */ + if (SIZE_ONLY) { + ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); + + /* We can't back off the size because we have to reserve + * enough space for all the things we are about to throw + * away, but we can shrink it by the ammount we are about + * to re-use here */ + RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; + } + else { + RExC_emit = orig_emit; + } + ret = reg_node(pRExC_state, OPFAIL); + return ret; + } do_curly: if ((flags&SIMPLE)) { @@ -9389,8 +9556,6 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = WORST; if (max > 0) *flagp |= HASWIDTH; - if (max < min) - vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, (U16)min); ARG2_SET(ret, (U16)max); @@ -9486,86 +9651,95 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) return(ret); } - -/* reg_namedseq(pRExC_state,UVp, UV depth) +STATIC bool +S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class) +{ - This is expected to be called by a parser routine that has - recognized '\N' and needs to handle the rest. RExC_parse is - expected to point at the first char following the N at the time - of the call. + /* This is expected to be called by a parser routine that has recognized '\N' + and needs to handle the rest. RExC_parse is expected to point at the first + char following the N at the time of the call. On successful return, + RExC_parse has been updated to point to just after the sequence identified + by this routine, and <*flagp> has been updated. - The \N may be inside (indicated by valuep not being NULL) or outside a + The \N may be inside (indicated by the boolean ) or outside a character class. \N may begin either a named sequence, or if outside a character class, mean to match a non-newline. For non single-quoted regexes, the tokenizer has - attempted to decide which, and in the case of a named sequence converted it + attempted to decide which, and in the case of a named sequence, converted it into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...}, where c1... are the characters in the sequence. For single-quoted regexes, the tokenizer passes the \N sequence through unchanged; this code will not - attempt to determine this nor expand those. The net effect is that if the - beginning of the passed-in pattern isn't '{U+' or there is no '}', it - signals that this \N occurrence means to match a non-newline. - + attempt to determine this nor expand those, instead raising a syntax error. + The net effect is that if the beginning of the passed-in pattern isn't '{U+' + or there is no '}', it signals that this \N occurrence means to match a + non-newline. + Only the \N{U+...} form should occur in a character class, for the same reason that '.' inside a character class means to just match a period: it just doesn't make sense. - - If valuep is non-null then it is assumed that we are parsing inside - of a charclass definition and the first codepoint in the resolved - string is returned via *valuep and the routine will return NULL. - In this mode if a multichar string is returned from the charnames - handler, a warning will be issued, and only the first char in the - sequence will be examined. If the string returned is zero length - then the value of *valuep is undefined and NON-NULL will - be returned to indicate failure. (This will NOT be a valid pointer - to a regnode.) - - If valuep is null then it is assumed that we are parsing normal text and a - new EXACT node is inserted into the program containing the resolved string, - and a pointer to the new node is returned. But if the string is zero length - a NOTHING node is emitted instead. - On success RExC_parse is set to the char following the endbrace. - Parsing failures will generate a fatal error via vFAIL(...) + The function raises an error (via vFAIL), and doesn't return for various + syntax errors. Otherwise it returns TRUE and sets or on + success; it returns FALSE otherwise. + + If is non-null, it means the caller can accept an input sequence + consisting of a just a single code point; <*valuep> is set to that value + if the input is such. + + If is non-null it signifies that the caller can accept any other + legal sequence (i.e., one that isn't just a single code point). <*node_p> + is set as follows: + 1) \N means not-a-NL: points to a newly created REG_ANY node; + 2) \N{}: points to a new NOTHING node; + 3) otherwise: points to a new EXACT node containing the resolved + string. + Note that FALSE is returned for single code point sequences if is + null. */ -STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) -{ + char * endbrace; /* '}' following the name */ - regnode *ret = NULL; char* p; + char *endchar; /* Points to '.' or '}' ending cur char in the input + stream */ + bool has_multiple_chars; /* true if the input stream contains a sequence of + more than one character */ GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_REG_NAMEDSEQ; + PERL_ARGS_ASSERT_GROK_BSLASH_N; GET_RE_DEBUG_FLAGS; + assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */ + /* The [^\n] meaning of \N ignores spaces and comments under the /x * modifier. The other meaning does not */ p = (RExC_flags & RXf_PMf_EXTENDED) ? regwhite( pRExC_state, RExC_parse ) : RExC_parse; - + /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The former is assumed when it can't be the latter. */ if (*p != '{' || regcurly(p)) { RExC_parse = p; - if (valuep) { + if (! node_p) { /* no bare \N in a charclass */ - vFAIL("\\N in a character class must be a named character: \\N{...}"); - } + if (in_char_class) { + vFAIL("\\N in a character class must be a named character: \\N{...}"); + } + return FALSE; + } nextchar(pRExC_state); - ret = reg_node(pRExC_state, REG_ANY); + *node_p = reg_node(pRExC_state, REG_ANY); *flagp |= HASWIDTH|SIMPLE; RExC_naughty++; RExC_parse--; - Set_Node_Length(ret, 1); /* MJD */ - return ret; + Set_Node_Length(*node_p, 1); /* MJD */ + return TRUE; } - /* Here, we have decided it should be a named sequence */ + /* Here, we have decided it should be a named character or sequence */ /* The test above made sure that the next real character is a '{', but * under the /x modifier, it could be separated by space (or a comment and @@ -9587,44 +9761,48 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept } if (endbrace == RExC_parse) { /* empty: \N{} */ - if (! valuep) { - RExC_parse = endbrace + 1; - return reg_node(pRExC_state,NOTHING); - } - - if (SIZE_ONLY) { - ckWARNreg(RExC_parse, - "Ignoring zero length \\N{} in character class" - ); - RExC_parse = endbrace + 1; + bool ret = TRUE; + if (node_p) { + *node_p = reg_node(pRExC_state,NOTHING); + } + else if (in_char_class) { + if (SIZE_ONLY && in_char_class) { + ckWARNreg(RExC_parse, + "Ignoring zero length \\N{} in character class" + ); + } + ret = FALSE; } - *valuep = 0; - return (regnode *) &RExC_parse; /* Invalid regnode pointer */ + else { + return FALSE; + } + nextchar(pRExC_state); + return ret; } - REQUIRE_UTF8; /* named sequences imply Unicode semantics */ + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ - if (valuep) { /* In a bracketed char class */ - /* We only pay attention to the first char of - multichar strings being returned. I kinda wonder + endchar = RExC_parse + strcspn(RExC_parse, ".}"); + + /* Code points are separated by dots. If none, there is only one code + * point, and is terminated by the brace */ + has_multiple_chars = (endchar < endbrace); + + if (valuep && (! has_multiple_chars || in_char_class)) { + /* We only pay attention to the first char of + multichar strings being returned in char classes. I kinda wonder if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. XXX Solution is to recharacterize as [rest-of-class]|multi1|multi2... */ - STRLEN length_of_hex; - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES + STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); + I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - - char * endchar = RExC_parse + strcspn(RExC_parse, ".}"); - if (endchar < endbrace) { - ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); - } - length_of_hex = (STRLEN)(endchar - RExC_parse); - *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL); + *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL); /* The tokenizer should have guaranteed validity, but it's possible to * bypass it by using single quoting, so check */ @@ -9636,16 +9814,26 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept ? UTF8SKIP(RExC_parse) : 1; /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) RExC_parse = endchar; + if (RExC_parse >= endchar) { + RExC_parse = endchar; + } vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } + } - RExC_parse = endbrace + 1; - if (endchar == endbrace) return NULL; + if (in_char_class && has_multiple_chars) { + ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class"); + } + RExC_parse = endbrace + 1; + } + else if (! node_p || ! has_multiple_chars) { - ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ + /* Here, the input is legal, but not according to the caller's + * options. We fail without advancing the parse, so that the + * caller can try again */ + RExC_parse = p; + return FALSE; } - else { /* Not a char class */ + else { /* What is done here is to convert this to a sub-pattern of the form * (?:\x{char1}\x{char2}...) @@ -9658,16 +9846,11 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); STRLEN len; - char *endchar; /* Points to '.' or '}' ending cur char in the input - stream */ char *orig_end = RExC_end; + I32 flags; while (RExC_parse < endbrace) { - /* Code points are separated by dots. If none, there is only one - * code point, and is terminated by the brace */ - endchar = RExC_parse + strcspn(RExC_parse, ".}"); - /* Convert to notation the rest of the code understands */ sv_catpv(substitute_parse, "\\x{"); sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); @@ -9675,6 +9858,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; + endchar = RExC_parse + strcspn(RExC_parse, ".}"); } sv_catpv(substitute_parse, ")"); @@ -9689,16 +9873,17 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 dept /* The values are Unicode, and therefore not subject to recoding */ RExC_override_recoding = 1; - ret = reg(pRExC_state, 1, flagp, depth+1); + *node_p = reg(pRExC_state, 1, &flags, depth+1); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); RExC_parse = endbrace; RExC_end = orig_end; RExC_override_recoding = 0; - nextchar(pRExC_state); + nextchar(pRExC_state); } - return ret; + return TRUE; } @@ -9735,6 +9920,93 @@ S_reg_recode(pTHX_ const char value, SV **encp) return uv; } +PERL_STATIC_INLINE U8 +S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point) +{ + /* This knows the details about sizing an EXACTish node, setting flags for + * it (by setting <*flagp>, and potentially populating it with a single + * character. + * + * If is non-zero, this function assumes that the node has already + * been populated, and just does the sizing. In this case + * should be the final code point that has already been placed into the + * node. This value will be ignored except that under some circumstances + * <*flagp> is set based on it. + * + * If and calculates what + * should be. In pass 1, it sizes the node appropriately. In pass 2, it + * additionally will populate the node's STRING with , if + * is 0. In both cases <*flagp> is appropriately set + * + * It knows that under FOLD, UTF characters and the Latin Sharp S must be + * folded (the latter only when the rules indicate it can match 'ss') */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + if (! len_passed_in) { + if (UTF) { + if (FOLD) { + to_uni_fold(NATIVE_TO_UNI(code_point), character, &len); + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } + else if (! FOLD + || code_point != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED + || ! AT_LEAST_UNI_SEMANTICS) + { + *character = (U8) code_point; + len = 1; + } + else { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } + + *flagp |= HASWIDTH; + if (len == 1 && UNI_IS_INVARIANT(code_point)) + *flagp |= SIMPLE; +} /* - regatom - the lowest level @@ -9800,7 +10072,7 @@ STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register regnode *ret = NULL; + regnode *ret = NULL; I32 flags; char *parse_start = RExC_parse; U8 op; @@ -9848,13 +10120,12 @@ tryagain: case '[': { char * const oregcomp_parse = ++RExC_parse; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; vFAIL("Unmatched ["); } nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */ break; } @@ -10065,7 +10336,7 @@ tryagain: } RExC_parse--; - ret = regclass(pRExC_state,depth+1); + ret = regclass(pRExC_state, flagp,depth+1); RExC_end = oldregxend; RExC_parse--; @@ -10073,16 +10344,24 @@ tryagain: Set_Node_Offset(ret, parse_start + 2); Set_Node_Cur_Length(ret); nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; } break; case 'N': - /* Handle \N and \N{NAME} here and not below because it can be - multicharacter. join_exact() will join them up later on. - Also this makes sure that things like /\N{BLAH}+/ and - \N{BLAH} being multi char Just Happen. dmq*/ + /* Handle \N and \N{NAME} with multiple code points here and not + * below because it can be multicharacter. join_exact() will join + * them up later on. Also this makes sure that things like + * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq. + * The options to the grok function call causes it to fail if the + * sequence is just a single code point. We then go treat it as + * just another character in the current EXACT node, and hence it + * gets uniform treatment with all the other characters. The + * special treatment for quantifiers is not needed for such single + * character sequences */ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL, flagp, depth); + if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) { + RExC_parse--; + goto defchar; + } break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -10112,7 +10391,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -10183,7 +10462,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? REFFA : (AT_LEAST_UNI_SEMANTICS) ? REFFU @@ -10227,33 +10506,30 @@ tryagain: RExC_parse++; defchar: { - register STRLEN len; - register UV ender; - register char *p; + STRLEN len = 0; + UV ender; + char *p; char *s; +#define MAX_NODE_STRING_SIZE 127 + char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE]; + char *s0; + U8 upper_parse = MAX_NODE_STRING_SIZE; STRLEN foldlen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; U8 node_type; - - /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, - * it is folded to 'ss' even if not utf8 */ - bool is_exactfu_sharp_s; + bool next_is_quantifier; + char * oldp = NULL; ender = 0; - if (! FOLD) { - node_type = EXACT; - } - else { - node_type = get_regex_charset(RExC_flags); - if (node_type >= REGEX_ASCII_RESTRICTED_CHARSET) { - node_type--; /* /a is same as /u, and map /aa's offset to - what /a's would have been, so there is no - hole */ - } - node_type += EXACTF; - } + node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); - s = STRING(ret); + + /* In pass1, folded, we use a temporary buffer instead of the + * actual node, as the node doesn't exist yet */ + s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret); + + s0 = s; + + reparse: /* XXX The node can hold up to 255 bytes, yet this only goes to * 127. I (khw) do not know why. Keeping it somewhat less than @@ -10273,11 +10549,11 @@ tryagain: * could back off to end with only a code point that isn't such a * non-final, but it is possible for there not to be any in the * entire node. */ - for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; + for (p = RExC_parse - 1; + len < upper_parse && p < RExC_end; len++) { - char * const oldp = p; + oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); @@ -10313,7 +10589,6 @@ tryagain: case 'g': case 'G': /* generic-backref, pos assertion */ case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ - case 'N': /* named char sequence */ case 'p': case 'P': /* Unicode property */ case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ @@ -10331,6 +10606,22 @@ tryagain: ender = '\n'; p++; break; + case 'N': /* Handle a single-code point named character. */ + /* The options cause it to fail if a multiple code + * point sequence. Handle those in the switch() above + * */ + RExC_parse = p + 1; + if (! grok_bslash_N(pRExC_state, NULL, &ender, + flagp, depth, FALSE)) + { + RExC_parse = p = oldp; + goto loopdone; + } + p = RExC_parse; + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; case 'r': ender = '\r'; p++; @@ -10476,150 +10767,270 @@ tryagain: break; } /* End of switch on the literal */ - is_exactfu_sharp_s = (node_type == EXACTFU - && ender == LATIN_SMALL_LETTER_SHARP_S); + /* Here, have looked at the literal character and + * contains its ordinal,

points to the character after it + */ + if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if ((UTF && FOLD) || is_exactfu_sharp_s) { - /* Prime the casefolded buffer. Locale rules, which apply - * only to code points < 256, aren't known until execution, - * so for them, just output the original character using - * utf8. If we start to fold non-UTF patterns, be sure to - * update join_exact() */ - if (LOC && ender < 256) { - if (UNI_IS_INVARIANT(ender)) { - *tmpbuf = (U8) ender; - foldlen = 1; - } else { - *tmpbuf = UTF8_TWO_BYTE_HI(ender); - *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender); - foldlen = 2; - } - } - else if (isASCII(ender)) { /* Note: Here can't also be LOC - */ - ender = toLOWER(ender); - *tmpbuf = (U8) ender; - foldlen = 1; - } - else if (! MORE_ASCII_RESTRICTED && ! LOC) { - /* Locale and /aa require more selectivity about the - * fold, so are handled below. Otherwise, here, just - * use the fold */ - ender = toFOLD_uni(ender, tmpbuf, &foldlen); - } - else { - /* Under locale rules or /aa we are not to mix, - * respectively, ords < 256 or ASCII with non-. So - * reject folds that mix them, using only the - * non-folded code point. So do the fold to a - * temporary, and inspect each character in it. */ - U8 trialbuf[UTF8_MAXBYTES_CASE+1]; - U8* s = trialbuf; - UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); - U8* e = s + foldlen; - bool fold_ok = TRUE; - - while (s < e) { - if (isASCII(*s) - || (LOC && (UTF8_IS_INVARIANT(*s) - || UTF8_IS_DOWNGRADEABLE_START(*s)))) - { - fold_ok = FALSE; - break; - } - s += UTF8SKIP(s); - } - if (fold_ok) { - Copy(trialbuf, tmpbuf, foldlen, U8); - ender = tmpender; - } - else { - uvuni_to_utf8(tmpbuf, ender); - foldlen = UNISKIP(ender); - } - } - } - if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ - if (len) - p = oldp; - else if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - - /* tmpbuf has been constructed by us, so we - * know it is valid utf8 */ - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } - else { - len++; - REGC((char)ender, s++); - } - break; + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + + if (FOLD) { + if (UTF + /* See comments for join_exact() as to why we fold + * this non-UTF at compile time */ + || (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S)) + { + + + /* Prime the casefolded buffer. Locale rules, which + * apply only to code points < 256, aren't known until + * execution, so for them, just output the original + * character using utf8. If we start to fold non-UTF + * patterns, be sure to update join_exact() */ + if (LOC && ender < 256) { + if (UNI_IS_INVARIANT(ender)) { + *s = (U8) ender; + foldlen = 1; + } else { + *s = UTF8_TWO_BYTE_HI(ender); + *(s + 1) = UTF8_TWO_BYTE_LO(ender); + foldlen = 2; + } + } + else { + ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); + } + s += foldlen; + + /* The loop increments each time, as all but this + * path (and the one just below for UTF) through it add + * a single byte to the EXACTish node. But this one + * has changed len to be the correct final value, so + * subtract one to cancel out the increment that + * follows */ + len += foldlen - 1; + } + else { + *(s++) = ender; + } } - if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - len += unilen; - s += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - len--; + else if (UTF) { + const STRLEN unilen = reguni(pRExC_state, ender, s); + if (unilen > 0) { + s += unilen; + len += unilen; + } + + /* See comment just above for - 1 */ + len--; } else { REGC((char)ender, s++); + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; } - } + + } /* End of loop through literal characters */ + + /* Here we have either exhausted the input or ran out of room in + * the node. (If we encountered a character that can't be in the + * node, transfer is made directly to , and so we + * wouldn't have fallen off the end of the loop.) In the latter + * case, we artificially have to split the node into two, because + * we just don't have enough space to hold everything. This + * creates a problem if the final character participates in a + * multi-character fold in the non-final position, as a match that + * should have occurred won't, due to the way nodes are matched, + * and our artificial boundary. So back off until we find a non- + * problematic character -- one that isn't at the beginning or + * middle of such a fold. (Either it doesn't participate in any + * folds, or appears only in the final position of all the folds it + * does participate in.) A better solution with far fewer false + * positives, and that would fill the nodes more completely, would + * be to actually have available all the multi-character folds to + * test against, and to back-off only far enough to be sure that + * this node isn't ending with a partial one. is set + * further below (if we need to reparse the node) to include just + * up through that final non-problematic character that this code + * identifies, so when it is set to less than the full node, we can + * skip the rest of this */ + if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + + const STRLEN full_len = len; + + assert(len >= MAX_NODE_STRING_SIZE); + + /* Here, points to the final byte of the final character. + * Look backwards through the string until find a non- + * problematic character */ + + if (! UTF) { + + /* These two have no multi-char folds to non-UTF characters + */ + if (ASCII_FOLD_RESTRICTED || LOC) { + goto loopdone; + } + + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + len = s - s0 + 1; + } + else { + if (! PL_NonL1NonFinalFold) { + PL_NonL1NonFinalFold = _new_invlist_C_array( + NonL1_Perl_Non_Final_Folds_invlist); + } + + /* Point to the first byte of the final character */ + s = (char *) utf8_hop((U8 *) s, -1); + + while (s >= s0) { /* Search backwards until find + non-problematic char */ + if (UTF8_IS_INVARIANT(*s)) { + + /* There are no ascii characters that participate + * in multi-char folds under /aa. In EBCDIC, the + * non-ascii invariants are all control characters, + * so don't ever participate in any folds. */ + if (ASCII_FOLD_RESTRICTED + || ! IS_NON_FINAL_FOLD(*s)) + { + break; + } + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* No Latin1 characters participate in multi-char + * folds under /l */ + if (LOC + || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI( + *s, *(s+1)))) + { + break; + } + } + else if (! _invlist_contains_cp( + PL_NonL1NonFinalFold, + valid_utf8_to_uvchr((U8 *) s, NULL))) + { + break; + } + + /* Here, the current character is problematic in that + * it does occur in the non-final position of some + * fold, so try the character before it, but have to + * special case the very first byte in the string, so + * we don't read outside the string */ + s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + } /* End of loop backwards through the string */ + + /* If there were only problematic characters in the string, + * will point to before s0, in which case the length + * should be 0, otherwise include the length of the + * non-problematic character just found */ + len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s); + } + + /* Here, have found the final character, if any, that is + * non-problematic as far as ending the node without splitting + * it across a potential multi-char fold. contains the + * number of bytes in the node up-to and including that + * character, or is 0 if there is no such character, meaning + * the whole node contains only problematic characters. In + * this case, give up and just take the node as-is. We can't + * do any better */ + if (len == 0) { + len = full_len; + } else { + + /* Here, the node does contain some characters that aren't + * problematic. If one such is the final character in the + * node, we are done */ + if (len == full_len) { + goto loopdone; + } + else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + + /* If the final character is problematic, but the + * penultimate is not, back-off that last character to + * later start a new node with it */ + p = oldp; + goto loopdone; + } + + /* Here, the final non-problematic character is earlier + * in the input than the penultimate character. What we do + * is reparse from the beginning, going up only as far as + * this final ok one, thus guaranteeing that the node ends + * in an acceptable character. The reason we reparse is + * that we know how far in the character is, but we don't + * know how to correlate its position with the input parse. + * An alternate implementation would be to build that + * correlation as we go along during the original parse, + * but that would entail extra work for every node, whereas + * this code gets executed only when the string is too + * large for the node, and the final two characters are + * problematic, an infrequent occurrence. Yet another + * possible strategy would be to save the tail of the + * string, and the next time regatom is called, initialize + * with that. The problem with this is that unless you + * back off one more character, you won't be guaranteed + * regatom will get called again, unless regbranch, + * regpiece ... are also changed. If you do back off that + * extra character, so that there is input guaranteed to + * force calling regatom, you can't handle the case where + * just the first character in the node is acceptable. I + * (khw) decided to try this method which doesn't have that + * pitfall; if performance issues are found, we can do a + * combination of the current approach plus that one */ + upper_parse = len; + len = 0; + s = s0; + goto reparse; + } + } /* End of verifying node ends with an appropriate char */ + loopdone: /* Jumped to when encounters something that shouldn't be in the node */ + + /* I (khw) don't know if you can get here with zero length, but the + * old code handled this situation by creating a zero-length EXACT + * node. Might as well be NOTHING instead */ + if (len == 0) { + OP(ret) = NOTHING; + } + else{ + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender); + } + RExC_parse = p - 1; Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); @@ -10629,20 +11040,10 @@ tryagain: if (iv < 0) vFAIL("Internal disaster"); } - if (len > 0) - *flagp |= HASWIDTH; - if (len == 1 && UNI_IS_INVARIANT(ender)) - *flagp |= SIMPLE; - if (SIZE_ONLY) - RExC_size += STR_SZ(len); - else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } - } + } /* End of label 'defchar:' */ break; - } + } /* End of giant switch on input character */ return(ret); } @@ -10719,7 +11120,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; + namedclass = ANYOF_WORDCHAR; break; case 5: /* Names all of length 5. */ @@ -10729,57 +11130,63 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (posixcc[4]) { case 'a': if (memEQ(posixcc, "alph", 4)) /* alpha */ - namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; + namedclass = ANYOF_ALPHA; break; case 'e': if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + namedclass = ANYOF_PSXSPC; break; case 'h': if (memEQ(posixcc, "grap", 4)) /* graph */ - namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; + namedclass = ANYOF_GRAPH; break; case 'i': if (memEQ(posixcc, "asci", 4)) /* ascii */ - namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; + namedclass = ANYOF_ASCII; break; case 'k': if (memEQ(posixcc, "blan", 4)) /* blank */ - namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; + namedclass = ANYOF_BLANK; break; case 'l': if (memEQ(posixcc, "cntr", 4)) /* cntrl */ - namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; + namedclass = ANYOF_CNTRL; break; case 'm': if (memEQ(posixcc, "alnu", 4)) /* alnum */ - namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; + namedclass = ANYOF_ALNUMC; break; case 'r': if (memEQ(posixcc, "lowe", 4)) /* lower */ - namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; + namedclass = ANYOF_LOWER; else if (memEQ(posixcc, "uppe", 4)) /* upper */ - namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; + namedclass = ANYOF_UPPER; break; case 't': if (memEQ(posixcc, "digi", 4)) /* digit */ - namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; + namedclass = ANYOF_DIGIT; else if (memEQ(posixcc, "prin", 4)) /* print */ - namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; + namedclass = ANYOF_PRINT; else if (memEQ(posixcc, "punc", 4)) /* punct */ - namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; + namedclass = ANYOF_PUNCT; break; } break; case 6: if (memEQ(posixcc, "xdigit", 6)) - namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; + namedclass = ANYOF_XDIGIT; break; } if (namedclass == OOB_NAMEDCLASS) Simple_vFAIL3("POSIX class [:%.*s:] unknown", t - s - 1, s + 1); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } assert (posixcc[skip] == ':'); assert (posixcc[skip+1] == ']'); } else if (!SIZE_ONLY) { @@ -10910,7 +11317,7 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) * determined at run-time * run_time_list is a SV* that contains text names of properties that are to * be computed at run time. This concatenates - * to it, apppropriately + * to it, appropriately * This is essentially DO_POSIX, but we know only the Latin1 values at compile * time */ #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ @@ -10983,6 +11390,15 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) return; } +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((class) / 2) + /* parse a class specification and produce either an ANYOF node that matches the pattern or perhaps will be optimized into an EXACTish node @@ -10991,14 +11407,14 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) above 255, a range list is used */ STATIC regnode * -S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) +S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { dVAR; - register UV nextvalue; - register IV prevvalue = OOB_UNICODE; - register IV range = 0; - UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ - register regnode *ret; + UV nextvalue; + UV prevvalue = OOB_UNICODE; + IV range = 0; + UV value = 0; + regnode *ret; STRLEN numlen; IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; @@ -11014,13 +11430,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) Optimizations may be possible if this is tiny */ UV n; - /* Certain named classes have equivalents that can appear outside a - * character class, e.g. \w. These flags are set for these classes. The - * first flag indicates the op depends on the character set modifier, like - * /d, /u.... The second is for those that don't have this dependency. */ - bool has_special_charset_op = FALSE; - bool has_special_non_charset_op = FALSE; - /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the * character class, an optimization is to pass it directly on to the @@ -11049,7 +11458,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; #endif - UV stored = 0; /* how many chars stored in the bitmap */ bool invert = FALSE; /* Is this class to be complemented */ /* Is there any thing like \W or [:^digit:] that matches above the legal @@ -11059,6 +11467,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; + const I32 orig_size = RExC_size; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -11101,7 +11510,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; } - ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); initial_listsv_len = SvCUR(listsv); } @@ -11153,8 +11561,8 @@ parseit: * A similar issue a little bit later when switching on * namedclass. --jhi */ switch ((I32)value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; + case 'w': namedclass = ANYOF_WORDCHAR; break; + case 'W': namedclass = ANYOF_NWORDCHAR; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; @@ -11170,17 +11578,21 @@ parseit: if this makes sense as it does change the behaviour from earlier versions, OTOH that behaviour was broken as well. */ - UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v, NULL, depth)) { + if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth, + TRUE /* => charclass */)) + { goto parseit; } - value= v; } break; case 'p': case 'P': { char *e; + + /* This routine will handle any undefined properties */ + U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF; + if (RExC_parse >= RExC_end) vFAIL2("Empty \\%c{}", (U8)value); if (*RExC_parse == '{') { @@ -11201,7 +11613,6 @@ parseit: n = 1; } if (!SIZE_ONLY) { - SV** invlistsvp; SV* invlist; char* name; @@ -11236,18 +11647,10 @@ parseit: swash = _core_swash_init("utf8", name, &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ - TRUE, /* this routine will handle - undefined properties */ - NULL, FALSE /* No inversion list */ + NULL, /* No inversion list */ + &swash_init_flags ); - if ( ! swash - || ! SvROK(swash) - || ! SvTYPE(SvRV(swash)) == SVt_PVHV - || ! (invlistsvp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "INVLIST", FALSE)) - || ! (invlist = *invlistsvp)) - { + if (! swash || ! (invlist = _get_swash_invlist(swash))) { if (swash) { SvREFCNT_dec(swash); swash = NULL; @@ -11263,7 +11666,11 @@ parseit: /* We don't know yet, so have to assume that the * property could match something in the Latin1 range, - * hence something that isn't utf8 */ + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -11272,7 +11679,8 @@ parseit: * the swash is from a user-defined property, then this * whole character class should be regarded as such */ has_user_defined_property = - _is_swash_user_defined(swash); + (swash_init_flags + & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY); /* Invert if asking for the complement */ if (value == 'P') { @@ -11426,41 +11834,7 @@ parseit: element_count += 2; /* So counts for three values */ } - if (SIZE_ONLY) { - - /* In the first pass, do a little extra work so below can - * possibly optimize the whole node to one of the nodes that - * correspond to the classes given below */ - - /* The optimization will only take place if there is a single - * element in the class, so can skip if there is more than one - */ - if (element_count == 1) { - - /* Possible truncation here but in some 64-bit environments - * the compiler gets heartburn about switch on 64-bit values. - * A similar issue a little earlier when switching on value. - * --jhi */ - switch ((I32)namedclass) { - case ANYOF_ALNUM: - case ANYOF_NALNUM: - case ANYOF_DIGIT: - case ANYOF_NDIGIT: - case ANYOF_SPACE: - case ANYOF_NSPACE: - has_special_charset_op = TRUE; - break; - - case ANYOF_HORIZWS: - case ANYOF_NHORIZWS: - case ANYOF_VERTWS: - case ANYOF_NVERTWS: - has_special_non_charset_op = TRUE; - break; - } - } - } - else { + if (! SIZE_ONLY) { switch ((I32)namedclass) { case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ @@ -11482,32 +11856,83 @@ parseit: runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } - else { + else +#endif /* Not isascii(); just use the hard-coded definition for it */ _invlist_union(posixes, PL_ASCII, &posixes); - } break; case ANYOF_NASCII: +#ifdef HAS_ISASCII if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { +#endif _invlist_union_complement_2nd(posixes, PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } +#ifdef HAS_ISASCII } +#endif break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, posixes, + if (hasISBLANK || ! LOC) { + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale: We + use the ASCII range and the above-Latin1 range + code points */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 blanks */ + _invlist_intersection(PL_AboveLatin1, + PL_XPosixBlank, + &scratch_list); + /* Add it to the running total of posix classes */ + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + /* Add the ASCII-range blanks to the running total. */ + _invlist_union(posixes, PL_PosixBlank, &posixes); + } break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, posixes, - PL_PosixBlank, PL_XPosixBlank); + if (hasISBLANK || ! LOC) { + DO_N_POSIX(ret, namedclass, posixes, + PL_PosixBlank, PL_XPosixBlank); + } + else { /* There is no isblank() and we are in locale */ + SV* scratch_list = NULL; + + /* Include all above-Latin1 non-blanks */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list); + + /* Add them to the running total of posix classes */ + _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list); + if (! posixes) { + posixes = scratch_list; + } + else { + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } + + /* Get the list of all non-ASCII-blanks in Latin 1, and + * add them to the running total */ + _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list); + _invlist_union(posixes, scratch_list, &posixes); + SvREFCNT_dec(scratch_list); + } break; case ANYOF_CNTRL: DO_POSIX(ret, namedclass, posixes, @@ -11523,13 +11948,11 @@ parseit: * them */ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes, PL_PosixDigit, "XPosixDigit", listsv); - has_special_charset_op = TRUE; break; case ANYOF_NDIGIT: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv, runtime_posix_matches_above_Unicode); - has_special_charset_op = TRUE; break; case ANYOF_GRAPH: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, @@ -11547,12 +11970,10 @@ parseit: * cp_list is subject to folding. It turns out that \h * is just a synonym for XPosixBlank */ _invlist_union(cp_list, PL_XPosixBlank, &cp_list); - has_special_non_charset_op = TRUE; break; case ANYOF_NHORIZWS: _invlist_union_complement_2nd(cp_list, PL_XPosixBlank, &cp_list); - has_special_non_charset_op = TRUE; break; case ANYOF_LOWER: case ANYOF_NLOWER: @@ -11614,12 +12035,10 @@ parseit: case ANYOF_SPACE: DO_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); - has_special_charset_op = TRUE; break; case ANYOF_NSPACE: DO_N_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); - has_special_charset_op = TRUE; break; case ANYOF_UPPER: /* Same as LOWER, above */ case ANYOF_NUPPER: @@ -11649,16 +12068,14 @@ parseit: } break; } - case ANYOF_ALNUM: /* Really is 'Word' */ + case ANYOF_WORDCHAR: DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); - has_special_charset_op = TRUE; break; - case ANYOF_NALNUM: + case ANYOF_NWORDCHAR: DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, runtime_posix_matches_above_Unicode); - has_special_charset_op = TRUE; break; case ANYOF_VERTWS: /* For these, we use the cp_list, as /d doesn't make a @@ -11666,12 +12083,10 @@ parseit: * if these characters had folds other than themselves, as * cp_list is subject to folding */ _invlist_union(cp_list, PL_VertSpace, &cp_list); - has_special_non_charset_op = TRUE; break; case ANYOF_NVERTWS: _invlist_union_complement_2nd(cp_list, PL_VertSpace, &cp_list); - has_special_non_charset_op = TRUE; break; case ANYOF_XDIGIT: DO_POSIX(ret, namedclass, posixes, @@ -11689,19 +12104,19 @@ parseit: break; } - continue; + continue; /* Go get next character */ } } /* end of namedclass \blah */ if (range) { - if (prevvalue > (IV)value) /* b-a */ { + if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); range = 0; /* not a valid range */ } } else { - prevvalue = value; /* save the beginning of the range */ + prevvalue = value; /* save the beginning of the potential range */ if (RExC_parse+1 < RExC_end && *RExC_parse == '-' && RExC_parse[1] != ']') @@ -11718,21 +12133,26 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); } - if (!SIZE_ONLY) + if (!SIZE_ONLY) { cp_list = add_cp_to_invlist(cp_list, '-'); + } + element_count++; } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ } } + /* Here, is the beginning of the range, if any; or + * if not */ + /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { RExC_uni_semantics = 1; } - /* now is the next time */ + /* Ready to process either the single value, or the completed range */ if (!SIZE_ONLY) { #ifndef EBCDIC cp_list = _add_range_to_invlist(cp_list, prevvalue, value); @@ -11761,105 +12181,197 @@ parseit: } range = 0; /* this range (if it was one) is done now */ - } + } /* End of loop through all the text within the brackets */ - /* [\w] can be optimized into \w, but not if there is anything else in the - * brackets (except for an initial '^' which indictes omplementing). We - * also can optimize the common special case /[0-9]/ into /\d/a */ - if (element_count == 1 && - (has_special_charset_op - || has_special_non_charset_op - || (prevvalue == '0' && value == '9'))) - { - U8 op; - const char * cur_parse = RExC_parse; + /* If the character class contains only a single element, it may be + * optimizable into another node type which is smaller and runs faster. + * Check if this is the case for this class */ + if (element_count == 1) { + U8 op = END; + U8 arg = 0; - if (has_special_charset_op) { - U8 offset = get_regex_charset(RExC_flags); + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or + [:digit:] or \p{foo} */ - /* /aa is the same as /a for these */ - if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { - offset = REGEX_ASCII_RESTRICTED_CHARSET; - } + /* Certain named classes have equivalents that can appear outside a + * character class, e.g. \w, \H. We use these instead of a + * character class. */ switch ((I32)namedclass) { - case ANYOF_NALNUM: + U8 offset; + + /* The first group is for node types that depend on the charset + * modifier to the regex. We first calculate the base node + * type, and if it should be inverted */ + + case ANYOF_NWORDCHAR: invert = ! invert; /* FALLTHROUGH */ - case ANYOF_ALNUM: + case ANYOF_WORDCHAR: op = ALNUM; - break; + goto join_charset_classes; + case ANYOF_NSPACE: invert = ! invert; /* FALLTHROUGH */ case ANYOF_SPACE: op = SPACE; - break; + goto join_charset_classes; + case ANYOF_NDIGIT: invert = ! invert; /* FALLTHROUGH */ case ANYOF_DIGIT: op = DIGIT; - /* There is no DIGITU */ - if (offset == REGEX_UNICODE_CHARSET) { - offset = REGEX_DEPENDS_CHARSET; + join_charset_classes: + + /* Now that we have the base node type, we take advantage + * of the enum ordering of the charset modifiers to get the + * exact node type, For example the base SPACE also has + * SPACEL, SPACEU, and SPACEA */ + + offset = get_regex_charset(RExC_flags); + + /* /aa is the same as /a for these */ + if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */ } - break; - default: - Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass); - } - /* The number of varieties of each of these is the same, hence, so - * is the delta between the normal and complemented nodes */ - if (invert) { - offset += NALNUM - ALNUM; - } + op += offset; - op += offset; - } - else if (has_special_non_charset_op) { - switch ((I32)namedclass) { + /* The number of varieties of each of these is the same, + * hence, so is the delta between the normal and + * complemented nodes */ + if (invert) { + op += NALNUM - ALNUM; + } + *flagp |= HASWIDTH|SIMPLE; + break; + + /* The second group doesn't depend of the charset modifiers. + * We just have normal and complemented */ case ANYOF_NHORIZWS: invert = ! invert; /* FALLTHROUGH */ case ANYOF_HORIZWS: - op = HORIZWS; + is_horizws: + op = (invert) ? NHORIZWS : HORIZWS; + *flagp |= HASWIDTH|SIMPLE; break; + case ANYOF_NVERTWS: invert = ! invert; /* FALLTHROUGH */ case ANYOF_VERTWS: - op = VERTWS; + op = (invert) ? NVERTWS : VERTWS; + *flagp |= HASWIDTH|SIMPLE; + break; + + case ANYOF_MAX: break; + + case ANYOF_NBLANK: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_BLANK: + if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) { + goto is_horizws; + } + /* FALLTHROUGH */ default: - Perl_croak(aTHX_ "panic: Named character class %"IVdf" is not expected to have a non-[...] version", namedclass); + /* A generic posix class. All the /a ones can be handled + * by the POSIXA opcode. And all are closed under folding + * in the ASCII range, so FOLD doesn't matter */ + if (AT_LEAST_ASCII_RESTRICTED + || (! LOC && namedclass == ANYOF_ASCII)) + { + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + op = (invert) ? NPOSIXA : POSIXA; + } + break; } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ - /* The complement version of each of these nodes is adjacently next - * */ if (invert) { - op++; + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + op = (invert) ? NDIGITA : DIGITA; + *flagp |= HASWIDTH|SIMPLE; + } } - } - else { /* The remaining possibility is [0-9] */ - op = (invert) ? NDIGITA : DIGITA; } - /* Throw away this ANYOF regnode, and emit the calculated one, which - * should correspond to the beginning, not current, state of the parse - */ - RExC_parse = (char *)orig_parse; - RExC_emit = (regnode *)orig_emit; - ret = reg_node(pRExC_state, op); - RExC_parse = (char *) cur_parse; + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_CLASS_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + } - SvREFCNT_dec(listsv); - return ret; + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + *flagp |= HASWIDTH|SIMPLE; + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(listsv); + return ret; + } } if (SIZE_ONLY) return ret; - /****** !SIZE_ONLY AFTER HERE *********/ + /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/ /* If folding, we calculate all characters that could fold to or from the * ones already on the list */ @@ -11877,11 +12389,13 @@ parseit: } else { - /* This is a list of all the characters that participate in folds - * (except marks, etc in multi-char folds */ + /* Here, there are non-Latin1 code points, so we will have to go + * fetch the list of all the characters that participate in folds + */ if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); SvREFCNT_dec(swash); } @@ -11895,7 +12409,7 @@ parseit: * rules hard-coded into Perl. (This case happens legitimately * during compilation of Perl itself before the Unicode tables * are generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { + if (_invlist_len(PL_utf8_foldable) == 0) { PL_utf8_foldclosures = newHV(); } else { @@ -11905,9 +12419,8 @@ parseit: U8 dummy[UTF8_MAXBYTES+1]; STRLEN dummy_len; - /* This particular string is above \xff in both UTF-8 - * and UTFEBCDIC */ - to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = @@ -11960,12 +12473,12 @@ parseit: } else { depends_list = - add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + add_cp_to_invlist(depends_list, PL_fold_latin1[j]); } } if (HAS_NONLATIN1_FOLD_CLOSURE(j) - && (! isASCII(j) || ! MORE_ASCII_RESTRICTED)) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { /* Certain Latin1 characters have matches outside * Latin1, or are multi-character. To get here, 'j' is @@ -11995,27 +12508,24 @@ parseit: switch (j) { case 'k': case 'K': - /* KELVIN SIGN */ cp_list = - add_cp_to_invlist(cp_list, 0x212A); + add_cp_to_invlist(cp_list, KELVIN_SIGN); break; case 's': case 'S': - /* LATIN SMALL LETTER LONG S */ - cp_list = - add_cp_to_invlist(cp_list, 0x017F); + cp_list = add_cp_to_invlist(cp_list, + LATIN_SMALL_LETTER_LONG_S); break; case MICRO_SIGN: cp_list = add_cp_to_invlist(cp_list, - GREEK_SMALL_LETTER_MU); - cp_list = add_cp_to_invlist(cp_list, GREEK_CAPITAL_LETTER_MU); + cp_list = add_cp_to_invlist(cp_list, + GREEK_SMALL_LETTER_MU); break; case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - /* ANGSTROM SIGN */ cp_list = - add_cp_to_invlist(cp_list, 0x212B); + add_cp_to_invlist(cp_list, ANGSTROM_SIGN); break; case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: cp_list = add_cp_to_invlist(cp_list, @@ -12027,7 +12537,7 @@ parseit: /* Under /a, /d, and /u, this can match the two * chars "ss" */ - if (! MORE_ASCII_RESTRICTED) { + if (! ASCII_FOLD_RESTRICTED) { add_alternate(&unicode_alternate, (U8 *) "ss", 2); @@ -12072,7 +12582,7 @@ parseit: ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) | ((LOC) ? FOLD_FLAGS_LOCALE - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? FOLD_FLAGS_NOMIX_ASCII : 0)); @@ -12131,7 +12641,8 @@ parseit: /* /aa doesn't allow folds between ASCII and non-; * /l doesn't allow them between above and below * 256 */ - if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j))) || (LOC && ((c < 256) != (j < 256)))) { continue; @@ -12144,7 +12655,7 @@ parseit: cp_list = add_cp_to_invlist(cp_list, c); } else { - depends_list = add_cp_to_invlist(depends_list, c); + depends_list = add_cp_to_invlist(depends_list, c); } } } @@ -12156,9 +12667,10 @@ parseit: /* And combine the result (if any) with any inversion list from posix * classes. The lists are kept separate up to now because we don't want to - * fold the classes */ + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ if (posixes) { - if (AT_LEAST_UNI_SEMANTICS) { + if (! DEPENDS_SEMANTICS) { if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); SvREFCNT_dec(posixes); @@ -12168,7 +12680,6 @@ parseit: } } else { - /* Under /d, we put into a separate list the Latin1 things that * match only when the target string is utf8 */ SV* nonascii_but_latin1_properties = NULL; @@ -12248,18 +12759,17 @@ parseit: * Now we can see about various optimizations. Fold calculation (which we * did above) needs to take place before inversion. Otherwise /[^k]/i * would invert to include K, which under /i would match k, which it - * shouldn't. */ + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ - /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this does optimize those. It doesn't - * optimize locale. Doing so perhaps could be done as long as there is - * nothing like \w in it; some thought also would have to be given to the - * interaction with above 0x100 chars */ + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't invert + * if there are things such as \w, which aren't known until runtime */ if (invert - && ! LOC + && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) && ! depends_list && ! unicode_alternate - && SvCUR(listsv) == initial_listsv_len) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) { _invlist_invert(cp_list); @@ -12273,10 +12783,151 @@ parseit: invert = FALSE; } + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching) */ + if (FOLD && (LOC || unicode_alternate)) + { + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! unicode_alternate + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + *flagp |= HASWIDTH|SIMPLE; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACT node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. + * In the Latin1 range, being an alpha means that the + * character participates in a fold (except for the + * feminine and masculine ordinals, which I (khw) don't + * think are worrying about optimizing for). */ + if (value < 256) { + if (isALPHA_L1(value)) { + op = EXACT; + } + } + else { + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; + } + } + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value); + } + + SvREFCNT_dec(listsv); + return ret; + } + } + /* Here, contains all the code points we can determine at * compile time that match under all conditions. Go through it, and * for things that belong in the bitmap, put them there, and delete from - * */ + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + ANYOF_BITMAP_ZERO(ret); if (cp_list) { /* This gets set if we actually need to modify things */ @@ -12290,6 +12941,10 @@ parseit: UV high; int i; + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + /* Quit if are above what we should change */ if (start > 255) { break; @@ -12302,7 +12957,6 @@ parseit: for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - stored++; prevvalue = value; value = i; } @@ -12316,7 +12970,7 @@ parseit: } /* If have completely emptied it, remove it completely */ - if (invlist_len(cp_list) == 0) { + if (_invlist_len(cp_list) == 0) { SvREFCNT_dec(cp_list); cp_list = NULL; } @@ -12326,7 +12980,9 @@ parseit: ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - /* Combine the two lists into one. */ + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ if (depends_list) { if (cp_list) { _invlist_union(cp_list, depends_list, &cp_list); @@ -12337,123 +12993,15 @@ parseit: } } - /* Folding in the bitmap is taken care of above, but not for locale (for - * which we have to wait to see what folding is in effect at runtime), and - * for some things not in the bitmap (only the upper latin folds in this - * case, as all other single-char folding has been set above). Set - * run-time fold flag for these */ - if (FOLD && (LOC - || (DEPENDS_SEMANTICS - && cp_list - && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - || unicode_alternate)) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; - } - - /* A single character class can be "optimized" into an EXACTish node. - * Note that since we don't currently count how many characters there are - * outside the bitmap, we are XXX missing optimization possibilities for - * them. This optimization can't happen unless this is a truly single - * character class, which means that it can't be an inversion into a - * many-character class, and there must be no possibility of there being - * things outside the bitmap. 'stored' (only) for locales doesn't include - * \w, etc, so have to make a special test that they aren't present - * - * Similarly A 2-character class of the very special form like [bB] can be - * optimized into an EXACTFish node, but only for non-locales, and for - * characters which only have the two folds; so things like 'fF' and 'Ii' - * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE - * FI'. */ - if (! cp_list - && ! unicode_alternate - && SvCUR(listsv) == initial_listsv_len - && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) - && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) - || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) - /* If the latest code point has a fold whose - * bit is set, it must be the only other one */ - && ((prevvalue = PL_fold_latin1[value]) != (IV)value) - && ANYOF_BITMAP_TEST(ret, prevvalue))))) - { - /* Note that the information needed to decide to do this optimization - * is not currently available until the 2nd pass, and that the actually - * used EXACTish node takes less space than the calculated ANYOF node, - * and hence the amount of space calculated in the first pass is larger - * than actually used, so this optimization doesn't gain us any space. - * But an EXACT node is faster than an ANYOF node, and can be combined - * with any adjacent EXACT nodes later by the optimizer for further - * gains. The speed of executing an EXACTF is similar to an ANYOF - * node, so the optimization advantage comes from the ability to join - * it to adjacent EXACT nodes */ - - const char * cur_parse= RExC_parse; - U8 op; - RExC_emit = (regnode *)orig_emit; - RExC_parse = (char *)orig_parse; - - if (stored == 1) { - - /* A locale node with one point can be folded; all the other cases - * with folding will have two points, since we calculate them above - */ - if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { - op = EXACTFL; - } - else { - op = EXACT; - } - } - else { /* else 2 chars in the bit map: the folds of each other */ - - /* Use the folded value, which for the cases where we get here, - * is just the lower case of the current one (which may resolve to - * itself, or to the other one */ - value = toLOWER_LATIN1(value); - - /* To join adjacent nodes, they must be the exact EXACTish type. - * Try to use the most likely type, by using EXACTFA if possible, - * then EXACTFU if the regex calls for it, or is required because - * the character is non-ASCII. (If is ASCII, its fold is - * also ASCII for the cases where we get here.) */ - if (MORE_ASCII_RESTRICTED && isASCII(value)) { - op = EXACTFA; - } - else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { - op = EXACTFU; - } - else { /* Otherwise, more likely to be EXACTF type */ - op = EXACTF; - } - } - - ret = reg_node(pRExC_state, op); - RExC_parse = (char *)cur_parse; - if (UTF && ! NATIVE_IS_INVARIANT(value)) { - *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); - *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); - STR_LEN(ret)= 2; - RExC_emit += STR_SZ(2); - } - else { - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); - } - SvREFCNT_dec(listsv); - return ret; - } - /* If there is a swash and more than one element, we can't use the swash in * the optimization below. */ if (swash && element_count > 1) { SvREFCNT_dec(swash); swash = NULL; } + if (! cp_list - && SvCUR(listsv) == initial_listsv_len + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION && ! unicode_alternate) { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); @@ -12476,9 +13024,9 @@ parseit: AV * const av = newAV(); SV *rv; - av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) - ? &PL_sv_undef - : listsv); + av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv + : &PL_sv_undef); if (swash) { av_store(av, 1, swash); SvREFCNT_dec(cp_list); @@ -12507,8 +13055,11 @@ parseit: RExC_rxi->data->data[n] = (void*)rv; ARG_SET(ret, n); } + + *flagp |= HASWIDTH|SIMPLE; return ret; } +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() @@ -12599,7 +13150,7 @@ STATIC regnode * /* Location. */ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12641,7 +13192,7 @@ STATIC regnode * /* Location. */ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { dVAR; - register regnode *ptr; + regnode *ptr; regnode * const ret = RExC_emit; GET_RE_DEBUG_FLAGS_DECL; @@ -12713,9 +13264,9 @@ STATIC void S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) { dVAR; - register regnode *src; - register regnode *dst; - register regnode *place; + regnode *src; + regnode *dst; + regnode *place; const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; @@ -12801,7 +13352,7 @@ STATIC void S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGTAIL; @@ -12860,7 +13411,7 @@ STATIC U8 S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth) { dVAR; - register regnode *scan; + regnode *scan; U8 exact = PSEUDO; #ifdef EXPERIMENTAL_INPLACESCAN I32 min = 0; @@ -13101,7 +13652,41 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) { #ifdef DEBUGGING dVAR; - register int k; + int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -13222,39 +13807,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", - "[:alpha:]", - "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", - "[:lower:]", - "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", - "[:upper:]", - "[:^upper:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:blank:]", - "[:^blank:]" - }; if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); @@ -13392,6 +13944,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } + else if (k == POSIXD) { + U8 index = FLAGS(o) * 2; + if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + else { + sv_catpv(sv, anyofs[index]); + } + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); #else @@ -13895,7 +14456,7 @@ regnode * Perl_regnext(pTHX_ register regnode *p) { dVAR; - register I32 offset; + I32 offset; if (!p) return(NULL); @@ -13969,6 +14530,8 @@ Perl_save_re_context(pTHX) PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; + PL_reg_oldsavedoffset = 0; + PL_reg_oldsavedcoffset = 0; PL_reg_maxiter = 0; PL_reg_leftiter = 0; PL_reg_poscache = NULL; @@ -14054,8 +14617,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, SV* sv, I32 indent, U32 depth) { dVAR; - register U8 op = PSEUDO; /* Arbitrary non-END op. */ - register const regnode *next; + U8 op = PSEUDO; /* Arbitrary non-END op. */ + const regnode *next; const regnode *optstart= NULL; RXi_GET_DECL(r,ri); @@ -14106,9 +14669,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, if (PL_regkind[(U8)op] == BRANCHJ) { assert(next); { - register const regnode *nnode = (OP(next) == LONGJMP - ? regnext((regnode *)next) - : next); + const regnode *nnode = (OP(next) == LONGJMP + ? regnext((regnode *)next) + : next); if (last && nnode > last) nnode = last; DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);