X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/18af16d0e66326ce358cf094d4b27aa663e7a10d..236cbe8d7a6747fb6144c93dd767895f8acffd00:/regcomp.c diff --git a/regcomp.c b/regcomp.c index fdf96f7..1e1dcfd 100644 --- a/regcomp.c +++ b/regcomp.c @@ -86,6 +86,9 @@ #endif #include "dquote_static.c" +#ifndef PERL_IN_XSUB_RE +# include "charclass_invlists.h" +#endif #ifdef op #undef op @@ -142,6 +145,8 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ I32 in_lookbehind; + I32 contains_locale; + I32 override_recoding; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -190,6 +195,8 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_override_recoding (pRExC_state->override_recoding) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') @@ -509,6 +516,13 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN2regdep(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ @@ -678,7 +692,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL); else data->flags &= ~SF_FIX_BEFORE_EOL; - data->minlen_fixed=minlenp; + data->minlen_fixed=minlenp; data->lookbehind_fixed=0; } else { /* *data->longest == data->longest_float */ @@ -717,11 +731,24 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c { PERL_ARGS_ASSERT_CL_ANYTHING; - ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; - if (LOC) + cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL + |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, so many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_CLASS_SETALL(cl); /* /l uses class */ cl->flags |= ANYOF_LOCALE; + } + else { + ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + } } /* Can match anything (initialization) */ @@ -751,22 +778,15 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); } -STATIC void -S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) -{ - PERL_ARGS_ASSERT_CL_INIT_ZERO; - - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - if (LOC) - cl->flags |= ANYOF_LOCALE; -} +/* These two functions currently do the exact same thing */ +#define cl_init_zero S_cl_init -/* 'And' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* 'AND' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) @@ -775,6 +795,7 @@ S_cl_and(struct regnode_charclass_class *cl, assert(and_with->type == ANYOF); + /* I (khw) am not sure all these restrictions are necessary XXX */ if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -789,46 +810,119 @@ S_cl_and(struct regnode_charclass_class *cl, for (i = 0; i < ANYOF_BITMAP_SIZE; i++) cl->bitmap[i] &= and_with->bitmap[i]; } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - if (!(and_with->flags & ANYOF_EOS)) - cl->flags &= ~ANYOF_EOS; - if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)) - cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD; - if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; + if (and_with->flags & ANYOF_INVERT) { - if (cl->flags & ANYOF_UNICODE_ALL - && ANYOF_NONBITMAP(and_with) - && !(and_with->flags & ANYOF_INVERT)) - { - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + /* Here, the and'ed node is inverted. Get the AND of the flags that + * aren't affected by the inversion. Those that are affected are + * handled individually below */ + U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; + cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); + cl->flags |= affected_flags; + + /* We currently don't know how to deal with things that aren't in the + * bitmap, but we know that the intersection is no greater than what + * is already in cl, so let there be false positives that get sorted + * out after the synthetic start class succeeds, and the node is + * matched for real. */ + + /* The inversion of these two flags indicate that the resulting + * intersection doesn't have them */ + if (and_with->flags & ANYOF_UNICODE_ALL) { cl->flags &= ~ANYOF_UNICODE_ALL; } - else { - - /* The intersection of all unicode with something that isn't all - * unicode is that something */ - ARG_SET(cl, ARG(and_with)); + if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { + cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; } } - if (!(and_with->flags & ANYOF_UNICODE_ALL) && - !(and_with->flags & ANYOF_INVERT)) - { - cl->flags &= ~ANYOF_UNICODE_ALL; + else { /* and'd node is not inverted */ + U8 outside_bitmap_but_not_utf8; /* Temp variable */ + if (! ANYOF_NONBITMAP(and_with)) { - ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + + /* Here 'and_with' doesn't match anything outside the bitmap + * (except possibly ANYOF_UNICODE_ALL), which means the + * intersection can't either, except for ANYOF_UNICODE_ALL, in + * which case we don't know what the intersection is, but it's no + * greater than what cl already has, so can just leave it alone, + * with possible false positives */ + if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; + } + } + else if (! ANYOF_NONBITMAP(cl)) { + + /* Here, 'and_with' does match something outside the bitmap, and cl + * doesn't have a list of things to match outside the bitmap. If + * cl can match all code points above 255, the intersection will + * be those above-255 code points that 'and_with' matches. If cl + * can't match all Unicode code points, it means that it can't + * match anything outside the bitmap (since the 'if' that got us + * into this block tested for that), so we leave the bitmap empty. + */ + if (cl->flags & ANYOF_UNICODE_ALL) { + ARG_SET(cl, ARG(and_with)); + + /* and_with's ARG may match things that don't require UTF8. + * And now cl's will too, in spite of this being an 'and'. See + * the comments below about the kludge */ + cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8; + } } + else { + /* Here, both 'and_with' and cl match something outside the + * bitmap. Currently we do not do the intersection, so just match + * whatever cl had at the beginning. */ + } + + + /* Take the intersection of the two sets of flags. However, the + * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a + * kludge around the fact that this flag is not treated like the others + * which are initialized in cl_anything(). The way the optimizer works + * is that the synthetic start class (SSC) is initialized to match + * anything, and then the first time a real node is encountered, its + * values are AND'd with the SSC's with the result being the values of + * the real node. However, there are paths through the optimizer where + * the AND never gets called, so those initialized bits are set + * inappropriately, which is not usually a big deal, as they just cause + * false positives in the SSC, which will just mean a probably + * imperceptible slow down in execution. However this bit has a + * higher false positive consequence in that it can cause utf8.pm, + * utf8_heavy.pl ... to be loaded when not necessary, which is a much + * bigger slowdown and also causes significant extra memory to be used. + * In order to prevent this, the code now takes a different tack. The + * bit isn't set unless some part of the regular expression needs it, + * but once set it won't get cleared. This means that these extra + * modules won't get loaded unless there was some path through the + * pattern that would have required them anyway, and so any false + * positives that occur by not ANDing them out when they could be + * aren't as severe as they would be if we treated this bit like all + * the others */ + outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags) + & ANYOF_NONBITMAP_NON_UTF8; + cl->flags &= and_with->flags; + cl->flags |= outside_bitmap_but_not_utf8; } } -/* 'OR' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* 'OR' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if + * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { PERL_ARGS_ASSERT_CL_OR; if (or_with->flags & ANYOF_INVERT) { + + /* Here, the or'd node is to be inverted. This means we take the + * complement of everything not in the bitmap, but currently we don't + * know what that is, so give up and match anything */ + if (ANYOF_NONBITMAP(or_with)) { + cl_anything(pRExC_state, cl); + } /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) * <= (B1 | !B2) | (CL1 | !CL2) @@ -838,7 +932,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { int i; @@ -849,7 +943,21 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con else { cl_anything(pRExC_state, cl); } - } else { + + /* And, we can just take the union of the flags that aren't affected + * by the inversion */ + cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + + /* For the remaining flags: + ANYOF_UNICODE_ALL and inverted means to not match anything above + 255, which means that the union with cl should just be + what cl has in it, so can ignore this flag + ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord + is 127-255 to match them, but then invert that, so the + union with cl should just be what cl has in it, so can + ignore this flag + */ + } else { /* 'or_with' is not inverted */ /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) @@ -868,25 +976,32 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con else { /* XXXX: logic is complicated, leave it along for a moment. */ cl_anything(pRExC_state, cl); } - } - if (or_with->flags & ANYOF_EOS) - cl->flags |= ANYOF_EOS; - if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; + if (ANYOF_NONBITMAP(or_with)) { + + /* Use the added node's outside-the-bit-map match if there isn't a + * conflict. If there is a conflict (both nodes match something + * outside the bitmap, but what they match outside is not the same + * pointer, and hence not easily compared until XXX we extend + * inversion lists this far), give up and allow the start class to + * match everything outside the bitmap. If that stuff is all above + * 255, can just set UNICODE_ALL, otherwise caould be anything. */ + if (! ANYOF_NONBITMAP(cl)) { + ARG_SET(cl, ARG(or_with)); + } + else if (ARG(cl) != ARG(or_with)) { - /* If both nodes match something outside the bitmap, but what they match - * outside is not the same pointer, and hence not easily compared, give up - * and allow the start class to match everything outside the bitmap */ - if (ANYOF_NONBITMAP(cl) && ANYOF_NONBITMAP(or_with) && - ARG(cl) != ARG(or_with)) { - cl->flags |= ANYOF_UNICODE_ALL; - } + if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { + cl_anything(pRExC_state, cl); + } + else { + cl->flags |= ANYOF_UNICODE_ALL; + } + } + } - if (or_with->flags & ANYOF_UNICODE_ALL) { - cl->flags |= ANYOF_UNICODE_ALL; + /* Take the union */ + cl->flags |= or_with->flags; } } @@ -1275,8 +1390,8 @@ is the recommended Unicode-aware way of saying scan += len; \ len = 0; \ } else { \ - uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\ - uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc);\ + uvc = to_utf8_fold( uc, foldbuf, &foldlen); \ foldlen -= UNISKIP( uvc ); \ scan = foldbuf + UNISKIP( uvc ); \ } \ @@ -1403,10 +1518,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif switch (flags) { + case EXACT: break; case EXACTFA: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; + default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags ); } trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) ); @@ -1593,7 +1710,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, "%*sCompiling trie using list compiler\n", (int)depth * 2 + 2, "")); - + trie->states = (reg_trie_state *) PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, sizeof(reg_trie_state) ); @@ -2391,15 +2508,116 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }}); +/* The below joins as many adjacent EXACTish nodes as possible into a single + * one, and looks for problematic sequences of characters whose folds vs. + * non-folds have sufficiently different lengths, that the optimizer would be + * fooled into rejecting legitimate matches of them, and the trie construction + * code can't cope with them. The joining is only done if: + * 1) there is room in the current conglomerated node to entirely contain the + * next one. + * 2) they are the exact same node type + * + * The adjacent nodes actually may be separated by NOTHING kind nodes, and + * these get optimized out + * + * If there are problematic code sequences, *min_subtract is set to the delta + * that the minimum size of the node can be less than its actual size. And, + * the node type of the result is changed to reflect that it contains these + * sequences. + * + * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF + * and contains LATIN SMALL LETTER SHARP S + * + * This is as good a place as any to discuss the design of handling these + * problematic sequences. It's been wrong in Perl for a very long time. There + * are three code points in Unicode whose folded lengths differ so much from + * the un-folded lengths that it causes problems for the optimizer and trie + * construction. Why only these are problematic, and not others where lengths + * also differ is something I (khw) do not understand. New versions of Unicode + * might add more such code points. Hopefully the logic in fold_grind.t that + * figures out what to test (in part by verifying that each size-combination + * gets tested) will catch any that do come along, so they can be added to the + * special handling below. The chances of new ones are actually rather small, + * as most, if not all, of the world's scripts that have casefolding have + * already been encoded by Unicode. Also, a number of Unicode's decisions were + * made to allow compatibility with pre-existing standards, and almost all of + * those have already been dealt with. These would otherwise be the most + * likely candidates for generating further tricky sequences. In other words, + * Unicode by itself is unlikely to add new ones unless it is for compatibility + * with pre-existing standards, and there aren't many of those left. + * + * The previous designs for dealing with these involved assigning a special + * node for them. This approach doesn't work, as evidenced by this example: + * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches + * Both these fold to "sss", but if the pattern is parsed to create a node of + * that would match just the \xDF, it won't be able to handle the case where a + * successful match would have to cross the node's boundary. The new approach + * that hopefully generally solves the problem generates an EXACTFU_SS node + * that is "sss". + * + * There are a number of components to the approach (a lot of work for just + * three code points!): + * 1) This routine examines each EXACTFish node that could contain the + * problematic sequences. It returns in *min_subtract how much to + * subtract from the the actual length of the string to get a real minimum + * for one that could match it. This number is usually 0 except for the + * problematic sequences. This delta is used by the caller to adjust the + * min length of the match, and the delta between min and max, so that the + * optimizer doesn't reject these possibilities based on size constraints. + * 2) These sequences are not currently correctly handled by the trie code + * either, so it changes the joined node type to ops that are not handled + * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE. + * 3) This is sufficient for the two Greek sequences (described below), but + * the one involving the Sharp s (\xDF) needs more. The node type + * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" + * sequence in it. For non-UTF-8 patterns and strings, this is the only + * case where there is a possible fold length change. That means that a + * regular EXACTFU node without UTF-8 involvement doesn't have to concern + * itself with length changes, and so can be processed faster. regexec.c + * takes advantage of this. Generally, an EXACTFish node that is in UTF-8 + * is pre-folded by regcomp.c. This saves effort in regex matching. + * However, probably mostly for historical reasons, the pre-folding isn't + * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL + * nodes, as what they fold to isn't known until runtime.) The fold + * possibilities for the non-UTF8 patterns are quite simple, except for + * the sharp s. All the ones that don't involve a UTF-8 target string + * are members of a fold-pair, and arrays are set up for all of them + * that quickly find the other member of the pair. It might actually + * be faster to pre-fold these, but it isn't currently done, except for + * the sharp s. Code elsewhere in this file makes sure that it gets + * folded to 'ss', even if the pattern isn't UTF-8. This avoids the + * issues described in the next item. + * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches + * 'ss' or not is not knowable at compile time. It will match iff the + * target string is in UTF-8, unlike the EXACTFU nodes, where it always + * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus + * it can't be folded to "ss" at compile time, unlike EXACTFU does as + * described in item 3). An assumption that the optimizer part of + * regexec.c (probably unwittingly) makes is that a character in the + * pattern corresponds to at most a single character in the target string. + * (And I do mean character, and not byte here, unlike other parts of the + * documentation that have never been updated to account for multibyte + * Unicode.) This assumption is wrong only in this case, as all other + * cases are either 1-1 folds when no UTF-8 is involved; or is true by + * virtue of having this file pre-fold UTF-8 patterns. I'm + * reluctant to try to change this assumption, so instead the code punts. + * This routine examines EXACTF nodes for the sharp s, and returns a + * boolean indicating whether or not the node is an EXACTF node that + * contains a sharp s. When it is true, the caller sets a flag that later + * causes the optimizer in this file to not set values for the floating + * and fixed string lengths, and thus avoids the optimizer code in + * regexec.c that makes the invalid assumption. Thus, there is no + * optimization based on string lengths for EXACTF nodes that contain the + * sharp s. This only happens for /id rules (which means the pattern + * isn't in UTF-8). + */ - - -#define JOIN_EXACT(scan,min,flags) \ +#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \ if (PL_regkind[OP(scan)] == EXACT) \ - join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1) + join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1) STATIC U32 -S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) { +S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) { /* Merge several consecutive EXACTish nodes into one. */ regnode *n = regnext(scan); U32 stringok = 1; @@ -2419,13 +2637,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags PERL_UNUSED_ARG(val); #endif DEBUG_PEEP("join",scan,depth); - - /* Skip NOTHING, merge EXACT*. */ - while (n && - ( PL_regkind[OP(n)] == NOTHING || - (stringok && (OP(n) == OP(scan)))) + + /* Look through the subsequent nodes in the chain. Skip NOTHING, merge + * EXACT ones that are mergeable to the current one. */ + while (n + && (PL_regkind[OP(n)] == NOTHING + || (stringok && OP(n) == OP(scan))) && NEXT_OFF(n) - && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) { + && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) + { if (OP(n) == TAIL || n > next) stringok = 0; @@ -2442,12 +2662,13 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags else if (stringok) { const unsigned int oldl = STR_LEN(scan); regnode * const nnext = regnext(n); + + if (oldl + STR_LEN(n) > U8_MAX) + break; DEBUG_PEEP("merg",n,depth); - merged++; - if (oldl + STR_LEN(n) > U8_MAX) - break; + NEXT_OFF(scan) += NEXT_OFF(n); STR_LEN(scan) += STR_LEN(n); next = n + NODE_SZ_STR(n); @@ -2473,75 +2694,188 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags } #endif } -#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390 -#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS -#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0 -#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS - if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) - && ( STR_LEN(scan) >= 6 ) ) - { - /* - Two problematic code points in Unicode casefolding of EXACT nodes: - - U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS - U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS - - which casefold to - - Unicode UTF-8 - - U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 - U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 - - This means that in case-insensitive matching (or "loose matching", - as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte - length of the above casefolded versions) can match a target string - of length two (the byte length of UTF-8 encoded U+0390 or U+03B0). - This would rather mess up the minimum length computation. - - What we'll do is to look for the tail four bytes, and then peek - at the preceding two bytes to see whether we need to decrease - the minimum length by four (six minus two). - - Thanks to the design of UTF-8, there cannot be false matches: - A sequence of valid UTF-8 bytes cannot be a subsequence of - another valid sequence of UTF-8 bytes. - - */ - char * const s0 = STRING(scan), *s, *t; - char * const s1 = s0 + STR_LEN(scan) - 1; - char * const s2 = s1 - 4; + *min_subtract = 0; + *has_exactf_sharp_s = FALSE; + + /* Here, all the adjacent mergeable EXACTish nodes have been merged. We + * can now analyze for sequences of problematic code points. (Prior to + * this final joining, sequences could have been split over boundaries, and + * hence missed). The sequences only happen in folding, hence for any + * non-EXACT EXACTish node */ + if (OP(scan) != EXACT) { + U8 *s; + U8 * s0 = (U8*) STRING(scan); + U8 * const s_end = s0 + STR_LEN(scan); + + /* The below is perhaps overboard, but this allows us to save a test + * each time through the loop at the expense of a mask. This is + * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a + * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64. + * This uses an exclusive 'or' to find that bit and then inverts it to + * form a mask, with just a single 0, in the bit position where 'S' and + * 's' differ. */ + const U8 S_or_s_mask = (U8) ~ ('S' ^ 's'); + const U8 s_masked = 's' & S_or_s_mask; + + /* One pass is made over the node's string looking for all the + * possibilities. to avoid some tests in the loop, there are two main + * cases, for UTF-8 patterns (which can't have EXACTF nodes) and + * non-UTF-8 */ + if (UTF) { + + /* There are two problematic Greek code points in Unicode + * casefolding + * + * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS + * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS + * + * which casefold to + * + * Unicode UTF-8 + * + * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81 + * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81 + * + * This means that in case-insensitive matching (or "loose + * matching", as Unicode calls it), an EXACTF of length six (the + * UTF-8 encoded byte length of the above casefolded versions) can + * match a target string of length two (the byte length of UTF-8 + * encoded U+0390 or U+03B0). This would rather mess up the + * minimum length computation. (there are other code points that + * also fold to these two sequences, but the delta is smaller) + * + * If these sequences are found, the minimum length is decreased by + * four (six minus two). + * + * Similarly, 'ss' may match the single char and byte LATIN SMALL + * LETTER SHARP S. We decrease the min length by 1 for each + * occurrence of 'ss' found */ + #ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */ - const char t0[] = "\xaf\x49\xaf\x42"; -#else - const char t0[] = "\xcc\x88\xcc\x81"; -#endif - const char * const t1 = t0 + 3; - - for (s = s0 + 2; - s < s2 && (t = ninstr(s, s1, t0, t1)); - s = t + 4) { -#ifdef EBCDIC - if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) || - ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5)) +# define U390_first_byte 0xb4 + const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42"; +# define U3B0_first_byte 0xb5 + const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42"; #else - if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) || - ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF)) +# define U390_first_byte 0xce + const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81"; +# define U3B0_first_byte 0xcf + const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81"; #endif - *min -= 4; - } + const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte; + yields a net of 0 */ + /* Examine the string for one of the problematic sequences */ + for (s = s0; + s < s_end - 1; /* Can stop 1 before the end, as minimum length + * sequence we are looking for is 2 */ + s += UTF8SKIP(s)) + { + + /* Look for the first byte in each problematic sequence */ + switch (*s) { + /* We don't have to worry about other things that fold to + * 's' (such as the long s, U+017F), as all above-latin1 + * code points have been pre-folded */ + case 's': + case 'S': + + /* Current character is an 's' or 'S'. If next one is + * as well, we have the dreaded sequence */ + if (((*(s+1) & S_or_s_mask) == s_masked) + /* These two node types don't have special handling + * for 'ss' */ + && OP(scan) != EXACTFL && OP(scan) != EXACTFA) + { + *min_subtract += 1; + OP(scan) = EXACTFU_SS; + s++; /* No need to look at this character again */ + } + break; + + case U390_first_byte: + if (s_end - s >= len + + /* The 1's are because are skipping comparing the + * first byte */ + && memEQ(s + 1, U390_tail, len - 1)) + { + goto greek_sequence; + } + break; + + case U3B0_first_byte: + if (! (s_end - s >= len + && memEQ(s + 1, U3B0_tail, len - 1))) + { + break; + } + greek_sequence: + *min_subtract += 4; + + /* This can't currently be handled by trie's, so change + * the node type to indicate this. If EXACTFA and + * EXACTFL were ever to be handled by trie's, this + * would have to be changed. If this node has already + * been changed to EXACTFU_SS in this loop, leave it as + * is. (I (khw) think it doesn't matter in regexec.c + * for UTF patterns, but no need to change it */ + if (OP(scan) == EXACTFU) { + OP(scan) = EXACTFU_NO_TRIE; + } + s += 6; /* We already know what this sequence is. Skip + the rest of it */ + break; + } + } + } + else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) { + + /* Here, the pattern is not UTF-8. We need to look only for the + * 'ss' sequence, and in the EXACTF case, the sharp s, which can be + * in the final position. Otherwise we can stop looking 1 byte + * earlier because have to find both the first and second 's' */ + const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1; + + for (s = s0; s < upper; s++) { + switch (*s) { + case 'S': + case 's': + if (s_end - s > 1 + && ((*(s+1) & S_or_s_mask) == s_masked)) + { + *min_subtract += 1; + + /* EXACTF nodes need to know that the minimum + * length changed so that a sharp s in the string + * can match this ss in the pattern, but they + * remain EXACTF nodes, as they are not trie'able, + * so don't have to invent a new node type to + * exclude them from the trie code */ + if (OP(scan) != EXACTF) { + OP(scan) = EXACTFU_SS; + } + s++; + } + break; + case LATIN_SMALL_LETTER_SHARP_S: + if (OP(scan) == EXACTF) { + *has_exactf_sharp_s = TRUE; + } + break; + } + } + } } - + #ifdef DEBUGGING - /* Allow dumping */ + /* Allow dumping but overwriting the collection of skipped + * ops and/or strings with fake optimized ops */ n = scan + NODE_SZ_STR(scan); while (n <= stop) { - if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) { - OP(n) = OPTIMIZED; - NEXT_OFF(n) = 0; - } + OP(n) = OPTIMIZED; + FLAGS(n) = 0; + NEXT_OFF(n) = 0; n++; } #endif @@ -2647,10 +2981,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, fake_study_recurse: while ( scan && OP(scan) != END && scan < last ){ + UV min_subtract = 0; /* How much to subtract from the minimum node + length to get a real minimum (because the + folded version may be shorter) */ + bool has_exactf_sharp_s = FALSE; /* Peephole optimizer: */ DEBUG_STUDYDATA("Peep:", data,depth); DEBUG_PEEP("Peep",scan,depth); - JOIN_EXACT(scan,&min,0); + + /* Its not clear to khw or hv why this is done here, and not in the + * clauses that deal with EXACT nodes. khw's guess is that it's + * because of a previous design */ + JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0); /* Follow the next-chain of the current node and optimize away all the NOTHINGs from it. */ @@ -2662,7 +3004,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); int noff; regnode *n = scan; - + /* Skip NOTHING and LONGJMP. */ while ((n = regnext(n)) && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n))) @@ -2684,7 +3026,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, next = regnext(scan); code = OP(scan); /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */ - + if (OP(next) == code || code == IFTHEN) { /* NOTE - There is similar code to this block below for handling TRIE nodes on a re-study. If you change stuff here check there @@ -2692,7 +3034,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 max1 = 0, min1 = I32_MAX, num = 0; struct regnode_charclass_class accum; regnode * const startbranch=scan; - + if (flags & SCF_DO_SUBSTR) SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) @@ -2829,7 +3171,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, a nested if into a case structure of sorts. */ - + int made=0; if (!re_trie_maxbuff) { re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); @@ -2936,17 +3278,20 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } else { /* - Currently we do not believe that the trie logic can - handle case insensitive matching properly when the - pattern is not unicode (thus forcing unicode semantics). + Currently the trie logic handles case insensitive matching properly only + when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode + semantics). If/when this is fixed the following define can be swapped in below to fully enable trie logic. #define TRIE_TYPE_IS_SAFE 1 +Note that join_exact() assumes that the other types of EXACTFish nodes are not +used in tries, so that would have to be updated if this changed + */ -#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT) if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, @@ -2979,7 +3324,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( last && TRIE_TYPE_IS_SAFE ) { made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 ); -#ifdef TRIE_STUDY_OPT +#ifdef TRIE_STUDY_OPT if ( ((made == MADE_EXACT_TRIE && startbranch == first) || ( first_non_open == first )) && @@ -3164,9 +3509,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, l = utf8_length(s, s + l); uc = utf8_to_uvchr(s, NULL); } - min += l; - if (flags & SCF_DO_SUBSTR) - data->pos_min += l; + else if (has_exactf_sharp_s) { + RExC_seen |= REG_SEEN_EXACTF_SHARP_S; + } + min += l - min_subtract; + if (min < 0) { + min = 0; + } + delta += min_subtract; + if (flags & SCF_DO_SUBSTR) { + data->pos_min += l - min_subtract; + if (data->pos_min < 0) { + data->pos_min = 0; + } + data->pos_delta += min_subtract; + if (min_subtract) { + data->longest = &(data->longest_float); + } + } if (flags & SCF_DO_STCLASS_AND) { /* Check whether it is compatible with what we know already! */ int compat = 1; @@ -3184,6 +3544,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { + /* XXX This set is probably no longer necessary, and + * probably wrong as LOCALE now is on in the initial + * state */ data->start_class->flags |= ANYOF_LOCALE; } else { @@ -3191,8 +3554,22 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* Also set the other member of the fold pair. In case * that unicode semantics is called for at runtime, use * the full latin1 fold. (Can't do this for locale, - * because not known until runtime */ + * because not known until runtime) */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All other (EXACTFL handled above) folds except under + * /iaa that include s, S, and sharp_s also may include + * the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } else if (uc >= 0x100) { @@ -3217,6 +3594,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * run-time */ ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); + + /* All folds except under /iaa that include s, S, + * and sharp_s also may include the others */ + if (OP(scan) != EXACTFA) { + if (uc == 's' || uc == 'S') { + ANYOF_BITMAP_SET(data->start_class, + LATIN_SMALL_LETTER_SHARP_S); + } + else if (uc == LATIN_SMALL_LETTER_SHARP_S) { + ANYOF_BITMAP_SET(data->start_class, 's'); + ANYOF_BITMAP_SET(data->start_class, 'S'); + } + } } } data->start_class->flags &= ~ANYOF_EOS; @@ -3622,18 +4012,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - else if (OP(scan) == FOLDCHAR) { - int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2; - flags &= ~SCF_DO_STCLASS; - min += 1; - delta += d; - if (flags & SCF_DO_SUBSTR) { - SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ - data->pos_min += 1; - data->pos_delta += d; - data->longest = &(data->longest_float); - } - } else if (REGNODE_SIMPLE(OP(scan))) { int value = 0; @@ -3696,7 +4074,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); - else if (OP(scan) == ALNUMU) { + + /* Even if under locale, set the bits for non-locale + * in case it isn't a true locale-node. This will + * create false positives if it truly is locale */ + if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3733,19 +4115,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); - else { - if (OP(scan) == NALNUMU) { - for (value = 0; value < 256; value++) { - if (! isWORDCHAR_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (! isALNUM(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } + + /* Even if under locale, set the bits for non-locale in + * case it isn't a true locale-node. This will create + * false positives if it truly is locale */ + if (OP(scan) == NALNUMU) { + for (value = 0; value < 256; value++) { + if (! isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (! isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } } } } @@ -3773,7 +4157,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); } - else if (OP(scan) == SPACEU) { + if (OP(scan) == SPACEU) { for (value = 0; value < 256; value++) { if (isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3810,7 +4194,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - else if (OP(scan) == NSPACEU) { + if (OP(scan) == NSPACEU) { for (value = 0; value < 256; value++) { if (!isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3828,24 +4212,25 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; case DIGIT: if (flags & SCF_DO_STCLASS_AND) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); - else { - for (value = 0; value < 256; value++) - if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); } break; case NDIGIT: if (flags & SCF_DO_STCLASS_AND) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); + if (!(data->start_class->flags & ANYOF_LOCALE)) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); for (value = 0; value < 256; value++) if (isDIGIT(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); @@ -3853,16 +4238,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); - else { - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); } break; CASE_SYNST_FNC(VERTWS); CASE_SYNST_FNC(HORIZWS); - + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -4243,7 +4626,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, flags &= ~SCF_DO_SUBSTR; } #endif /* old or new */ -#endif /* TRIE_STUDY_OPT */ +#endif /* TRIE_STUDY_OPT */ /* Else: zero-length, ignore. */ scan = regnext(scan); @@ -4403,7 +4786,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) struct regexp *r; register regexp_internal *ri; STRLEN plen; - char *exp; + char* VOL exp; char* xend; regnode *scan; I32 flags; @@ -4416,6 +4799,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) I32 sawplus = 0; I32 sawopen = 0; bool used_setjump = FALSE; + regex_charset initial_charset = get_regex_charset(orig_pm_flags); U8 jump_ret = 0; dJMPENV; @@ -4432,8 +4816,74 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) DEBUG_r(if (!PL_colorset) reginitcolors()); - RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); +#ifndef PERL_IN_XSUB_RE + /* Initialize these here instead of as-needed, as is quick and avoids + * having to test them each time otherwise */ + if (! PL_AboveLatin1) { + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); + PL_ASCII = _new_invlist_C_array(ASCII_invlist); + PL_Latin1 = _new_invlist_C_array(Latin1_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist); + PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist); + + PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist); + PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist); + + PL_L1Cased = _new_invlist_C_array(L1Cased_invlist); + + PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist); + PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist); + + PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist); + + PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); + PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); + + PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); + PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); + + PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); + PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); + + PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist); + PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist); + + PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist); + PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist); + + PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist); + PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist); + + PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist); + PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist); + + PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist); + PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist); + + PL_VertSpace = _new_invlist_C_array(VertSpace_invlist); + + PL_PosixWord = _new_invlist_C_array(PosixWord_invlist); + PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist); + + PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist); + PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist); + } +#endif + + exp = SvPV(pattern, plen); + + if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = 0; + } + else { + RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); + } RExC_uni_semantics = 0; + RExC_contains_locale = 0; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -4443,12 +4893,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) } if (jump_ret == 0) { /* First time through */ - exp = SvPV(pattern, plen); xend = exp + plen; - /* ignore the utf8ness if the pattern is 0 length */ - if (plen == 0) { - RExC_utf8 = RExC_orig_utf8 = 0; - } DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); @@ -4480,7 +4925,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) -- dmq */ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len); + exp = (char*)Perl_bytes_to_utf8(aTHX_ + (U8*)SvPV_nomg(pattern, plen), + &len); xend = exp + len; RExC_orig_utf8 = RExC_utf8 = 1; SAVEFREEPV(exp); @@ -4490,11 +4937,15 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) restudied = 0; #endif - /* Set to use unicode semantics if the pattern is in utf8 and has the - * 'depends' charset specified, as it means unicode when utf8 */ pm_flags = orig_pm_flags; - if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) { + if (initial_charset == REGEX_LOCALE_CHARSET) { + RExC_contains_locale = 1; + } + else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); } @@ -4507,6 +4958,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_seen_evals = 0; RExC_extralen = 0; + RExC_override_recoding = 0; /* First pass: determine size, legality. */ RExC_parse = exp; @@ -4532,7 +4984,11 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)RExC_emit); #endif - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; return(NULL); @@ -4784,7 +5240,7 @@ reStudy: sawplus = 1; else first += regarglen[OP(first)]; - + first = NEXTOPER(first); first_next= regnext(first); } @@ -4799,7 +5255,7 @@ reStudy: else ri->regstclass = first; } -#ifdef TRIE_STCLASS +#ifdef TRIE_STCLASS else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { @@ -4820,7 +5276,7 @@ reStudy: make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); ri->regstclass = trie_op; } -#endif +#endif else if (REGNODE_SIMPLE(OP(first))) ri->regstclass = first; else if (PL_regkind[OP(first)] == BOUND || @@ -4886,7 +5342,7 @@ reStudy: * it happens that c_offset_min has been invalidated, since the * earlier string may buy us something the later one won't.] */ - + data.longest_fixed = newSVpvs(""); data.longest_float = newSVpvs(""); data.last_found = newSVpvs(""); @@ -4904,7 +5360,7 @@ reStudy: &data, -1, NULL, NULL, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0); - + CHECK_RESTUDY_GOTO; @@ -4929,9 +5385,11 @@ reStudy: { I32 t,ml; - if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ - && data.offset_fixed == data.offset_float_min - && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S) + || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))) goto remove_float; /* As in (a)+. */ /* copy the information about the longest float from the reg_scan_data @@ -4974,10 +5432,13 @@ reStudy: Be careful. */ longest_fixed_length = CHR_SVLEN(data.longest_fixed); - if (longest_fixed_length - || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ - && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (RExC_flags & RXf_PMf_MULTILINE)))) + + /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */ + if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) + && (longest_fixed_length + || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ + && (!(data.flags & SF_FIX_BEFORE_MEOL) + || (RExC_flags & RXf_PMf_MULTILINE)))) ) { I32 t,ml; @@ -5015,14 +5476,13 @@ reStudy: && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5073,7 +5533,7 @@ reStudy: I32 fake; struct regnode_charclass_class ch_class; I32 last_close = 0; - + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); scan = ri->program + 1; @@ -5090,12 +5550,11 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5280,7 +5739,8 @@ Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv, if (!retarray) return ret; } else { - ret = newSVsv(&PL_sv_undef); + if (retarray) + ret = newSVsv(&PL_sv_undef); } if (retarray) av_push(retarray, ret); @@ -5644,7 +6104,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) return sv_dat; } else { - Perl_croak(aTHX_ "panic: bad flag in reg_scan_name"); + Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", + (unsigned long) flags); } /* NOT REACHED */ } @@ -5699,123 +6160,195 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* This section of code defines the inversion list object and its methods. The * interfaces are highly subject to change, so as much as possible is static to - * this file. An inversion list is here implemented as a malloc'd C array with - * some added info. More will be coming when functionality is added later. + * this file. An inversion list is here implemented as a malloc'd C UV array + * with some added info that is placed as UVs at the beginning in a header + * portion. An inversion list for Unicode is an array of code points, sorted + * by ordinal number. The zeroth element is the first code point in the list. + * The 1th element is the first element beyond that not in the list. In other + * words, the first range is + * invlist[0]..(invlist[1]-1) + * The other ranges follow. Thus every element whose index is divisible by two + * marks the beginning of a range that is in the list, and every element not + * divisible by two marks the beginning of a range not in the list. A single + * element inversion list that contains the single code point N generally + * consists of two elements + * invlist[0] == N + * invlist[1] == N+1 + * (The exception is when N is the highest representable value on the + * machine, in which case the list containing just it would be a single + * element, itself. By extension, if the last range in the list extends to + * infinity, then the first element of that range will be in the inversion list + * at a position that is divisible by two, and is the final element in the + * list.) + * Taking the complement (inverting) an inversion list is quite simple, if the + * first element is 0, remove it; otherwise add a 0 element at the beginning. + * This implementation reserves an element at the beginning of each inversion list + * to contain 0 when the list contains 0, and contains 1 otherwise. The actual + * beginning of the list is either that element if 0, or the next one if 1. + * + * More about inversion lists can be found in "Unicode Demystified" + * Chapter 13 by Richard Gillam, published by Addison-Wesley. + * More will be coming when functionality is added later. + * + * The inversion list data structure is currently implemented as an SV pointing + * to an array of UVs that the SV thinks are bytes. This allows us to have an + * array of UV whose memory management is automatically handled by the existing + * facilities for SV's. * * Some of the methods should always be private to the implementation, and some * should eventually be made public */ +#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */ +#define INVLIST_ITER_OFFSET 1 /* Current iteration position */ + +/* This is a combination of a version and data structure type, so that one + * being passed in can be validated to be an inversion list of the correct + * vintage. When the structure of the header is changed, a new random number + * in the range 2**31-1 should be generated and the new() method changed to + * insert that at this location. Then, if an auxiliary program doesn't change + * correspondingly, it will be discovered immediately */ +#define INVLIST_VERSION_ID_OFFSET 2 +#define INVLIST_VERSION_ID 1064334010 + +/* For safety, when adding new elements, remember to #undef them at the end of + * the inversion list code section */ + +#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */ +/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list + * contains the code point U+00000, and begins here. If 1, the inversion list + * doesn't contain U+0000, and it begins at the next UV in the array. + * Inverting an inversion list consists of adding or removing the 0 at the + * beginning of it. By reserving a space for that 0, inversion can be made + * very fast */ + +#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1) + +/* Internally things are UVs */ +#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV)) +#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH) + #define INVLIST_INITIAL_LEN 10 -#define INVLIST_ARRAY_KEY "array" -#define INVLIST_MAX_KEY "max" -#define INVLIST_LEN_KEY "len" PERL_STATIC_INLINE UV* -S_invlist_array(pTHX_ HV* const invlist) +S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0) { - /* Returns the pointer to the inversion list's array. Every time the - * length changes, this needs to be called in case malloc or realloc moved - * it */ + /* Returns a pointer to the first element in the inversion list's array. + * This is called upon initialization of an inversion list. Where the + * array begins depends on whether the list has the code point U+0000 + * in it or not. The other parameter tells it whether the code that + * follows this call is about to put a 0 in the inversion list or not. + * The first element is either the element with 0, if 0, or the next one, + * if 1 */ - SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE); + UV* zero = get_invlist_zero_addr(invlist); - PERL_ARGS_ASSERT_INVLIST_ARRAY; + PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT; - if (list_ptr == NULL) { - Perl_croak(aTHX_ "panic: inversion list without a '%s' element", - INVLIST_ARRAY_KEY); - } + /* Must be empty */ + assert(! *get_invlist_len_addr(invlist)); - return INT2PTR(UV *, SvUV(*list_ptr)); + /* 1^1 = 0; 1^0 = 1 */ + *zero = 1 ^ will_have_0; + return zero + *zero; } -PERL_STATIC_INLINE void -S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array) +PERL_STATIC_INLINE UV* +S_invlist_array(pTHX_ SV* const invlist) { - PERL_ARGS_ASSERT_INVLIST_SET_ARRAY; + /* Returns the pointer to the inversion list's array. Every time the + * length changes, this needs to be called in case malloc or realloc moved + * it */ - /* Sets the array stored in the inversion list to the memory beginning with - * the parameter */ + PERL_ARGS_ASSERT_INVLIST_ARRAY; - if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) { - Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list", - INVLIST_ARRAY_KEY); - } + /* Must not be empty. If these fail, you probably didn't check for + * being non-zero before trying to get the array */ + assert(*get_invlist_len_addr(invlist)); + assert(*get_invlist_zero_addr(invlist) == 0 + || *get_invlist_zero_addr(invlist) == 1); + + /* The array begins either at the element reserved for zero if the + * list contains 0 (that element will be set to 0), or otherwise the next + * element (in which case the reserved element will be set to 1). */ + return (UV *) (get_invlist_zero_addr(invlist) + + *get_invlist_zero_addr(invlist)); } -PERL_STATIC_INLINE UV -S_invlist_len(pTHX_ HV* const invlist) +PERL_STATIC_INLINE UV* +S_get_invlist_len_addr(pTHX_ SV* invlist) { - /* Returns the current number of elements in the inversion list's array */ - - SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE); - - PERL_ARGS_ASSERT_INVLIST_LEN; + /* Return the address of the UV that contains the current number + * of used elements in the inversion list */ - if (len_ptr == NULL) { - Perl_croak(aTHX_ "panic: inversion list without a '%s' element", - INVLIST_LEN_KEY); - } + PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR; - return SvUV(*len_ptr); + return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV))); } PERL_STATIC_INLINE UV -S_invlist_max(pTHX_ HV* const invlist) +S_invlist_len(pTHX_ SV* const invlist) { - /* Returns the maximum number of elements storable in the inversion list's - * array, without having to realloc() */ - - SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE); + /* Returns the current number of elements stored in the inversion list's + * array */ - PERL_ARGS_ASSERT_INVLIST_MAX; - - if (max_ptr == NULL) { - Perl_croak(aTHX_ "panic: inversion list without a '%s' element", - INVLIST_MAX_KEY); - } + PERL_ARGS_ASSERT_INVLIST_LEN; - return SvUV(*max_ptr); + return *get_invlist_len_addr(invlist); } PERL_STATIC_INLINE void -S_invlist_set_len(pTHX_ HV* const invlist, const UV len) +S_invlist_set_len(pTHX_ SV* const invlist, const UV len) { /* Sets the current number of elements stored in the inversion list */ PERL_ARGS_ASSERT_INVLIST_SET_LEN; - if (len != 0 && len > invlist_max(invlist)) { - Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist)); - } - - if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) { - Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list", - INVLIST_LEN_KEY); - } + *get_invlist_len_addr(invlist) = len; + + assert(len <= SvLEN(invlist)); + + SvCUR_set(invlist, TO_INTERNAL_SIZE(len)); + /* If the list contains U+0000, that element is part of the header, + * and should not be counted as part of the array. It will contain + * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and + * subtract: + * SvCUR_set(invlist, + * TO_INTERNAL_SIZE(len + * - (*get_invlist_zero_addr(inv_list) ^ 1))); + * But, this is only valid if len is not 0. The consequences of not doing + * this is that the memory allocation code may think that 1 more UV is + * being used than actually is, and so might do an unnecessary grow. That + * seems worth not bothering to make this the precise amount. + * + * Note that when inverting, SvCUR shouldn't change */ } -PERL_STATIC_INLINE void -S_invlist_set_max(pTHX_ HV* const invlist, const UV max) +PERL_STATIC_INLINE UV +S_invlist_max(pTHX_ SV* const invlist) { + /* Returns the maximum number of elements storable in the inversion list's + * array, without having to realloc() */ - /* Sets the maximum number of elements storable in the inversion list - * without having to realloc() */ + PERL_ARGS_ASSERT_INVLIST_MAX; - PERL_ARGS_ASSERT_INVLIST_SET_MAX; + return FROM_INTERNAL_SIZE(SvLEN(invlist)); +} - if (max < invlist_len(invlist)) { - Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist)); - } +PERL_STATIC_INLINE UV* +S_get_invlist_zero_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that is reserved to hold 0 if the inversion + * list contains 0. This has to be the last element of the heading, as the + * list proper starts with either it if 0, or the next element if not. + * (But we force it to contain either 0 or 1) */ - if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) { - Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list", - INVLIST_LEN_KEY); - } + PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV))); } #ifndef PERL_IN_XSUB_RE -HV* +SV* Perl__new_invlist(pTHX_ IV initial_size) { @@ -5823,99 +6356,102 @@ Perl__new_invlist(pTHX_ IV initial_size) * space to store 'initial_size' elements. If that number is negative, a * system default is used instead */ - HV* invlist = newHV(); - UV* list; + SV* new_list; if (initial_size < 0) { initial_size = INVLIST_INITIAL_LEN; } /* Allocate the initial space */ - Newx(list, initial_size, UV); - invlist_set_array(invlist, list); + new_list = newSV(TO_INTERNAL_SIZE(initial_size)); + invlist_set_len(new_list, 0); - /* set_len has to come before set_max, as the latter inspects the len */ - invlist_set_len(invlist, 0); - invlist_set_max(invlist, initial_size); + /* Force iterinit() to be used to get iteration to work */ + *get_invlist_iter_addr(new_list) = UV_MAX; - return invlist; + /* This should force a segfault if a method doesn't initialize this + * properly */ + *get_invlist_zero_addr(new_list) = UV_MAX; + + *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID; +#if HEADER_LENGTH != 4 +# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length +#endif + + return new_list; } #endif -PERL_STATIC_INLINE void -S_invlist_destroy(pTHX_ HV* const invlist) +STATIC SV* +S__new_invlist_C_array(pTHX_ UV* list) { - /* Inversion list destructor */ + /* Return a pointer to a newly constructed inversion list, initialized to + * point to , which has to be in the exact correct inversion list + * form, including internal fields. Thus this is a dangerous routine that + * should not be used in the wrong hands */ - SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE); + SV* invlist = newSV_type(SVt_PV); - PERL_ARGS_ASSERT_INVLIST_DESTROY; + PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY; - if (list_ptr != NULL) { - UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */ - Safefree(list); + SvPV_set(invlist, (char *) list); + SvLEN_set(invlist, 0); /* Means we own the contents, and the system + shouldn't touch it */ + SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist))); + + if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) { + Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list"); } + + return invlist; } STATIC void -S_invlist_extend(pTHX_ HV* const invlist, const UV new_max) +S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) { - /* Change the maximum size of an inversion list (up or down) */ - - UV* orig_array; - UV* array; - const UV old_max = invlist_max(invlist); + /* Grow the maximum size of an inversion list */ PERL_ARGS_ASSERT_INVLIST_EXTEND; - if (old_max == new_max) { /* If a no-op */ - return; - } - - array = orig_array = invlist_array(invlist); - Renew(array, new_max, UV); - - /* If the size change moved the list in memory, set the new one */ - if (array != orig_array) { - invlist_set_array(invlist, array); - } - - invlist_set_max(invlist, new_max); - + SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max)); } PERL_STATIC_INLINE void -S_invlist_trim(pTHX_ HV* const invlist) +S_invlist_trim(pTHX_ SV* const invlist) { PERL_ARGS_ASSERT_INVLIST_TRIM; /* Change the length of the inversion list to how many entries it currently * has */ - invlist_extend(invlist, invlist_len(invlist)); + SvPV_shrink_to_cur((SV *) invlist); } /* An element is in an inversion list iff its index is even numbered: 0, 2, 4, * etc */ +#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1)) +#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i)) -#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1)) +#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output) #ifndef PERL_IN_XSUB_RE void -Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end) +Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end) { /* Subject to change or removal. Append the range from 'start' to 'end' at * the end of the inversion list. The range must be above any existing * ones. */ - UV* array = invlist_array(invlist); + UV* array; UV max = invlist_max(invlist); UV len = invlist_len(invlist); PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST; - if (len > 0) { - + if (len == 0) { /* Empty lists must be initialized */ + array = _invlist_array_init(invlist, start == 0); + } + else { /* Here, the existing list is non-empty. The current max entry in the * list is generally the first value not in the set, except when the * set extends to the end of permissible values, in which case it is @@ -5923,10 +6459,13 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV * append out-of-order */ UV final_element = len - 1; + array = invlist_array(invlist); if (array[final_element] > start - || ELEMENT_IN_INVLIST_SET(final_element)) + || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { - Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list"); + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + array[final_element], start, + ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } /* Here, it is a legal append. If the new range begins with the first @@ -5939,7 +6478,7 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV } else { /* But if the end is the maximum representable on the machine, - * just let the range that this would extend have no end */ + * just let the range that this would extend to have no end */ invlist_set_len(invlist, len - 1); } return; @@ -5954,10 +6493,13 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV * moved */ if (max < len) { invlist_extend(invlist, len); + invlist_set_len(invlist, len); /* Have to set len here to avoid assert + failure in invlist_array() */ array = invlist_array(invlist); } - - invlist_set_len(invlist, len); + else { + invlist_set_len(invlist, len); + } /* The next item on the list starts the range, the one after that is * one past the new range. */ @@ -5971,12 +6513,143 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV invlist_set_len(invlist, len - 1); } } -#endif -STATIC HV* -S_invlist_union(pTHX_ HV* const a, HV* const b) +STATIC IV +S_invlist_search(pTHX_ SV* const invlist, const UV cp) { - /* Return a new inversion list which is the union of two inversion lists. + /* Searches the inversion list for the entry that contains the input code + * point . If is not in the list, -1 is returned. Otherwise, the + * return value is the index into the list's array of the range that + * contains */ + + IV low = 0; + IV high = invlist_len(invlist); + const UV * const array = invlist_array(invlist); + + PERL_ARGS_ASSERT_INVLIST_SEARCH; + + /* If list is empty or the code point is before the first element, return + * failure. */ + if (high == 0 || cp < array[0]) { + return -1; + } + + /* Binary search. What we are looking for is such that + * array[i] <= cp < array[i+1] + * The loop below converges on the i+1. */ + while (low < high) { + IV mid = (low + high) / 2; + if (array[mid] <= cp) { + low = mid + 1; + + /* We could do this extra test to exit the loop early. + if (cp < array[low]) { + return mid; + } + */ + } + else { /* cp < array[mid] */ + high = mid; + } + } + + return high - 1; +} + +void +Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) +{ + /* populates a swatch of a swash the same way swatch_get() does in utf8.c, + * but is used when the swash has an inversion list. This makes this much + * faster, as it uses a binary search instead of a linear one. This is + * intimately tied to that function, and perhaps should be in utf8.c, + * except it is intimately tied to inversion lists as well. It assumes + * that is all 0's on input */ + + UV current = start; + const IV len = invlist_len(invlist); + IV i; + const UV * array; + + PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; + + if (len == 0) { /* Empty inversion list */ + return; + } + + array = invlist_array(invlist); + + /* Find which element it is */ + i = invlist_search(invlist, start); + + /* We populate from to */ + while (current < end) { + UV upper; + + /* The inversion list gives the results for every possible code point + * after the first one in the list. Only those ranges whose index is + * even are ones that the inversion list matches. For the odd ones, + * and if the initial code point is not in the list, we have to skip + * forward to the next element */ + if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { + i++; + if (i >= len) { /* Finished if beyond the end of the array */ + return; + } + current = array[i]; + if (current >= end) { /* Finished if beyond the end of what we + are populating */ + return; + } + } + assert(current >= start); + + /* The current range ends one below the next one, except don't go past + * */ + i++; + upper = (i < len && array[i] < end) ? array[i] : end; + + /* Here we are in a range that matches. Populate a bit in the 3-bit U8 + * for each code point in it */ + for (; current < upper; current++) { + const STRLEN offset = (STRLEN)(current - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + + /* Quit if at the end of the list */ + if (i >= len) { + + /* But first, have to deal with the highest possible code point on + * the platform. The previous code assumes that is one + * beyond where we want to populate, but that is impossible at the + * platform's infinity, so have to handle it specially */ + if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) + { + const STRLEN offset = (STRLEN)(end - start); + swatch[offset >> 3] |= 1 << (offset & 7); + } + return; + } + + /* Advance to the next range, which will be for code points not in the + * inversion list */ + current = array[i]; + } + + return; +} + + +void +Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) +{ + /* Take the union of two inversion lists and point to it. *output + * should be defined upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented. The first list, + * , may be NULL, in which case a copy of the second list is returned. + * If is TRUE, the union is taken of the complement + * (inversion) of instead of b itself. + * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your @@ -5987,14 +6660,15 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) * XXX A potential performance improvement is to keep track as we go along * if only one of the inputs contributes to the result, meaning the other * is a subset of that one. In that case, we can skip the final copy and - * return the larger of the input lists */ + * return the larger of the input lists, but then outside code might need + * to keep track of whether to free the input list or not */ - UV* array_a = invlist_array(a); /* a's array */ - UV* array_b = invlist_array(b); - UV len_a = invlist_len(a); /* length of a's array */ - UV len_b = invlist_len(b); + UV* array_a; /* a's array */ + UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; - HV* u; /* the resulting union */ + SV* u; /* the resulting union */ UV* array_u; UV len_u; @@ -6010,12 +6684,81 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) */ UV count = 0; - PERL_ARGS_ASSERT_INVLIST_UNION; + PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* If either one is empty, the union is the other one */ + if (a == NULL || ((len_a = invlist_len(a)) == 0)) { + if (*output == a) { + if (a != NULL) { + SvREFCNT_dec(a); + } + } + if (*output != b) { + *output = invlist_clone(b); + if (complement_b) { + _invlist_invert(*output); + } + } /* else *output already = b; */ + return; + } + else if ((len_b = invlist_len(b)) == 0) { + if (*output == b) { + SvREFCNT_dec(b); + } + + /* The complement of an empty list is a list that has everything in it, + * so the union with includes everything too */ + if (complement_b) { + if (a == *output) { + SvREFCNT_dec(a); + } + *output = _new_invlist(1); + _append_range_to_invlist(*output, 0, UV_MAX); + } + else if (*output != a) { + *output = invlist_clone(a); + } + /* else *output already = a; */ + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the union of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } /* Size the union for the worst case: that the sets are completely * disjoint */ u = _new_invlist(len_a + len_b); - array_u = invlist_array(u); + + /* Will contain U+0000 if either component does */ + array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); /* Go through each list item by item, stopping when exhausted one of * them */ @@ -6035,13 +6778,14 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) * be seamlessly merged. (In a tie and both are in the set or both not * in the set, it doesn't matter which we take first.) */ if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] && ELEMENT_IN_INVLIST_SET(i_a))) + || (array_a[i_a] == array_b[i_b] + && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { - cp_in_set = ELEMENT_IN_INVLIST_SET(i_a); + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); cp= array_a[i_a++]; } else { - cp_in_set = ELEMENT_IN_INVLIST_SET(i_b); + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); cp= array_b[i_b++]; } @@ -6065,9 +6809,9 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) /* Here, we are finished going through at least one of the lists, which * means there is something remaining in at most one. We check if the list * that hasn't been exhausted is positioned such that we are in the middle - * of a range in its set or not. (We are in the set if the next item in - * the array marks the beginning of something not in the set) If in the - * set, we decrement 'count'; if 0, there is potentially more to output. + * of a range in its set or not. (i_a and i_b point to the element beyond + * the one we care about.) If in the set, we decrement 'count'; if 0, there + * is potentially more to output. * There are four cases: * 1) Both weren't in their sets, count is 0, and remains 0. What's left * in the union is entirely from the non-exhausted set. @@ -6077,12 +6821,12 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) * that * 3) the exhausted was in its set, non-exhausted isn't, count is 1. * Nothing further should be output because the union includes - * everything from the exhausted set. Not decrementing insures that. + * everything from the exhausted set. Not decrementing ensures that. * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; * decrementing to 0 insures that we look at the remainder of the * non-exhausted set */ - if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a)) - || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b))) + if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count--; } @@ -6121,27 +6865,44 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) } } - return u; + /* We may be removing a reference to one of the inputs */ + if (a == *output || b == *output) { + SvREFCNT_dec(*output); + } + + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + + *output = u; + return; } -STATIC HV* -S_invlist_intersection(pTHX_ HV* const a, HV* const b) +void +Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i) { - /* Return the intersection of two inversion lists. The basis for this - * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published - * by Addison-Wesley, and explained at some length there. The preface says - * to incorporate its examples into your code at your own risk. + /* Take the intersection of two inversion lists and point to it. *i + * should be defined upon input, and if it points to one of the two lists, + * the reference count to that list will be decremented. + * If is TRUE, the result will be the intersection of + * and the complement (or inversion) of instead of directly. + * + * The basis for this comes from "Unicode Demystified" Chapter 13 by + * Richard Gillam, published by Addison-Wesley, and explained at some + * length there. The preface says to incorporate its examples into your + * code at your own risk. In fact, it had bugs * * The algorithm is like a merge sort, and is essentially the same as the * union above */ - UV* array_a = invlist_array(a); /* a's array */ - UV* array_b = invlist_array(b); - UV len_a = invlist_len(a); /* length of a's array */ - UV len_b = invlist_len(b); + UV* array_a; /* a's array */ + UV* array_b; + UV len_a; /* length of a's array */ + UV len_b; - HV* r; /* the resulting intersection */ + SV* r; /* the resulting intersection */ UV* array_r; UV len_r; @@ -6157,12 +6918,78 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) */ UV count = 0; - PERL_ARGS_ASSERT_INVLIST_INTERSECTION; + PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; + assert(a != b); + + /* Special case if either one is empty */ + len_a = invlist_len(a); + if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) { + + if (len_a != 0 && complement_b) { + + /* Here, 'a' is not empty, therefore from the above 'if', 'b' must + * be empty. Here, also we are using 'b's complement, which hence + * must be every possible code point. Thus the intersection is + * simply 'a'. */ + if (*i != a) { + *i = invlist_clone(a); + + if (*i == b) { + SvREFCNT_dec(b); + } + } + /* else *i is already 'a' */ + return; + } + + /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The + * intersection must be empty */ + if (*i == a) { + SvREFCNT_dec(a); + } + else if (*i == b) { + SvREFCNT_dec(b); + } + *i = _new_invlist(0); + return; + } + + /* Here both lists exist and are non-empty */ + array_a = invlist_array(a); + array_b = invlist_array(b); + + /* If are to take the intersection of 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* To complement, we invert: if the first element is 0, remove it. To + * do this, we just pretend the array starts one later, and clear the + * flag as we don't have to do anything else later */ + if (array_b[0] == 0) { + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } /* Size the intersection for the worst case: that the intersection ends up * fragmenting everything to be completely disjoint */ r= _new_invlist(len_a + len_b); - array_r = invlist_array(r); + + /* Will contain U+0000 iff both components do */ + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); /* Go through each list item by item, stopping when exhausted one of * them */ @@ -6171,25 +6998,26 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) array */ bool cp_in_set; /* Is it in the input list's set or not */ - /* We need to take one or the other of the two inputs for the union. - * Since we are merging two sorted lists, we take the smaller of the - * next items. In case of a tie, we take the one that is not in its - * set first (a difference from the union algorithm). If we took one - * in the set first, it would increment the count, possibly to 2 which - * would cause it to be output as starting a range in the intersection, - * and the next time through we would take that same number, and output - * it again as ending the set. By doing it the opposite of this, we - * there is no possibility that the count will be momentarily - * incremented to 2. (In a tie and both are in the set or both not in - * the set, it doesn't matter which we take first.) */ + /* We need to take one or the other of the two inputs for the + * intersection. Since we are merging two sorted lists, we take the + * smaller of the next items. In case of a tie, we take the one that + * is not in its set first (a difference from the union algorithm). If + * we took one in the set first, it would increment the count, possibly + * to 2 which would cause it to be output as starting a range in the + * intersection, and the next time through we would take that same + * number, and output it again as ending the set. By doing it the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a))) + || (array_a[i_a] == array_b[i_b] + && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { - cp_in_set = ELEMENT_IN_INVLIST_SET(i_a); + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); cp= array_a[i_a++]; } else { - cp_in_set = ELEMENT_IN_INVLIST_SET(i_b); + cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); cp= array_b[i_b++]; } @@ -6210,19 +7038,32 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) } } - /* Here, we are finished going through at least one of the sets, which - * means there is something remaining in at most one. See the comments in - * the union code */ - if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a)) - || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b))) + /* Here, we are finished going through at least one of the lists, which + * means there is something remaining in at most one. We check if the list + * that has been exhausted is positioned such that we are in the middle + * of a range in its set or not. (i_a and i_b point to elements 1 beyond + * the ones we care about.) There are four cases: + * 1) Both weren't in their sets, count is 0, and remains 0. There's + * nothing left in the intersection. + * 2) Both were in their sets, count is 2 and perhaps is incremented to + * above 2. What should be output is exactly that which is in the + * non-exhausted set, as everything it has is also in the intersection + * set, and everything it doesn't have can't be in the intersection + * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and + * gets incremented to 2. Like the previous case, the intersection is + * everything that remains in the non-exhausted set. + * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and + * remains 1. And the intersection has nothing more. */ + if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { - count--; + count++; } /* The final length is what we've output so far plus what else is in the - * intersection. Only one of the subexpressions below will be non-zero */ + * intersection. At most one of the subexpressions below will be non-zero */ len_r = i_r; - if (count == 2) { + if (count >= 2) { len_r += (len_a - i_a) + (len_b - i_b); } @@ -6235,7 +7076,7 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) } /* Finish outputting any remaining */ - if (count == 2) { /* Only one of will have a non-zero copy count */ + if (count >= 2) { /* At most one will have a non-zero copy count */ IV copy_count; if ((copy_count = len_a - i_a) > 0) { Copy(array_a + i_a, array_r + i_r, copy_count, UV); @@ -6245,11 +7086,24 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) } } - return r; + /* We may be removing a reference to one of the inputs */ + if (a == *i || b == *i) { + SvREFCNT_dec(*i); + } + + /* If we've changed b, restore it */ + if (complement_b) { + array_b[0] = 1; + } + + *i = r; + return; } -STATIC HV* -S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end) +#endif + +STATIC SV* +S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) { /* Add the range from 'start' to 'end' inclusive to the inversion list's * set. A pointer to the inversion list is returned. This may actually be @@ -6257,8 +7111,7 @@ S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end) * passed in inversion list can be NULL, in which case a new one is created * with just the one range in it */ - HV* range_invlist; - HV* added_invlist; + SV* range_invlist; UV len; if (invlist == NULL) { @@ -6283,16 +7136,234 @@ S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end) range_invlist = _new_invlist(2); _append_range_to_invlist(range_invlist, start, end); - added_invlist = invlist_union(invlist, range_invlist); + _invlist_union(invlist, range_invlist, &invlist); + + /* The temporary can be freed */ + SvREFCNT_dec(range_invlist); + + return invlist; +} + +PERL_STATIC_INLINE SV* +S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { + return add_range_to_invlist(invlist, cp, cp); +} + +#ifndef PERL_IN_XSUB_RE +void +Perl__invlist_invert(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list. This adds a 0 if the list didn't + * have a zero; removes it otherwise. As described above, the data + * structure is set up so that this is very efficient */ + + UV* len_pos = get_invlist_len_addr(invlist); + + PERL_ARGS_ASSERT__INVLIST_INVERT; + + /* The inverse of matching nothing is matching everything */ + if (*len_pos == 0) { + _append_range_to_invlist(invlist, 0, UV_MAX); + return; + } + + /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the + * zero element was a 0, so it is being removed, so the length decrements + * by 1; and vice-versa. SvCUR is unaffected */ + if (*get_invlist_zero_addr(invlist) ^= 1) { + (*len_pos)--; + } + else { + (*len_pos)++; + } +} + +void +Perl__invlist_invert_prop(pTHX_ SV* const invlist) +{ + /* Complement the input inversion list (which must be a Unicode property, + * all of which don't match above the Unicode maximum code point.) And + * Perl has chosen to not have the inversion match above that either. This + * adds a 0x110000 if the list didn't end with it, and removes it if it did + */ + + UV len; + UV* array; + + PERL_ARGS_ASSERT__INVLIST_INVERT_PROP; + + _invlist_invert(invlist); + + len = invlist_len(invlist); + + if (len != 0) { /* If empty do nothing */ + array = invlist_array(invlist); + if (array[len - 1] != PERL_UNICODE_MAX + 1) { + /* Add 0x110000. First, grow if necessary */ + len++; + if (invlist_max(invlist) < len) { + invlist_extend(invlist, len); + array = invlist_array(invlist); + } + invlist_set_len(invlist, len); + array[len - 1] = PERL_UNICODE_MAX + 1; + } + else { /* Remove the 0x110000 */ + invlist_set_len(invlist, len - 1); + } + } + + return; +} +#endif + +PERL_STATIC_INLINE SV* +S_invlist_clone(pTHX_ SV* const invlist) +{ + + /* Return a new inversion list that is a copy of the input one, which is + * unchanged */ + + /* Need to allocate extra space to accommodate Perl's addition of a + * trailing NUL to SvPV's, since it thinks they are always strings */ + SV* new_invlist = _new_invlist(invlist_len(invlist) + 1); + STRLEN length = SvCUR(invlist); + + PERL_ARGS_ASSERT_INVLIST_CLONE; + + SvCUR_set(new_invlist, length); /* This isn't done automatically */ + Copy(SvPVX(invlist), SvPVX(new_invlist), length, char); + + return new_invlist; +} + +PERL_STATIC_INLINE UV* +S_get_invlist_iter_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the current iteration + * position */ + + PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR; + + return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE UV* +S_get_invlist_version_id_addr(pTHX_ SV* invlist) +{ + /* Return the address of the UV that contains the version id. */ + + PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR; - /* The passed in list can be freed, as well as our temporary */ - invlist_destroy(range_invlist); - if (invlist != added_invlist) { - invlist_destroy(invlist); + return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV))); +} + +PERL_STATIC_INLINE void +S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */ +{ + PERL_ARGS_ASSERT_INVLIST_ITERINIT; + + *get_invlist_iter_addr(invlist) = 0; +} + +STATIC bool +S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) +{ + /* An C call on must be used to set this up. + * This call sets in <*start> and <*end>, the next range in . + * Returns if successful and the next call will return the next + * range; if was already at the end of the list. If the latter, + * <*start> and <*end> are unchanged, and the next call to this function + * will start over at the beginning of the list */ + + UV* pos = get_invlist_iter_addr(invlist); + UV len = invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_ITERNEXT; + + if (*pos >= len) { + *pos = UV_MAX; /* Force iternit() to be required next time */ + return FALSE; } - return added_invlist; + array = invlist_array(invlist); + + *start = array[(*pos)++]; + + if (*pos >= len) { + *end = UV_MAX; + } + else { + *end = array[(*pos)++] - 1; + } + + return TRUE; +} + +#ifndef PERL_IN_XSUB_RE +SV * +Perl__invlist_contents(pTHX_ SV* const invlist) +{ + /* Get the contents of an inversion list into a string SV so that they can + * be printed out. It uses the format traditionally done for debug tracing + */ + + UV start, end; + SV* output = newSVpvs("\n"); + + PERL_ARGS_ASSERT__INVLIST_CONTENTS; + + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + } + else if (end != start) { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", + start, end); + } + else { + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + } + } + + return output; } +#endif + +#if 0 +void +S_invlist_dump(pTHX_ SV* const invlist, const char * const header) +{ + /* Dumps out the ranges in an inversion list. The string 'header' + * if present is output on a line before the first range */ + + UV start, end; + + if (header && strlen(header)) { + PerlIO_printf(Perl_debug_log, "%s\n", header); + } + invlist_iterinit(invlist); + while (invlist_iternext(invlist, &start, &end)) { + if (end == UV_MAX) { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start); + } + else { + PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end); + } + } +} +#endif + +#undef HEADER_LENGTH +#undef INVLIST_INITIAL_LENGTH +#undef TO_INTERNAL_SIZE +#undef FROM_INTERNAL_SIZE +#undef INVLIST_LEN_OFFSET +#undef INVLIST_ZERO_OFFSET +#undef INVLIST_ITER_OFFSET +#undef INVLIST_VERSION_ID /* End of inversion list object */ @@ -6583,6 +7654,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SvIV_set(sv_dat, 1); } #ifdef DEBUGGING + /* Yes this does cause a memory leak in debugging Perls */ if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname))) SvREFCNT_dec(svname); #endif @@ -6674,7 +7746,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; if (*RExC_parse!=')') vFAIL("Expecting close bracket"); - + gen_recurse_regop: if ( paren == '-' ) { /* @@ -6751,7 +7823,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; } if (*RExC_parse != ')') { - RExC_parse = s; + RExC_parse = s; vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); } if (!SIZE_ONLY) { @@ -6809,7 +7881,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) || RExC_parse[1] == '<' || RExC_parse[1] == '{') { /* Lookahead or eval. */ I32 flag; - + ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; @@ -6937,8 +8009,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) { U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; - bool has_charset_modifier = 0; - regex_charset cs = REGEX_DEPENDS_CHARSET; + char has_charset_modifier = '\0'; + regex_charset cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -6947,39 +8021,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); case LOCALE_PAT_MOD: - if (has_charset_modifier || flagsp == &negflags) { - goto fail_modifiers; + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; } cs = REGEX_LOCALE_CHARSET; - has_charset_modifier = 1; + has_charset_modifier = LOCALE_PAT_MOD; + RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: - if (has_charset_modifier || flagsp == &negflags) { - goto fail_modifiers; + if (has_charset_modifier) { + goto excess_modifier; + } + else if (flagsp == &negflags) { + goto neg_modifier; } cs = REGEX_UNICODE_CHARSET; - has_charset_modifier = 1; + has_charset_modifier = UNICODE_PAT_MOD; break; case ASCII_RESTRICT_PAT_MOD: - if (has_charset_modifier || flagsp == &negflags) { - goto fail_modifiers; + if (flagsp == &negflags) { + goto neg_modifier; } - if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) { + if (has_charset_modifier) { + if (cs != REGEX_ASCII_RESTRICTED_CHARSET) { + goto excess_modifier; + } /* Doubled modifier implies more restricted */ - cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; - RExC_parse++; - } + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + } else { cs = REGEX_ASCII_RESTRICTED_CHARSET; } - has_charset_modifier = 1; + has_charset_modifier = ASCII_RESTRICT_PAT_MOD; break; case DEPENDS_PAT_MOD: - if (has_use_defaults - || has_charset_modifier - || flagsp == &negflags) - { + if (has_use_defaults) { goto fail_modifiers; + } + else if (flagsp == &negflags) { + goto neg_modifier; + } + else if (has_charset_modifier) { + goto excess_modifier; } /* The dual charset means unicode semantics if the @@ -6989,8 +8075,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) cs = (RExC_utf8 || RExC_uni_semantics) ? REGEX_UNICODE_CHARSET : REGEX_DEPENDS_CHARSET; - has_charset_modifier = 1; + has_charset_modifier = DEPENDS_PAT_MOD; break; + excess_modifier: + RExC_parse++; + if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) { + vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD); + } + else if (has_charset_modifier == *(RExC_parse - 1)) { + vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1)); + } + else { + vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1)); + } + /*NOTREACHED*/ + neg_modifier: + RExC_parse++; + vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1)); + /*NOTREACHED*/ case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { @@ -7281,7 +8383,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) Set_Node_Length(ret, 1); } } - + if (!first && SIZE_ONLY) RExC_extralen += 1; /* BRANCHJ */ @@ -7341,7 +8443,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) const char * const origparse = RExC_parse; I32 min; I32 max = REG_INFTY; +#ifdef RE_TRACK_PATTERN_OFFSETS char *parse_start; +#endif const char *maxpos = NULL; GET_RE_DEBUG_FLAGS_DECL; @@ -7360,7 +8464,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (op == '{' && regcurly(RExC_parse)) { maxpos = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; /* MJD */ +#endif next = RExC_parse + 1; while (isDIGIT(*next) || *next == ',') { if (*next == ',') { @@ -7456,7 +8562,9 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL("Regexp *+ operand could be empty"); #endif +#ifdef RE_TRACK_PATTERN_OFFSETS parse_start = RExC_parse; +#endif nextchar(pRExC_state); *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); @@ -7520,7 +8628,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } -/* reg_namedseq(pRExC_state,UVp) +/* reg_namedseq(pRExC_state,UVp, UV depth) This is expected to be called by a parser routine that has recognized '\N' and needs to handle the rest. RExC_parse is @@ -7563,13 +8671,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) Parsing failures will generate a fatal error via vFAIL(...) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) { char * endbrace; /* '}' following the name */ regnode *ret = NULL; -#ifdef DEBUGGING - char* parse_start = RExC_parse - 2; /* points to the '\N' */ -#endif char* p; GET_RE_DEBUG_FLAGS_DECL; @@ -7682,168 +8787,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ } else { /* Not a char class */ - char *s; /* String to put in generated EXACT node */ - STRLEN len = 0; /* Its current byte length */ + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ - ret = reg_node(pRExC_state, - (U8) ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF)); - s= STRING(ret); - - /* Exact nodes can hold only a U8 length's of text = 255. Loop through - * the input which is of the form now 'c1.c2.c3...}' until find the - * ending brace or exceed length 255. The characters that exceed this - * limit are dropped. The limit could be relaxed should it become - * desirable by reparsing this as (?:\N{NAME}), so could generate - * multiple EXACT nodes, as is done for just regular input. But this - * is primarily a named character, and not intended to be a huge long - * string, so 255 bytes should be good enough */ - while (1) { - STRLEN length_of_hex; - I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - UV cp; /* Ord of current character */ - bool use_this_char_fold = FOLD; + char *orig_end = RExC_end; + + while (RExC_parse < endbrace) { /* Code points are separated by dots. If none, there is only one * code point, and is terminated by the brace */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); - /* The values are Unicode even on EBCDIC machines */ - length_of_hex = (STRLEN)(endchar - RExC_parse); - cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL); - if ( length_of_hex == 0 - || length_of_hex != (STRLEN)(endchar - RExC_parse) ) - { - RExC_parse += length_of_hex; /* Includes all the valid */ - RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) RExC_parse = endchar; - vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } - - /* XXX ? Change to ANYOF node - if (FOLD - && (cp > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)) - && is_TRICKYFOLD_cp(cp)) - { - } - */ - - /* Under /aa, we can't mix ASCII with non- in a fold. If we are - * folding, and the source isn't ASCII, look through all the - * characters it folds to. If any one of them is ASCII, forbid - * this fold. (cp is uni, so the 127 below is correct even for - * EBCDIC). Similarly under locale rules, we don't mix under 256 - * with above 255. XXX It really doesn't make sense to have \N{} - * which means a Unicode rules under locale. I (khw) think this - * should be warned about, but the counter argument is that people - * who have programmed around Perl's earlier lack of specifying the - * rules and used \N{} to force Unicode things in a local - * environment shouldn't get suddenly a warning */ - if (use_this_char_fold) { - if (LOC && cp < 256) { /* Fold not known until run-time */ - use_this_char_fold = FALSE; - } - else if ((cp > 127 && MORE_ASCII_RESTRICTED) - || (cp > 255 && LOC)) - { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - U8* s = tmpbuf; - U8* e; - STRLEN foldlen; - - (void) toFOLD_uni(cp, tmpbuf, &foldlen); - e = s + foldlen; - - while (s < e) { - if (isASCII(*s) - || (LOC && (UTF8_IS_INVARIANT(*s) - || UTF8_IS_DOWNGRADEABLE_START(*s)))) - { - use_this_char_fold = FALSE; - break; - } - s += UTF8SKIP(s); - } - } - } - - if (! use_this_char_fold) { /* Not folding, just append to the - string */ - STRLEN unilen; - - /* Quit before adding this character if would exceed limit */ - if (len + UNISKIP(cp) > U8_MAX) break; - - unilen = reguni(pRExC_state, cp, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } else { /* Folding, output the folded equivalent */ - STRLEN foldlen,numlen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - cp = toFOLD_uni(cp, tmpbuf, &foldlen); - - /* Quit before exceeding size limit */ - if (len + foldlen > U8_MAX) break; - - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) - { - cp = utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, cp, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; - - /* Quit if no more characters */ - if (RExC_parse >= endbrace) break; } + sv_catpv(substitute_parse, ")"); + RExC_parse = SvPV(substitute_parse, len); - if (SIZE_ONLY) { - if (RExC_parse < endbrace) { - ckWARNreg(RExC_parse - 1, - "Using just the first characters returned by \\N{}"); - } - - RExC_size += STR_SZ(len); - } else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); } + RExC_end = RExC_parse + len; - RExC_parse = endbrace + 1; + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + ret = reg(pRExC_state, 1, flagp, depth+1); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; - *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail - with malformed in t/re/pat_advanced.t */ - RExC_parse --; - Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); } @@ -8006,27 +8998,6 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): -#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T) -#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below. - case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T): -#endif - do_foldchar: - if (!LOC && FOLD) { - U32 len,cp; - len=0; /* silence a spurious compiler warning */ - if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { - *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ - RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ - ret = reganode(pRExC_state, FOLDCHAR, cp); - Set_Node_Length(ret, 1); /* MJD */ - nextchar(pRExC_state); /* kill whitespace under /x */ - return ret; - } - } - goto outer_default; case '\\': /* Special Escapes @@ -8041,10 +9012,6 @@ tryagain: literal text handling code. */ switch ((U8)*++RExC_parse) { - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - goto do_foldchar; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -8286,7 +9253,7 @@ tryagain: break; case 'p': case 'P': - { + { char* const oldregxend = RExC_end; #ifdef DEBUGGING char* parse_start = RExC_parse - 2; @@ -8327,7 +9294,7 @@ tryagain: Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL, flagp); + ret= reg_namedseq(pRExC_state, NULL, flagp, depth); break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -8465,49 +9432,63 @@ tryagain: /* FALL THROUGH */ default: - outer_default:{ + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { register STRLEN len; register UV ender; register char *p; char *s; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - regnode * orig_emit; + U8 node_type; - parse_start = RExC_parse - 1; - - RExC_parse++; + /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, + * it is folded to 'ss' even if not utf8 */ + bool is_exactfu_sharp_s; - defchar: ender = 0; - orig_emit = RExC_emit; /* Save the original output node position in - case we need to output a different node - type */ - ret = reg_node(pRExC_state, - (U8) ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF) - ); + node_type = ((! FOLD) ? EXACT + : (LOC) + ? EXACTFL + : (MORE_ASCII_RESTRICTED) + ? EXACTFA + : (AT_LEAST_UNI_SEMANTICS) + ? EXACTFU + : EXACTF); + ret = reg_node(pRExC_state, node_type); s = STRING(ret); + + /* XXX The node can hold up to 255 bytes, yet this only goes to + * 127. I (khw) do not know why. Keeping it somewhat less than + * 255 allows us to not have to worry about overflow due to + * converting to utf8 and fold expansion, but that value is + * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes + * split up by this limit into a single one using the real max of + * 255. Even at 127, this breaks under rare circumstances. If + * folding, we do not want to split a node at a character that is a + * non-final in a multi-char fold, as an input string could just + * happen to want to match across the node boundary. The join + * would solve that problem if the join actually happens. But a + * series of more than two nodes in a row each of 127 would cause + * the first join to succeed to get to 254, but then there wouldn't + * be room for the next one, which could at be one of those split + * multi-char folds. I don't know of any fool-proof solution. One + * could back off to end with only a code point that isn't such a + * non-final, but it is possible for there not to be any in the + * entire node. */ for (len = 0, p = RExC_parse - 1; - len < 127 && p < RExC_end; - len++) + len < 127 && p < RExC_end; + len++) { char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); switch ((U8)*p) { - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) - goto normal_default; case '^': case '$': case '.': @@ -8532,11 +9513,6 @@ tryagain: switch ((U8)*++p) { /* These are all the special escapes. */ - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) - goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ @@ -8614,7 +9590,7 @@ tryagain: case 'x': if (*++p == '{') { char* const e = strchr(p, '}'); - + if (!e) { RExC_parse = p + 1; vFAIL("Missing right brace on \\x{}"); @@ -8663,7 +9639,7 @@ tryagain: goto recode_encoding; break; recode_encoding: - { + if (! RExC_override_recoding) { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); if (!enc && SIZE_ONLY) @@ -8699,73 +9675,16 @@ tryagain: break; } /* End of switch on the literal */ - /* Certain characters are problematic because their folded - * length is so different from their original length that it - * isn't handleable by the optimizer. They are therefore not - * placed in an EXACTish node; and are here handled specially. - * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, - * putting it in a special node keeps regexec from having to - * deal with a non-utf8 multi-char fold */ - if (FOLD - && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)) - && is_TRICKYFOLD_cp(ender)) - { - /* If is in middle of outputting characters into an - * EXACTish node, go output what we have so far, and - * position the parse so that this will be called again - * immediately */ - if (len) { - p = RExC_parse + len - 1; - goto loopdone; - } - else { - - /* Here we are ready to output our tricky fold - * character. What's done is to pretend it's in a - * [bracketed] class, and let the code that deals with - * those handle it, as that code has all the - * intelligence necessary. First save the current - * parse state, get rid of the already allocated EXACT - * node that the ANYOFV node will replace, and point - * the parse to a buffer which we fill with the - * character we want the regclass code to think is - * being parsed */ - char* const oldregxend = RExC_end; - char tmpbuf[2]; - RExC_emit = orig_emit; - RExC_parse = tmpbuf; - if (UTF) { - tmpbuf[0] = UTF8_TWO_BYTE_HI(ender); - tmpbuf[1] = UTF8_TWO_BYTE_LO(ender); - RExC_end = RExC_parse + 2; - } - else { - tmpbuf[0] = (char) ender; - RExC_end = RExC_parse + 1; - } - - ret = regclass(pRExC_state,depth+1); - - /* Here, have parsed the buffer. Reset the parse to - * the actual input, and return */ - RExC_end = oldregxend; - RExC_parse = p - 1; - - Set_Node_Offset(ret, RExC_parse); - Set_Node_Cur_Length(ret); - nextchar(pRExC_state); - *flagp |= HASWIDTH|SIMPLE; - return ret; - } - } - + is_exactfu_sharp_s = (node_type == EXACTFU + && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - if (UTF && FOLD) { + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, * so for them, just output the original character using - * utf8 */ + * utf8. If we start to fold non-UTF patterns, be sure to + * update join_exact() */ if (LOC && ender < 256) { if (UNI_IS_INVARIANT(ender)) { *tmpbuf = (U8) ender; @@ -8824,7 +9743,7 @@ tryagain: if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; - else if (UTF) { + else if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -8860,7 +9779,7 @@ tryagain: } break; } - if (UTF) { + if (UTF || is_exactfu_sharp_s) { if (FOLD) { /* Emit all the Unicode characters. */ STRLEN numlen; @@ -8891,8 +9810,9 @@ tryagain: } len--; } - else + else { REGC((char)ender, s++); + } } loopdone: /* Jumped to when encounters something that shouldn't be in the node */ @@ -8909,7 +9829,7 @@ tryagain: *flagp |= HASWIDTH; if (len == 1 && UNI_IS_INVARIANT(ender)) *flagp |= SIMPLE; - + if (SIZE_ONLY) RExC_size += STR_SZ(len); else { @@ -8978,7 +9898,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) POSIXCC(UCHARAT(RExC_parse))) { const char c = UCHARAT(RExC_parse); char* const s = RExC_parse++; - + while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c) RExC_parse++; if (RExC_parse == RExC_end) @@ -9113,109 +10033,150 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } -/* No locale test, and always Unicode semantics */ -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ -ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \ - yesno = '!'; \ - what = WORD; \ - break - -/* Like the above, but there are differences if we are in uni-8-bit or not, so - * there are two tests passed in, to use depending on that. There aren't any - * cases where the label is different from the name, so no need for that - * parameter */ -#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \ -ANYOF_##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - set_regclass_bit(pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &nonbitmap); \ - } \ - } \ - yesno = '+'; \ - what = WORD; \ - break; \ -case ANYOF_N##NAME: \ - if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else if (UNI_SEMANTICS) { \ - for (value = 0; value < 256; value++) { \ - if (! TEST_8(value)) stored += \ - set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \ - } \ - } \ - else { \ - for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ - } \ - if (AT_LEAST_ASCII_RESTRICTED) { \ - for (value = 128; value < 256; value++) { \ - stored += set_regclass_bit( \ - pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ - } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ - } \ - else { \ - /* For a non-ut8 target string with DEPENDS semantics, all above \ - * ASCII Latin1 code points match the complement of any of the \ - * classes. But in utf8, they have their Unicode semantics, so \ - * can't just set them in the bitmap, or else regexec.c will think \ - * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ - } \ - } \ - yesno = '!'; \ - what = WORD; \ - break - -/* - We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test - so that it is possible to override the option here without having to - rebuild the entire core. as we are required to do if we change regcomp.h - which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. -*/ -#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS -#define BROKEN_UNICODE_CHARCLASS_MAPPINGS -#endif - -#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS -#define POSIX_CC_UNI_NAME(CCNAME) CCNAME -#else -#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME -#endif +/* Generate the code to add a full posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * Xsourcelist is the full Unicode range list to use otherwise. */ +#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + \ + /* Set this class in the node for runtime matching */ \ + ANYOF_CLASS_SET(node, class); \ + \ + /* For above Latin1 code points, we use the full Unicode range */ \ + _invlist_intersection(PL_AboveLatin1, \ + Xsourcelist, \ + &scratch_list); \ + /* And set the output to it, adding instead if there already is an \ + * output. Checking if is NULL first saves an extra \ + * clone. Its reference count will be decremented at the next \ + * union, etc, or if this is the only instance, at the end of the \ + * routine */ \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + /* For non-locale, just add it to any existing list */ \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX, but matches the complement of and . + */ +#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \ + if (LOC) { \ + SV* scratch_list = NULL; \ + ANYOF_CLASS_SET(node, class); \ + _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + } \ + else { \ + _invlist_union_complement_2nd(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : Xsourcelist, \ + &destlist); \ + /* Under /d, everything in the upper half of the Latin1 range \ + * matches this complement */ \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } + +/* Generate the code to add a posix character to the bracketed + * character class given by . ( is needed only under locale rules) + * destlist is the inversion list for non-locale rules that this class is + * to be added to + * sourcelist is the ASCII-range inversion list to add under /a rules + * l1_sourcelist is the Latin1 range list to use otherwise. + * Xpropertyname is the name to add to of the property to + * specify the code points above Latin1 that will have to be + * determined at run-time + * run_time_list is a SV* that contains text names of properties that are to + * be computed at run time. This concatenates + * to it, apppropriately + * This is essentially DO_POSIX, but we know only the Latin1 values at compile + * time */ +#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + /* If not /a matching, there are going to be code points we will have \ + * to defer to runtime to look-up */ \ + if (! AT_LEAST_ASCII_RESTRICTED) { \ + Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \ + } \ + if (LOC) { \ + ANYOF_CLASS_SET(node, class); \ + } \ + else { \ + _invlist_union(destlist, \ + (AT_LEAST_ASCII_RESTRICTED) \ + ? sourcelist \ + : l1_sourcelist, \ + &destlist); \ + } + +/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of + * this and DO_N_POSIX */ +#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ + l1_sourcelist, Xpropertyname, run_time_list) \ + if (AT_LEAST_ASCII_RESTRICTED) { \ + _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ + } \ + else { \ + Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + if (LOC) { \ + ANYOF_CLASS_SET(node, namedclass); \ + } \ + else { \ + SV* scratch_list = NULL; \ + _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \ + if (! destlist) { \ + destlist = scratch_list; \ + } \ + else { \ + _invlist_union(destlist, scratch_list, &destlist); \ + SvREFCNT_dec(scratch_list); \ + } \ + if (DEPENDS_SEMANTICS) { \ + ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \ + } \ + } \ + } STATIC U8 -S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) +S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) { /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. * Locale folding is done at run-time, so this function should not be * called for nodes that are for locales. * - * This function simply sets the bit corresponding to the fold of the input + * This function sets the bit corresponding to the fold of the input * 'value', if not already set. The fold of 'f' is 'F', and the fold of * 'F' is 'f'. * - * It also sets any necessary flags, and returns the number of bits that - * actually changed from 0 to 1 */ + * It also knows about the characters that are in the bitmap that have + * folds that are matchable only outside it, and sets the appropriate lists + * and flags. + * + * It returns the number of bits that actually changed from 0 to 1 */ U8 stored = 0; U8 fold; @@ -9230,17 +10191,109 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 ANYOF_BITMAP_SET(node, fold); stored++; } - if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) - || (! UNI_SEMANTICS + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) { + /* Certain Latin1 characters have matches outside the bitmap. To get + * here, 'value' is one of those characters. None of these matches is + * valid for ASCII characters under /aa, which have been excluded by + * the 'if' above. The matches fall into three categories: + * 1) They are singly folded-to or -from an above 255 character, as + * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y + * WITH DIAERESIS; + * 2) They are part of a multi-char fold with another character in the + * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill; + * 3) They are part of a multi-char fold with a character not in the + * bitmap, such as various ligatures. + * We aren't dealing fully with multi-char folds, except we do deal + * with the pattern containing a character that has a multi-char fold + * (not so much the inverse). + * For types 1) and 3), the matches only happen when the target string + * is utf8; that's not true for 2), and we set a flag for it. + * + * The code below adds to the passed in inversion list the single fold + * closures for 'value'. The values are hard-coded here so that an + * innocent-looking character class, like /[ks]/i won't have to go out + * to disk to find the possible matches. XXX It would be better to + * generate these via regen, in case a new version of the Unicode + * standard adds new mappings, though that is not really likely. */ + switch (value) { + case 'k': + case 'K': + /* KELVIN SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A); + break; + case 's': + case 'S': + /* LATIN SMALL LETTER LONG S */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F); + break; + case MICRO_SIGN: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_SMALL_LETTER_MU); + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_CAPITAL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + /* ANGSTROM SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B); + if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + PL_fold_latin1[value]); + } + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_SHARP_S); + + /* Under /a, /d, and /u, this can match the two chars "ss" */ + if (! MORE_ASCII_RESTRICTED) { + add_alternate(alternate_ptr, (U8 *) "ss", 2); + + /* And under /u or /a, it can match even if the target is + * not utf8 */ + if (AT_LEAST_UNI_SEMANTICS) { + ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; + } + } + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code + * points that require UTF8 to express, so they can't match + * unless the target string is in UTF-8, so no action here is + * necessary, as regexec.c properly handles the general case + * for UTF-8 matching */ + break; + default: + /* Use deprecated warning to increase the chances of this + * being output */ + ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value); + break; + } + } + else if (DEPENDS_SEMANTICS && ! isASCII(value) - && PL_fold_latin1[value] != value)) - { /* A character that has a fold outside of Latin1 matches outside the - bitmap, but only when the target string is utf8. Similarly when we - don't have unicode semantics for the above ASCII Latin-1 characters, - and they have a fold, they should match if the target is utf8, and - not otherwise. We add the character here, and calculate the fold - later, with the other nonbitmap folds */ - *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value); + && PL_fold_latin1[value] != value) + { + /* Under DEPENDS rules, non-ASCII Latin1 characters match their + * folds only when the target string is in UTF-8. We add the fold + * here to the list of things to match outside the bitmap, which + * won't be looked at unless it is UTF8 (or else if something else + * says to look even if not utf8, but those things better not happen + * under DEPENDS semantics. */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]); } return stored; @@ -9248,7 +10301,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) +S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) { /* This inline function sets a bit in the bitmap if not already set, and if * appropriate, its fold, returning the number of bits that actually @@ -9266,12 +10319,30 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu stored = 1; if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ - stored += set_regclass_bit_fold(pRExC_state, node, value, nonbitmap_ptr); + stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr); } return stored; } +STATIC void +S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) +{ + /* Adds input 'string' with length 'len' to the ANYOF node's unicode + * alternate list, pointed to by 'alternate_ptr'. This is an array of + * the multi-character folds of characters in the node */ + SV *sv; + + PERL_ARGS_ASSERT_ADD_ALTERNATE; + + if (! *alternate_ptr) { + *alternate_ptr = newAV(); + } + sv = newSVpvn_utf8((char*)string, len, TRUE); + av_push(*alternate_ptr, sv); + return; +} + /* parse a class specification and produce either an ANYOF node that matches the pattern or perhaps will be optimized into an EXACTish node @@ -9292,11 +10363,47 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) IV namedclass; char *rangebegin = NULL; bool need_class = 0; + bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ SV *listsv = NULL; STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ + SV* properties = NULL; /* Code points that match \p{} \P{} */ + UV element_count = 0; /* Number of distinct elements in the class. + Optimizations may be possible if this is tiny */ UV n; - HV* nonbitmap = NULL; + + /* Unicode properties are stored in a swash; this holds the current one + * being parsed. If this swash is the only above-latin1 component of the + * character class, an optimization is to pass it directly on to the + * execution engine. Otherwise, it is set to NULL to indicate that there + * are other things in the class that have to be dealt with at execution + * time */ + SV* swash = NULL; /* Code points that match \p{} \P{} */ + + /* Set if a component of this character class is user-defined; just passed + * on to the engine */ + UV has_user_defined_property = 0; + + /* code points this node matches that can't be stored in the bitmap */ + SV* nonbitmap = NULL; + + /* The items that are to match that aren't stored in the bitmap, but are a + * result of things that are stored there. This is the fold closure of + * such a character, either because it has DEPENDS semantics and shouldn't + * be matched unless the target string is utf8, or is a code point that is + * too large for the bit map, as for example, the fold of the MICRO SIGN is + * above 255. This all is solely for performance reasons. By having this + * code know the outside-the-bitmap folds that the bitmapped characters are + * involved with, we don't have to go out to disk to find the list of + * matches, unless the character class includes code points that aren't + * storable in the bit map. That means that a character class with an 's' + * in it, for example, doesn't need to go out to disk to find everything + * that matches. A 2nd list is used so that the 'nonbitmap' list is kept + * empty unless there is something whose fold we don't know about, and will + * have to go out to the disk to find. */ + SV* l1_fold_invlist = NULL; + + /* List of multi-character folds that are matched by this node */ AV* unicode_alternate = NULL; #ifdef EBCDIC UV literal_endpoint = 0; @@ -9328,24 +10435,26 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) RExC_parse++; if (!SIZE_ONLY) ANYOF_FLAGS(ret) |= ANYOF_INVERT; + + /* We have decided to not allow multi-char folds in inverted character + * classes, due to the confusion that can happen, especially with + * classes that are designed for a non-Unicode world: You have the + * peculiar case that: + "s s" =~ /^[^\xDF]+$/i => Y + "ss" =~ /^[^\xDF]+$/i => N + * + * See [perl #89750] */ + allow_full_fold = FALSE; } if (SIZE_ONLY) { RExC_size += ANYOF_SKIP; -#ifdef ANYOF_ADD_LOC_SKIP - if (LOC) { - RExC_size += ANYOF_ADD_LOC_SKIP; - } -#endif listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ } else { RExC_emit += ANYOF_SKIP; if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; -#ifdef ANYOF_ADD_LOC_SKIP - RExC_emit += ANYOF_ADD_LOC_SKIP; -#endif } ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); @@ -9368,8 +10477,10 @@ parseit: namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ - if (!range) + if (!range) { rangebegin = RExC_parse; + element_count++; + } if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, @@ -9415,7 +10526,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v, NULL)) { + if (reg_namedseq(pRExC_state, &v, NULL, depth)) { goto parseit; } value= v; @@ -9445,6 +10556,9 @@ parseit: n = 1; } if (!SIZE_ONLY) { + SV** invlistsvp; + SV* invlist; + char* name; if (UCHARAT(RExC_parse) == '^') { RExC_parse++; n--; @@ -9454,24 +10568,88 @@ parseit: n--; } } + /* Try to get the definition of the property into + * . If /i is in effect, the effective property + * will have its name be <__NAME_i>. The design is + * discussed in commit + * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ + Newx(name, n + sizeof("_i__\n"), char); + + sprintf(name, "%s%.*s%s\n", + (FOLD) ? "__" : "", + (int)n, + RExC_parse, + (FOLD) ? "_i" : "" + ); + + /* Look up the property name, and get its swash and + * inversion list, if the property is found */ + if (swash) { + SvREFCNT_dec(swash); + } + swash = _core_swash_init("utf8", name, &PL_sv_undef, + 1, /* binary */ + 0, /* not tr/// */ + TRUE, /* this routine will handle + undefined properties */ + NULL, FALSE /* No inversion list */ + ); + if ( ! swash + || ! SvROK(swash) + || ! SvTYPE(SvRV(swash)) == SVt_PVHV + || ! (invlistsvp = + hv_fetchs(MUTABLE_HV(SvRV(swash)), + "INVLIST", FALSE)) + || ! (invlist = *invlistsvp)) + { + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + /* Here didn't find it. It could be a user-defined + * property that will be available at run-time. Add it + * to the list to look up then */ + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", + (value == 'p' ? '+' : '!'), + name); + has_user_defined_property = 1; + + /* We don't know yet, so have to assume that the + * property could match something in the Latin1 range, + * hence something that isn't utf8 */ + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; + } + else { + + /* Here, did get the swash and its inversion list. If + * the swash is from a user-defined property, then this + * whole character class should be regarded as such */ + SV** user_defined_svp = + hv_fetchs(MUTABLE_HV(SvRV(swash)), + "USER_DEFINED", FALSE); + if (user_defined_svp) { + has_user_defined_property + |= SvUV(*user_defined_svp); + } - /* Add the property name to the list. If /i matching, give - * a different name which consists of the normal name - * sandwiched between two underscores and '_i'. The design - * is discussed in the commit message for this. */ - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n", - (value=='p' ? '+' : '!'), - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - ); + /* Invert if asking for the complement */ + if (value == 'P') { + _invlist_union_complement_2nd(properties, invlist, &properties); + + /* The swash can't be used as-is, because we've + * inverted things; delay removing it to here after + * have copied its invlist above */ + SvREFCNT_dec(swash); + swash = NULL; + } + else { + _invlist_union(properties, invlist, &properties); + } + } + Safefree(name); } RExC_parse = e + 1; - - /* The \p could match something in the Latin1 range, hence - * something that isn't utf8 */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; namedclass = ANYOF_MAX; /* no official name, but it's named */ /* \p means they want Unicode semantics */ @@ -9540,7 +10718,7 @@ parseit: break; } recode_encoding: - { + if (! RExC_override_recoding) { SV* enc = PL_encoding; value = reg_recode((const char)(U8)value, &enc); if (!enc && SIZE_ONLY) @@ -9574,21 +10752,18 @@ parseit: if (LOC && namedclass < ANYOF_MAX && ! need_class) { need_class = 1; if (SIZE_ONLY) { -#ifdef ANYOF_CLASS_ADD_SKIP - RExC_size += ANYOF_CLASS_ADD_SKIP; -#endif + RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; } else { -#ifdef ANYOF_CLASS_ADD_SKIP - RExC_emit += ANYOF_CLASS_ADD_SKIP; -#endif + RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; ANYOF_CLASS_ZERO(ret); } ANYOF_FLAGS(ret) |= ANYOF_CLASS; } /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a - * literal */ + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ if (range) { if (!SIZE_ONLY) { const int w = @@ -9598,109 +10773,224 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); + stored += + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); if (prevvalue < 256) { stored += - set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &nonbitmap); - stored += - set_regclass_bit(pRExC_state, ret, '-', &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate); } else { - Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); + nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue); } } range = 0; /* this was not a true range */ } - - if (!SIZE_ONLY) { - const char *what = NULL; - char yesno = 0; /* Possible truncation here but in some 64-bit environments * the compiler gets heartburn about switch on 64-bit values. * A similar issue a little earlier when switching on value. * --jhi */ switch ((I32)namedclass) { - - case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum"); - case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha"); - case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank"); - case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl"); - case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph"); - case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower"); - case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint"); - case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace"); - case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct"); - case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper"); -#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS - /* \s, \w match all unicode if utf8. */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word"); -#else - /* \s, \w match ascii and locale only */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord"); -#endif - case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit"); - case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); - case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); + + case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_NALNUMC: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + break; + case ANYOF_ALPHA: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; + case ANYOF_NALPHA: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + break; case ANYOF_ASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ASCII); - else { - for (value = 0; value < 128; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - yesno = '+'; - what = NULL; /* Doesn't match outside ascii, so - don't want to add +utf8:: */ + else { + _invlist_union(properties, PL_ASCII, &properties); + } break; case ANYOF_NASCII: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NASCII); - else { - for (value = 128; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); + if (LOC) { + ANYOF_CLASS_SET(ret, namedclass); } - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - yesno = '!'; - what = "ASCII"; - break; + else { + _invlist_union_complement_2nd(properties, + PL_ASCII, &properties); + if (DEPENDS_SEMANTICS) { + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; + } + } + break; + case ANYOF_BLANK: + DO_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_NBLANK: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixBlank, PL_XPosixBlank); + break; + case ANYOF_CNTRL: + DO_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; + case ANYOF_NCNTRL: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixCntrl, PL_XPosixCntrl); + break; case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_NDIGIT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + break; + case ANYOF_GRAPH: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_NGRAPH: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + break; + case ANYOF_HORIZWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding. It turns out that \h + * is just a synonym for XPosixBlank */ + _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_NHORIZWS: + _invlist_union_complement_2nd(nonbitmap, + PL_XPosixBlank, &nonbitmap); + break; + case ANYOF_LOWER: + case ANYOF_NLOWER: + { /* These require special handling, as they differ under + folding, matching Cased there (which in the ASCII range + is the same as Alpha */ + + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } + else { + ascii_source = PL_PosixLower; + l1_source = PL_L1PosixLower; + Xname = "XPosixLower"; + } + if (namedclass == ANYOF_LOWER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); + } else { - /* consecutive digits assumed */ - for (value = '0'; value <= '9'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - yesno = '+'; - what = POSIX_CC_UNI_NAME("Digit"); break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + } + case ANYOF_PRINT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_NPRINT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + break; + case ANYOF_PUNCT: + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_NPUNCT: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + break; + case ANYOF_PSXSPC: + DO_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_NPSXSPC: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixSpace, PL_XPosixSpace); + break; + case ANYOF_SPACE: + DO_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_NSPACE: + DO_N_POSIX(ret, namedclass, properties, + PL_PerlSpace, PL_XPerlSpace); + break; + case ANYOF_UPPER: /* Same as LOWER, above */ + case ANYOF_NUPPER: + { + SV* ascii_source; + SV* l1_source; + const char *Xname; + + if (FOLD && ! LOC) { + ascii_source = PL_PosixAlpha; + l1_source = PL_L1Cased; + Xname = "Cased"; + } else { - /* consecutive digits assumed */ - for (value = 0; value < '0'; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); - for (value = '9' + 1; value < 256; value++) - stored += - set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); + ascii_source = PL_PosixUpper; + l1_source = PL_L1PosixUpper; + Xname = "XPosixUpper"; + } + if (namedclass == ANYOF_UPPER) { + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + ascii_source, l1_source, Xname, listsv); } - yesno = '!'; - what = POSIX_CC_UNI_NAME("Digit"); - if (AT_LEAST_ASCII_RESTRICTED ) { - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + else { + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, + properties, ascii_source, l1_source, Xname, listsv); } - break; + break; + } + case ANYOF_ALNUM: /* Really is 'Word' */ + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_NALNUM: + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + break; + case ANYOF_VERTWS: + /* For these, we use the nonbitmap, as /d doesn't make a + * difference in what these match. There would be problems + * if these characters had folds other than themselves, as + * nonbitmap is subject to folding */ + _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + break; + case ANYOF_NVERTWS: + _invlist_union_complement_2nd(nonbitmap, + PL_VertSpace, &nonbitmap); + break; + case ANYOF_XDIGIT: + DO_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; + case ANYOF_NXDIGIT: + DO_N_POSIX(ret, namedclass, properties, + PL_PosixXDigit, PL_XPosixXDigit); + break; case ANYOF_MAX: /* this is to handle \p and \P */ break; @@ -9708,10 +10998,6 @@ parseit: vFAIL("Invalid [::] class"); break; } - if (what && ! (AT_LEAST_ASCII_RESTRICTED)) { - /* Strings such as "+utf8::isWord\n" */ - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); - } continue; } @@ -9726,8 +11012,10 @@ parseit: } else { prevvalue = value; /* save the beginning of the range */ - if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && - RExC_parse[1] != ']') { + if (RExC_parse+1 < RExC_end + && *RExC_parse == '-' + && RExC_parse[1] != ']') + { RExC_parse++; /* a bad range like \w-, [:word:]- ? */ @@ -9742,7 +11030,7 @@ parseit: } if (!SIZE_ONLY) stored += - set_regclass_bit(pRExC_state, ret, '-', &nonbitmap); + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ @@ -9771,20 +11059,20 @@ parseit: for (i = prevvalue; i <= ceilvalue; i++) if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } else { for (i = prevvalue; i <= ceilvalue; i++) if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } } else #endif for (i = prevvalue; i <= ceilvalue; i++) { - stored += set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap); + stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } if (value > 255) { @@ -9792,104 +11080,6 @@ parseit: const UV natvalue = NATIVE_TO_UNI(value); nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); } -#if 0 - - /* If the code point requires utf8 to represent, and we are not - * folding, it can't match unless the target is in utf8. Only - * a few code points above 255 fold to below it, so XXX an - * optimization would be to know which ones and set the flag - * appropriately. */ - ANYOF_FLAGS(ret) |= (FOLD || value < 256) - ? ANYOF_NONBITMAP - : ANYOF_UTF8; - if (prevnatvalue < natvalue) { /* '>' case is fatal error above */ - - /* The \t sets the whole range */ - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - prevnatvalue, natvalue); - - /* Currently, we don't look at every value in the range. - * Therefore we have to assume the worst case: that if - * folding, it will match more than one character. But in - * lookbehind patterns, can only be single character - * length, so disallow those folds */ - if (FOLD && ! RExC_in_lookbehind) { - OP(ret) = ANYOFV; - } - } - else if (prevnatvalue == natvalue) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); - if (FOLD) { - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); - -#ifdef EBCDIC /* RD t/uni/fold ff and 6b */ - if (RExC_precomp[0] == ':' && - RExC_precomp[1] == '[' && - (f == 0xDF || f == 0x92)) { - f = NATIVE_TO_UNI(f); - } -#endif - /* If folding and foldable and a single - * character, insert also the folded version - * to the charclass. */ - if (f != value) { -#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */ - if ((RExC_precomp[0] == ':' && - RExC_precomp[1] == '[' && - (f == 0xA2 && - (value == 0xFB05 || value == 0xFB06))) ? - foldlen == ((STRLEN)UNISKIP(f) - 1) : - foldlen == (STRLEN)UNISKIP(f) ) -#else - if (foldlen == (STRLEN)UNISKIP(f)) -#endif - Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n", f); - else if (! RExC_in_lookbehind) { - /* Any multicharacter foldings - * (disallowed in lookbehind patterns) - * require the following transform: - * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) - * where E folds into "pq" and F folds - * into "rst", all other characters - * fold to single characters. We save - * away these multicharacter foldings, - * to be later saved as part of the - * additional "s" data. */ - SV *sv; - - if (!unicode_alternate) - unicode_alternate = newAV(); - sv = newSVpvn_utf8((char*)foldbuf, foldlen, - TRUE); - av_push(unicode_alternate, sv); - OP(ret) = ANYOFV; - } - } - - /* If folding and the value is one of the Greek - * sigmas insert a few more sigmas to make the - * folding rules of the sigmas to work right. - * Note that not all the possible combinations - * are handled here: some of them are handled - * by the standard folding rules, and some of - * them (literal or EXACTF cases) are handled - * during runtime in regexec.c:S_find_byclass(). */ - if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); - } - else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); - } - } - } -#endif #ifdef EBCDIC literal_endpoint = 0; #endif @@ -9904,213 +11094,395 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - /* Finish up the non-bitmap entries */ - if (nonbitmap) { - UV i; - - /* If folding, we add to the list all characters that could fold to or - * from the ones already on the list */ - if (FOLD) { - HV* fold_intersection; - UV* fold_list; - - /* This is a list of all the characters that participate in folds - * (except marks, etc in multi-char folds */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); - } + /* If folding and there are code points above 255, we calculate all + * characters that could fold to or from the ones already on the list */ + if (FOLD && nonbitmap) { + UV start, end; /* End points of code point ranges */ - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = _new_invlist(0); - } else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - to_utf8_fold((U8*) "A", dummy, &dummy_len); - } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + SV* fold_intersection = NULL; + + /* This is a list of all the characters that participate in folds + * (except marks, etc in multi-char folds */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); + PL_utf8_foldable = _swash_to_invlist(swash); + SvREFCNT_dec(swash); + } + + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If we were unable to find any folds, then we likely won't be + * able to find the closures. So just create an empty list. + * Folding will effectively be restricted to the non-Unicode rules + * hard-coded into Perl. (This case happens legitimately during + * compilation of Perl itself before the Unicode tables are + * generated) */ + if (invlist_len(PL_utf8_foldable) == 0) { + PL_utf8_foldclosures = newHV(); + } else { + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + + /* This particular string is above \xff in both UTF-8 and + * UTFEBCDIC */ + to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + assert(PL_utf8_tofold); /* Verify that worked */ } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } + } + + /* Only the characters in this class that participate in folds need be + * checked. Get the intersection of this class and all the possible + * characters that are foldable. This can quickly narrow down a large + * class */ + _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection); + + /* Now look at the foldable characters in this class individually */ + invlist_iterinit(fold_intersection); + while (invlist_iternext(fold_intersection, &start, &end)) { + UV j; - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap); - - /* Now look at the foldable characters in this class individually */ - fold_list = invlist_array(fold_intersection); - for (i = 0; i < invlist_len(fold_intersection); i++) { - UV j; - - /* The next entry is the beginning of the range that is in the - * class */ - UV start = fold_list[i++]; - - - /* The next entry is the beginning of the next range, which - * isn't in the class, so the end of the current range is one - * less than that */ - UV end = fold_list[i] - 1; - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - /* Get its fold */ - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const UV f = to_uni_fold(j, foldbuf, &foldlen); - - if (foldlen > (STRLEN)UNISKIP(f)) { - - /* Any multicharacter foldings (disallowed in - * lookbehind patterns) require the following - * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where - * E folds into "pq" and F folds into "rst", all other - * characters fold to single characters. We save away - * these multicharacter foldings, to be later saved as - * part of the additional "s" data. */ - if (! RExC_in_lookbehind) { - SV *sv; - U8* loc = foldbuf; - U8* e = foldbuf + foldlen; - - /* If any of the folded characters of this are in - * the Latin1 range, tell the regex engine that - * this can match a non-utf8 target string. The - * only multi-byte fold whose source is in the - * Latin1 range (U+00DF) applies only when the - * target string is utf8, or under unicode rules */ - if (j > 255 || AT_LEAST_UNI_SEMANTICS) { - while (loc < e) { - - /* Can't mix ascii with non- under /aa */ - if (MORE_ASCII_RESTRICTED - && (isASCII(*loc) != isASCII(j))) - { + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + + /* Get its fold */ + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + const UV f = + _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold); + + if (foldlen > (STRLEN)UNISKIP(f)) { + + /* Any multicharacter foldings (disallowed in lookbehind + * patterns) require the following transform: [ABCDEF] -> + * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F + * folds into "rst", all other characters fold to single + * characters. We save away these multicharacter foldings, + * to be later saved as part of the additional "s" data. */ + if (! RExC_in_lookbehind) { + U8* loc = foldbuf; + U8* e = foldbuf + foldlen; + + /* If any of the folded characters of this are in the + * Latin1 range, tell the regex engine that this can + * match a non-utf8 target string. The only multi-byte + * fold whose source is in the Latin1 range (U+00DF) + * applies only when the target string is utf8, or + * under unicode rules */ + if (j > 255 || AT_LEAST_UNI_SEMANTICS) { + while (loc < e) { + + /* Can't mix ascii with non- under /aa */ + if (MORE_ASCII_RESTRICTED + && (isASCII(*loc) != isASCII(j))) + { + goto end_multi_fold; + } + if (UTF8_IS_INVARIANT(*loc) + || UTF8_IS_DOWNGRADEABLE_START(*loc)) + { + /* Can't mix above and below 256 under LOC + */ + if (LOC) { goto end_multi_fold; } - if (UTF8_IS_INVARIANT(*loc) - || UTF8_IS_DOWNGRADEABLE_START(*loc)) - { - /* Can't mix above and below 256 under - * LOC */ - if (LOC) { - goto end_multi_fold; - } - ANYOF_FLAGS(ret) - |= ANYOF_NONBITMAP_NON_UTF8; - break; - } - loc += UTF8SKIP(loc); + ANYOF_FLAGS(ret) + |= ANYOF_NONBITMAP_NON_UTF8; + break; } + loc += UTF8SKIP(loc); } - - if (!unicode_alternate) { - unicode_alternate = newAV(); - } - sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE); - av_push(unicode_alternate, sv); - - /* This node is variable length */ - OP(ret) = ANYOFV; - end_multi_fold: ; } + + add_alternate(&unicode_alternate, foldbuf, foldlen); + end_multi_fold: ; } - else { - /* Single character fold. Add everything in its fold - * closure to the list that this node should match */ - SV** listp; - - /* The fold closures data structure is a hash with the - * keys being every character that is folded to, like - * 'k', and the values each an array of everything that - * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); - /* /aa doesn't allow folds between ASCII and - * non-; /l doesn't allow them between above - * and below 256 */ - if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) - || (LOC && ((c < 256) != (j < 256)))) - { - continue; - } + /* This is special-cased, as it is the only letter which + * has both a multi-fold and single-fold in Latin1. All + * the other chars that have single and multi-folds are + * always in utf8, and the utf8 folding algorithm catches + * them */ + if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) { + stored += set_regclass_bit(pRExC_state, + ret, + LATIN_SMALL_LETTER_SHARP_S, + &l1_fold_invlist, &unicode_alternate); + } + } + else { + /* Single character fold. Add everything in its fold + * closure to the list that this node should match */ + SV** listp; + + /* The fold closures data structure is a hash with the keys + * being every character that is folded to, like 'k', and + * the values each an array of everything that folds to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non-; + * /l doesn't allow them between above and below + * 256 */ + if ((MORE_ASCII_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) + { + continue; + } - if (c < 256 && AT_LEAST_UNI_SEMANTICS) { - stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap); - } - /* It may be that the code point is already - * in this range or already in the bitmap, - * in which case we need do nothing */ - else if ((c < start || c > end) - && (c > 255 - || ! ANYOF_BITMAP_TEST(ret, c))) - { - nonbitmap = add_range_to_invlist(nonbitmap, c, c); - } + if (c < 256 && AT_LEAST_UNI_SEMANTICS) { + stored += set_regclass_bit(pRExC_state, + ret, + (U8) c, + &l1_fold_invlist, &unicode_alternate); + } + /* It may be that the code point is already in + * this range or already in the bitmap, in + * which case we need do nothing */ + else if ((c < start || c > end) + && (c > 255 + || ! ANYOF_BITMAP_TEST(ret, c))) + { + nonbitmap = add_cp_to_invlist(nonbitmap, c); } } } } } - invlist_destroy(fold_intersection); - } /* End of processing all the folds */ + } + SvREFCNT_dec(fold_intersection); + } + + /* Combine the two lists into one. */ + if (l1_fold_invlist) { + if (nonbitmap) { + _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap); + SvREFCNT_dec(l1_fold_invlist); + } + else { + nonbitmap = l1_fold_invlist; + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now because we don't want to fold the + * properties */ + if (properties) { + if (nonbitmap) { + _invlist_union(nonbitmap, properties, &nonbitmap); + SvREFCNT_dec(properties); + } + else { + nonbitmap = properties; + } + } + + /* Here, contains all the code points we can determine at + * compile time that we haven't put into the bitmap. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * */ + if (nonbitmap) { + + /* Above-ASCII code points in /d have to stay in , as they + * possibly only should match when the target string is UTF-8 */ + UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255; + + /* This gets set if we actually need to modify things */ + bool change_invlist = FALSE; + + UV start, end; + + /* Start looking through */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + UV high; + int i; + + /* Quit if are above what we should change */ + if (start > max_cp_to_set) { + break; + } + + change_invlist = TRUE; + + /* Set all the bits in the range, up to the max that we are doing */ + high = (end < max_cp_to_set) ? end : max_cp_to_set; + for (i = start; i <= (int) high; i++) { + if (! ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_SET(ret, i); + stored++; + prevvalue = value; + value = i; + } + } + } + + /* Done with loop; remove any code points that are in the bitmap from + * */ + if (change_invlist) { + _invlist_subtract(nonbitmap, + (DEPENDS_SEMANTICS) + ? PL_ASCII + : PL_Latin1, + &nonbitmap); + } + + /* If have completely emptied it, remove it completely */ + if (invlist_len(nonbitmap) == 0) { + SvREFCNT_dec(nonbitmap); + nonbitmap = NULL; + } } /* Here, we have calculated what code points should be in the character - * class. Now we can see about various optimizations. Fold calculation - * needs to take place before inversion. Otherwise /[^k]/i would invert to - * include K, which under /i would match k. */ + * class. does not overlap the bitmap except possibly in the + * case of DEPENDS rules. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. */ /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this this does optimize those. It doesn't + * set the FOLD flag yet, so this does optimize those. It doesn't * optimize locale. Doing so perhaps could be done as long as there is * nothing like \w in it; some thought also would have to be given to the * interaction with above 0x100 chars */ - if (! LOC - && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT + if ((ANYOF_FLAGS(ret) & ANYOF_INVERT) + && ! LOC && ! unicode_alternate - && ! nonbitmap + /* In case of /d, there are some things that should match only when in + * not in the bitmap, i.e., they require UTF8 to match. These are + * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this + * case, they don't require UTF8, so can invert here */ + && (! nonbitmap + || ! DEPENDS_SEMANTICS + || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) && SvCUR(listsv) == initial_listsv_len) { - for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) - ANYOF_BITMAP(ret)[value] ^= 0xFF; + int i; + if (! nonbitmap) { + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + } + else { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + /* The inversion means that everything above 255 is matched */ + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + else { + /* Here, also has things outside the bitmap that may overlap with + * the bitmap. We have to sync them up, so that they get inverted + * in both places. Earlier, we removed all overlaps except in the + * case of /d rules, so no syncing is needed except for this case + */ + SV *remove_list = NULL; + + if (DEPENDS_SEMANTICS) { + UV start, end; + + /* Set the bits that correspond to the ones that aren't in the + * bitmap. Otherwise, when we invert, we'll miss these. + * Earlier, we removed from the nonbitmap all code points + * < 128, so there is no extra work here */ + invlist_iterinit(nonbitmap); + while (invlist_iternext(nonbitmap, &start, &end)) { + if (start > 255) { /* The bit map goes to 255 */ + break; + } + if (end > 255) { + end = 255; + } + for (i = start; i <= (int) end; ++i) { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + } + + /* Now invert both the bitmap and the nonbitmap. Anything in the + * bitmap has to also be removed from the non-bitmap, but again, + * there should not be overlap unless is /d rules. */ + _invlist_invert(nonbitmap); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + for (i = 0; i < 256; ++i) { + if (ANYOF_BITMAP_TEST(ret, i)) { + ANYOF_BITMAP_CLEAR(ret, i); + if (DEPENDS_SEMANTICS) { + if (! remove_list) { + remove_list = _new_invlist(2); + } + remove_list = add_cp_to_invlist(remove_list, i); + } + } + else { + ANYOF_BITMAP_SET(ret, i); + prevvalue = value; + value = i; + } + } + + /* And do the removal */ + if (DEPENDS_SEMANTICS) { + if (remove_list) { + _invlist_subtract(nonbitmap, remove_list, &nonbitmap); + SvREFCNT_dec(remove_list); + } + } + else { + /* There is no overlap for non-/d, so just delete anything + * below 256 */ + _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); + } + } + stored = 256 - stored; - /* The inversion means that everything above 255 is matched; and at the - * same time we clear the invert flag */ - ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; + /* Clear the invert flag since have just done it here */ + ANYOF_FLAGS(ret) &= ~ANYOF_INVERT; } /* Folding in the bitmap is taken care of above, but not for locale (for * which we have to wait to see what folding is in effect at runtime), and - * for things not in the bitmap. Set run-time fold flag for these */ - if (FOLD && (LOC || nonbitmap)) { + * for some things not in the bitmap (only the upper latin folds in this + * case, as all other single-char folding has been set above). Set + * run-time fold flag for these */ + if (FOLD && (LOC + || (DEPENDS_SEMANTICS + && nonbitmap + && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) + || unicode_alternate)) + { ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; } @@ -10129,6 +11501,7 @@ parseit: * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE * FI'. */ if (! nonbitmap + && ! unicode_alternate && SvCUR(listsv) == initial_listsv_len && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) @@ -10167,17 +11540,28 @@ parseit: else { op = EXACT; } - } /* else 2 chars in the bit map: the folds of each other */ - else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { + } + else { /* else 2 chars in the bit map: the folds of each other */ + + /* Use the folded value, which for the cases where we get here, + * is just the lower case of the current one (which may resolve to + * itself, or to the other one */ + value = toLOWER_LATIN1(value); /* To join adjacent nodes, they must be the exact EXACTish type. - * Try to use the most likely type, by using EXACTFU if the regex - * calls for them, or is required because the character is - * non-ASCII */ - op = EXACTFU; - } - else { /* Otherwise, more likely to be EXACTF type */ - op = EXACTF; + * Try to use the most likely type, by using EXACTFA if possible, + * then EXACTFU if the regex calls for it, or is required because + * the character is non-ASCII. (If is ASCII, its fold is + * also ASCII for the cases where we get here.) */ + if (MORE_ASCII_RESTRICTED && isASCII(value)) { + op = EXACTFA; + } + else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { + op = EXACTFU; + } + else { /* Otherwise, more likely to be EXACTF type */ + op = EXACTF; + } } ret = reg_node(pRExC_state, op); @@ -10197,65 +11581,62 @@ parseit: return ret; } - if (nonbitmap) { - UV* nonbitmap_array = invlist_array(nonbitmap); - UV nonbitmap_len = invlist_len(nonbitmap); - UV i; - - /* Here have the full list of items to match that aren't in the - * bitmap. Convert to the structure that the rest of the code is - * expecting. XXX That rest of the code should convert to this - * structure */ - for (i = 0; i < nonbitmap_len; i++) { - - /* The next entry is the beginning of the range that is in the - * class */ - UV start = nonbitmap_array[i++]; - UV end; - - /* The next entry is the beginning of the next range, which isn't - * in the class, so the end of the current range is one less than - * that. But if there is no next range, it means that the range - * begun by 'start' extends to infinity, which for this platform - * ends at UV_MAX */ - if (i == nonbitmap_len) { - end = UV_MAX; - } - else { - end = nonbitmap_array[i] - 1; - } - - if (start == end) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start); - } - else { - /* The \t sets the whole range */ - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - /* XXX EBCDIC */ - start, end); - } - } - invlist_destroy(nonbitmap); + /* If there is a swash and more than one element, we can't use the swash in + * the optimization below. */ + if (swash && element_count > 1) { + SvREFCNT_dec(swash); + swash = NULL; } - - if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) { + if (! nonbitmap + && SvCUR(listsv) == initial_listsv_len + && ! unicode_alternate) + { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); SvREFCNT_dec(listsv); SvREFCNT_dec(unicode_alternate); } else { - + /* av[0] stores the character class description in its textual form: + * used later (regexec.c:Perl_regclass_swash()) to initialize the + * appropriate swash, and is also useful for dumping the regnode. + * av[1] if NULL, is a placeholder to later contain the swash computed + * from av[0]. But if no further computation need be done, the + * swash is stored there now. + * av[2] stores the multicharacter foldings, used later in + * regexec.c:S_reginclass(). + * av[3] stores the nonbitmap inversion list for use in addition or + * instead of av[0]; not used if av[1] isn't NULL + * av[4] is set if any component of the class is from a user-defined + * property; not used if av[1] isn't NULL */ AV * const av = newAV(); SV *rv; - /* The 0th element stores the character class description - * in its textual form: used later (regexec.c:Perl_regclass_swash()) - * to initialize the appropriate swash (which gets stored in - * the 1st element), and also useful for dumping the regnode. - * The 2nd element stores the multicharacter foldings, - * used later (regexec.c:S_reginclass()). */ - av_store(av, 0, listsv); - av_store(av, 1, NULL); - av_store(av, 2, MUTABLE_SV(unicode_alternate)); + + av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) + ? &PL_sv_undef + : listsv); + if (swash) { + av_store(av, 1, swash); + SvREFCNT_dec(nonbitmap); + } + else { + av_store(av, 1, NULL); + if (nonbitmap) { + av_store(av, 3, nonbitmap); + av_store(av, 4, newSVuv(has_user_defined_property)); + } + } + + /* Store any computed multi-char folds only if we are allowing + * them */ + if (allow_full_fold) { + av_store(av, 2, MUTABLE_SV(unicode_alternate)); + if (unicode_alternate) { /* This node is variable length */ + OP(ret) = ANYOFV; + } + } + else { + av_store(av, 2, NULL); + } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; @@ -10263,7 +11644,6 @@ parseit: } return ret; } -#undef _C_C_T_ /* reg_skipcomment() @@ -10320,8 +11700,11 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) PERL_ARGS_ASSERT_NEXTCHAR; for (;;) { - if (*RExC_parse == '(' && RExC_parse[1] == '?' && - RExC_parse[2] == '#') { + if (RExC_end - RExC_parse >= 3 + && *RExC_parse == '(' + && RExC_parse[1] == '?' + && RExC_parse[2] == '#') + { while (*RExC_parse != ')') { if (RExC_parse == RExC_end) FAIL("Sequence (?#... not terminated"); @@ -10363,7 +11746,8 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) return(ret); } if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -10405,7 +11789,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) We can't do this: assert(2==regarglen[op]+1); - + Anything larger than this has to allocate the extra amount. If we changed this to be: @@ -10418,7 +11802,8 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) return(ret); } if (RExC_emit >= RExC_emit_bound) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p", + op, RExC_emit, RExC_emit_bound); NODE_ALIGN_FILL(ret); ptr = ret; @@ -10629,9 +12014,11 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, for (;;) { regnode * const temp = regnext(scan); #ifdef EXPERIMENTAL_INPLACESCAN - if (PL_regkind[OP(scan)] == EXACT) - if (join_exact(pRExC_state,scan,&min,1,val,depth+1)) + if (PL_regkind[OP(scan)] == EXACT) { + bool has_exactf_sharp_s; /* Unexamined in this routine */ + if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1)) return EXACT; + } #endif if ( exact ) { switch (OP(scan)) { @@ -10639,6 +12026,8 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, case EXACTF: case EXACTFA: case EXACTFU: + case EXACTFU_SS: + case EXACTFU_NO_TRIE: case EXACTFL: if( exact == PSEUDO ) exact= OP(scan); @@ -10963,8 +12352,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ - else if (k == FOLDCHAR) - Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -11011,7 +12398,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); if (flags & ANYOF_INVERT) sv_catpvs(sv, "^"); - + /* output what the standard cp 0-255 bitmap matches */ for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { @@ -11054,68 +12441,87 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) if (flags & ANYOF_NONBITMAP_NON_UTF8) sv_catpvs(sv, "{outside bitmap}"); - { - SV *lv; + if (ANYOF_NONBITMAP(o)) { + SV *lv; /* Set if there is something outside the bit map */ SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0); - - if (lv) { + bool byte_output = FALSE; /* If something in the bitmap has been + output */ + + if (lv && lv != &PL_sv_undef) { if (sw) { U8 s[UTF8_MAXBYTES_CASE+1]; - for (i = 0; i <= 256; i++) { /* just the first 256 */ + for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */ uvchr_to_utf8(s, i); - - if (i < 256 && swash_fetch(sw, s, TRUE)) { + + if (i < 256 + && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate + things already + output as part + of the bitmap */ + && swash_fetch(sw, s, TRUE)) + { if (rangestart == -1) rangestart = i; } else if (rangestart != -1) { + byte_output = TRUE; if (i <= rangestart + 3) for (; rangestart < i; rangestart++) { - const U8 * const e = uvchr_to_utf8(s,rangestart); - U8 *p; - for(p = s; p < e; p++) - put_byte(sv, *p); + put_byte(sv, rangestart); } else { - const U8 *e = uvchr_to_utf8(s,rangestart); - U8 *p; - for (p = s; p < e; p++) - put_byte(sv, *p); + put_byte(sv, rangestart); sv_catpvs(sv, "-"); - e = uvchr_to_utf8(s, i-1); - for (p = s; p < e; p++) - put_byte(sv, *p); - } - rangestart = -1; + put_byte(sv, i-1); } + rangestart = -1; } - - sv_catpvs(sv, "..."); /* et cetera */ + } } { char *s = savesvpv(lv); char * const origs = s; - + while (*s && *s != '\n') s++; - + if (*s == '\n') { const char * const t = ++s; - + + if (byte_output) { + sv_catpvs(sv, " "); + } + while (*s) { - if (*s == '\n') + if (*s == '\n') { + + /* Truncate very long output */ + if (s - origs > 256) { + Perl_sv_catpvf(aTHX_ sv, + "%.*s...", + (int) (s - origs - 1), + t); + goto out_dump; + } *s = ' '; + } + else if (*s == '\t') { + *s = '-'; + } s++; } if (s[-1] == ' ') s[-1] = 0; - + sv_catpv(sv, t); } - + + out_dump: + Safefree(origs); } + SvREFCNT_dec(lv); } } @@ -11495,7 +12901,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) 1: a buffer in a different thread 2: something we no longer hold a reference on so we need to copy it locally. */ - /* Note we need to sue SvCUR() on our mother_re, because it, in + /* Note we need to use SvCUR(), rather than + SvLEN(), on our mother_re, because it, in turn, may well be pointing to its own mother_re. */ SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re), SvCUR(ret->mother_re)+1)); @@ -11527,12 +12934,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) dVAR; struct regexp *const r = (struct regexp *)SvANY(rx); regexp_internal *reti; - int len, npar; + int len; RXi_GET_DECL(r,ri); PERL_ARGS_ASSERT_REGDUPE_INTERNAL; - npar = r->nparens+1; len = ProgLen(ri); Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); @@ -11644,7 +13050,7 @@ Perl_regnext(pTHX_ register regnode *p) } #endif -STATIC void +STATIC void S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) { va_list args; @@ -11821,7 +13227,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, goto after_print; } else CLEAR_OPTSTART; - + regprop(r, sv, node); PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); @@ -11869,7 +13275,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, sv_setpvs(sv, ""); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); - + PerlIO_printf(Perl_debug_log, "%*s%s ", (int)(2*(indent+3)), "", elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,