X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/216bfc0a080a7190b7235110e12114b87f6e7b56..868625d2e6637edf38415dd2a2648c512c379d47:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 62e9448..560696e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -142,6 +142,8 @@ typedef struct RExC_state_t { regnode **recurse; /* Recurse regops */ I32 recurse_count; /* Number of recurse regops */ I32 in_lookbehind; + I32 contains_locale; + I32 override_recoding; #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -190,6 +192,8 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) +#define RExC_contains_locale (pRExC_state->contains_locale) +#define RExC_override_recoding (pRExC_state->override_recoding) #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') @@ -380,6 +384,8 @@ static const scan_data_t zero_scan_data = #define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) +#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) +#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) @@ -507,6 +513,13 @@ static const scan_data_t zero_scan_data = (int)offset, RExC_precomp, RExC_precomp + offset); \ } STMT_END +#define ckWARN2regdep(loc,m, a1) STMT_START { \ + const IV offset = loc - RExC_precomp; \ + Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \ + m REPORT_LOCATION, \ + a1, (int)offset, RExC_precomp, RExC_precomp + offset); \ +} STMT_END + #define ckWARN2reg(loc, m, a1) STMT_START { \ const IV offset = loc - RExC_precomp; \ Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \ @@ -715,11 +728,32 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c { PERL_ARGS_ASSERT_CL_ANYTHING; - ANYOF_CLASS_ZERO(cl); ANYOF_BITMAP_SETALL(cl); - cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL; - if (LOC) + cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL + |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL + /* Even though no bitmap is in use here, we need to set + * the flag below so an AND with a node that does have one + * doesn't lose that one. The flag should get cleared if + * the other one doesn't; and the code in regexec.c is + * structured so this being set when not needed does no + * harm. It seemed a little cleaner to set it here than do + * a special case in cl_and() */ + |ANYOF_NONBITMAP_NON_UTF8; + + /* If any portion of the regex is to operate under locale rules, + * initialization includes it. The reason this isn't done for all regexes + * is that the optimizer was written under the assumption that locale was + * all-or-nothing. Given the complexity and lack of documentation in the + * optimizer, and that there are inadequate test cases for locale, so many + * parts of it may not work properly, it is safest to avoid locale unless + * necessary. */ + if (RExC_contains_locale) { + ANYOF_CLASS_SETALL(cl); /* /l uses class */ cl->flags |= ANYOF_LOCALE; + } + else { + ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */ + } } /* Can match anything (initialization) */ @@ -749,22 +783,15 @@ S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; cl_anything(pRExC_state, cl); + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); } -STATIC void -S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) -{ - PERL_ARGS_ASSERT_CL_INIT_ZERO; - - Zero(cl, 1, struct regnode_charclass_class); - cl->type = ANYOF; - cl_anything(pRExC_state, cl); - if (LOC) - cl->flags |= ANYOF_LOCALE; -} +/* These two functions currently do the exact same thing */ +#define cl_init_zero S_cl_init -/* 'And' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* 'AND' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if + * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void S_cl_and(struct regnode_charclass_class *cl, const struct regnode_charclass_class *and_with) @@ -773,6 +800,7 @@ S_cl_and(struct regnode_charclass_class *cl, assert(and_with->type == ANYOF); + /* I (khw) am not sure all these restrictions are necessary XXX */ if (!(ANYOF_CLASS_TEST_ANY_SET(and_with)) && !(ANYOF_CLASS_TEST_ANY_SET(cl)) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -787,42 +815,88 @@ S_cl_and(struct regnode_charclass_class *cl, for (i = 0; i < ANYOF_BITMAP_SIZE; i++) cl->bitmap[i] &= and_with->bitmap[i]; } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ - if (!(and_with->flags & ANYOF_EOS)) - cl->flags &= ~ANYOF_EOS; - if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)) - cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD; - if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; + if (and_with->flags & ANYOF_INVERT) { - if (cl->flags & ANYOF_UNICODE_ALL - && and_with->flags & ANYOF_NONBITMAP - && !(and_with->flags & ANYOF_INVERT)) - { - if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + /* Here, the and'ed node is inverted. Get the AND of the flags that + * aren't affected by the inversion. Those that are affected are + * handled individually below */ + U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS; + cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS); + cl->flags |= affected_flags; + + /* We currently don't know how to deal with things that aren't in the + * bitmap, but we know that the intersection is no greater than what + * is already in cl, so let there be false positives that get sorted + * out after the synthetic start class succeeds, and the node is + * matched for real. */ + + /* The inversion of these two flags indicate that the resulting + * intersection doesn't have them */ + if (and_with->flags & ANYOF_UNICODE_ALL) { cl->flags &= ~ANYOF_UNICODE_ALL; } - cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use - only the one(s) - actually set */ - ARG_SET(cl, ARG(and_with)); - } - if (!(and_with->flags & ANYOF_UNICODE_ALL) && - !(and_with->flags & ANYOF_INVERT)) - cl->flags &= ~ANYOF_UNICODE_ALL; - if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) && - !(and_with->flags & ANYOF_INVERT)) - cl->flags &= ~ANYOF_NONBITMAP; + if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) { + cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL; + } + } + else { /* and'd node is not inverted */ + if (! ANYOF_NONBITMAP(and_with)) { + + /* Here 'and_with' doesn't match anything outside the bitmap + * (except possibly ANYOF_UNICODE_ALL), which means the + * intersection can't either, except for ANYOF_UNICODE_ALL, in + * which case we don't know what the intersection is, but it's no + * greater than what cl already has, so can just leave it alone, + * with possible false positives */ + if (! (and_with->flags & ANYOF_UNICODE_ALL)) { + ARG_SET(cl, ANYOF_NONBITMAP_EMPTY); + cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8; + } + } + else if (! ANYOF_NONBITMAP(cl)) { + + /* Here, 'and_with' does match something outside the bitmap, and cl + * doesn't have a list of things to match outside the bitmap. If + * cl can match all code points above 255, the intersection will + * be those above-255 code points that 'and_with' matches. There + * may be false positives from code points in 'and_with' that are + * outside the bitmap but below 256, but those get sorted out + * after the synthetic start class succeeds). If cl can't match + * all Unicode code points, it means here that it can't match * + * anything outside the bitmap, so we leave the bitmap empty */ + if (cl->flags & ANYOF_UNICODE_ALL) { + ARG_SET(cl, ARG(and_with)); + } + } + else { + /* Here, both 'and_with' and cl match something outside the + * bitmap. Currently we do not do the intersection, so just match + * whatever cl had at the beginning. */ + } + + + /* Take the intersection of the two sets of flags */ + cl->flags &= and_with->flags; + } } -/* 'OR' a given class with another one. Can create false positives */ -/* We assume that cl is not inverted */ +/* 'OR' a given class with another one. Can create false positives. 'cl' + * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if + * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */ STATIC void S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with) { PERL_ARGS_ASSERT_CL_OR; if (or_with->flags & ANYOF_INVERT) { + + /* Here, the or'd node is to be inverted. This means we take the + * complement of everything not in the bitmap, but currently we don't + * know what that is, so give up and match anything */ + if (ANYOF_NONBITMAP(or_with)) { + cl_anything(pRExC_state, cl); + } /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) * <= (B1 | !B2) | (CL1 | !CL2) @@ -832,7 +906,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) = * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i')) */ - if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) + else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) { int i; @@ -843,7 +917,21 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con else { cl_anything(pRExC_state, cl); } - } else { + + /* And, we can just take the union of the flags that aren't affected + * by the inversion */ + cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS; + + /* For the remaining flags: + ANYOF_UNICODE_ALL and inverted means to not match anything above + 255, which means that the union with cl should just be + what cl has in it, so can ignore this flag + ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord + is 127-255 to match them, but then invert that, so the + union with cl should just be what cl has in it, so can + ignore this flag + */ + } else { /* 'or_with' is not inverted */ /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) @@ -862,25 +950,32 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con else { /* XXXX: logic is complicated, leave it along for a moment. */ cl_anything(pRExC_state, cl); } - } - if (or_with->flags & ANYOF_EOS) - cl->flags |= ANYOF_EOS; - if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL)) - cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL; - if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD) - cl->flags |= ANYOF_LOC_NONBITMAP_FOLD; + if (ANYOF_NONBITMAP(or_with)) { + + /* Use the added node's outside-the-bit-map match if there isn't a + * conflict. If there is a conflict (both nodes match something + * outside the bitmap, but what they match outside is not the same + * pointer, and hence not easily compared until XXX we extend + * inversion lists this far), give up and allow the start class to + * match everything outside the bitmap. If that stuff is all above + * 255, can just set UNICODE_ALL, otherwise caould be anything. */ + if (! ANYOF_NONBITMAP(cl)) { + ARG_SET(cl, ARG(or_with)); + } + else if (ARG(cl) != ARG(or_with)) { - /* If both nodes match something outside the bitmap, but what they match - * outside is not the same pointer, and hence not easily compared, give up - * and allow the start class to match everything outside the bitmap */ - if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP && - ARG(cl) != ARG(or_with)) { - cl->flags |= ANYOF_UNICODE_ALL; - } + if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) { + cl_anything(pRExC_state, cl); + } + else { + cl->flags |= ANYOF_UNICODE_ALL; + } + } - if (or_with->flags & ANYOF_UNICODE_ALL) { - cl->flags |= ANYOF_UNICODE_ALL; + /* Take the union */ + cl->flags |= or_with->flags; + } } } @@ -1397,6 +1492,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #endif switch (flags) { + case EXACTFA: case EXACTFU: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; case EXACTFL: folder = PL_fold_locale; break; @@ -2472,7 +2568,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags #define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS if (UTF - && ( OP(scan) == EXACTF || OP(scan) == EXACTFU) + && ( OP(scan) == EXACTF || OP(scan) == EXACTFU || OP(scan) == EXACTFA) && ( STR_LEN(scan) >= 6 ) ) { /* @@ -2936,10 +3032,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, If/when this is fixed the following define can be swapped in below to fully enable trie logic. + XXX It may work if not UTF and/or /a (AT_LEAST_UNI_SEMANTICS) but perhaps + not /aa + #define TRIE_TYPE_IS_SAFE 1 */ -#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT) +#define TRIE_TYPE_IS_SAFE ((UTF && UNI_SEMANTICS) || optype==EXACT) if ( last && TRIE_TYPE_IS_SAFE ) { make_trie( pRExC_state, @@ -3105,11 +3204,29 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc]))) ) + { compat = 0; + } ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) ANYOF_BITMAP_SET(data->start_class, uc); + else if (uc >= 0x100) { + int i; + + /* Some Unicode code points fold to the Latin1 range; as + * XXX temporary code, instead of figuring out if this is + * one, just assume it is and set all the start class bits + * that could be some such above 255 code point's fold + * which will generate fals positives. As the code + * elsewhere that does compute the fold settles down, it + * can be extracted out and re-used here */ + for (i = 0; i < 256; i++){ + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { + ANYOF_BITMAP_SET(data->start_class, i); + } + } + } data->start_class->flags &= ~ANYOF_EOS; if (uc < 0x100) data->start_class->flags &= ~ANYOF_UNICODE_ALL; @@ -3159,6 +3276,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD; if (OP(scan) == EXACTFL) { + /* XXX This set is probably no longer necessary, and + * probably wrong as LOCALE now is on in the initial + * state */ data->start_class->flags |= ANYOF_LOCALE; } else { @@ -3170,6 +3290,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]); } } + else if (uc >= 0x100) { + int i; + for (i = 0; i < 256; i++){ + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) { + ANYOF_BITMAP_SET(data->start_class, i); + } + } + } } else if (flags & SCF_DO_STCLASS_OR) { if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) { @@ -3663,7 +3791,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); - else if (OP(scan) == ALNUMU) { + + /* Even if under locale, set the bits for non-locale + * in case it isn't a true locale-node. This will + * create false positives if it truly is locale */ + if (OP(scan) == ALNUMU) { for (value = 0; value < 256; value++) { if (isWORDCHAR_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3700,19 +3832,21 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM); - else { - if (OP(scan) == NALNUMU) { - for (value = 0; value < 256; value++) { - if (! isWORDCHAR_L1(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } - } else { - for (value = 0; value < 256; value++) { - if (! isALNUM(value)) { - ANYOF_BITMAP_SET(data->start_class, value); - } - } + + /* Even if under locale, set the bits for non-locale in + * case it isn't a true locale-node. This will create + * false positives if it truly is locale */ + if (OP(scan) == NALNUMU) { + for (value = 0; value < 256; value++) { + if (! isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (! isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } } } } @@ -3740,7 +3874,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); } - else if (OP(scan) == SPACEU) { + if (OP(scan) == SPACEU) { for (value = 0; value < 256; value++) { if (isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3777,7 +3911,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - else if (OP(scan) == NSPACEU) { + if (OP(scan) == NSPACEU) { for (value = 0; value < 256; value++) { if (!isSPACE_L1(value)) { ANYOF_BITMAP_SET(data->start_class, value); @@ -3795,24 +3929,25 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; case DIGIT: if (flags & SCF_DO_STCLASS_AND) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (!(data->start_class->flags & ANYOF_LOCALE)) { + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT); + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT); - else { - for (value = 0; value < 256; value++) - if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + for (value = 0; value < 256; value++) + if (isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); } break; case NDIGIT: if (flags & SCF_DO_STCLASS_AND) { - ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); + if (!(data->start_class->flags & ANYOF_LOCALE)) + ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT); for (value = 0; value < 256; value++) if (isDIGIT(value)) ANYOF_BITMAP_CLEAR(data->start_class, value); @@ -3820,11 +3955,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT); - else { - for (value = 0; value < 256; value++) - if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + for (value = 0; value < 256; value++) + if (!isDIGIT(value)) + ANYOF_BITMAP_SET(data->start_class, value); } break; CASE_SYNST_FNC(VERTWS); @@ -4383,6 +4516,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) I32 sawplus = 0; I32 sawopen = 0; bool used_setjump = FALSE; + regex_charset initial_charset = get_regex_charset(orig_pm_flags); U8 jump_ret = 0; dJMPENV; @@ -4401,6 +4535,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); RExC_uni_semantics = 0; + RExC_contains_locale = 0; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -4457,11 +4592,15 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) restudied = 0; #endif - /* Set to use unicode semantics if the pattern is in utf8 and has the - * 'depends' charset specified, as it means unicode when utf8 */ pm_flags = orig_pm_flags; - if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) { + if (initial_charset == REGEX_LOCALE_CHARSET) { + RExC_contains_locale = 1; + } + else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) { + + /* Set to use unicode semantics if the pattern is in utf8 and has the + * 'depends' charset specified, as it means unicode when utf8 */ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); } @@ -4474,6 +4613,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_seen_evals = 0; RExC_extralen = 0; + RExC_override_recoding = 0; /* First pass: determine size, legality. */ RExC_parse = exp; @@ -4982,14 +5122,13 @@ reStudy: && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY)) ri->regstclass = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5057,12 +5196,11 @@ reStudy: r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8 = r->float_substr = r->float_utf8 = NULL; - /* If the synthetic start class were to ever be used when EOS is set, - * that bit would have to be cleared, as it is shared with another */ if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { const U32 n = add_data(pRExC_state, 1, "f"); + data.start_class->flags |= ANYOF_IS_SYNTHETIC; Newx(RExC_rxi->data->data[n], 1, struct regnode_charclass_class); @@ -5940,7 +6078,7 @@ Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV } #endif -PERL_STATIC_INLINE HV* +STATIC HV* S_invlist_union(pTHX_ HV* const a, HV* const b) { /* Return a new inversion list which is the union of two inversion lists. @@ -6091,7 +6229,7 @@ S_invlist_union(pTHX_ HV* const a, HV* const b) return u; } -PERL_STATIC_INLINE HV* +STATIC HV* S_invlist_intersection(pTHX_ HV* const a, HV* const b) { /* Return the intersection of two inversion lists. The basis for this @@ -6216,18 +6354,25 @@ S_invlist_intersection(pTHX_ HV* const a, HV* const b) } STATIC HV* -S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end) +S_add_range_to_invlist(pTHX_ HV* invlist, const UV start, const UV end) { /* Add the range from 'start' to 'end' inclusive to the inversion list's * set. A pointer to the inversion list is returned. This may actually be - * a new list, in which case the passed in one has been destroyed */ + * a new list, in which case the passed in one has been destroyed. The + * passed in inversion list can be NULL, in which case a new one is created + * with just the one range in it */ HV* range_invlist; HV* added_invlist; + UV len; - UV len = invlist_len(invlist); - - PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST; + if (invlist == NULL) { + invlist = _new_invlist(2); + len = 0; + } + else { + len = invlist_len(invlist); + } /* If comes after the final entry, can just append it to the end */ if (len == 0 @@ -6254,6 +6399,11 @@ S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end) return added_invlist; } +PERL_STATIC_INLINE HV* +S_add_cp_to_invlist(pTHX_ HV* invlist, const UV cp) { + return add_range_to_invlist(invlist, cp, cp); +} + /* End of inversion list object */ /* @@ -6452,11 +6602,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), + : (MORE_ASCII_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), num); *flagp |= HASWIDTH; @@ -6896,7 +7048,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; bool has_charset_modifier = 0; - regex_charset cs = REGEX_DEPENDS_CHARSET; + regex_charset cs = (RExC_utf8 || RExC_uni_semantics) + ? REGEX_UNICODE_CHARSET + : REGEX_DEPENDS_CHARSET; while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -6910,6 +7064,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } cs = REGEX_LOCALE_CHARSET; has_charset_modifier = 1; + RExC_contains_locale = 1; break; case UNICODE_PAT_MOD: if (has_charset_modifier || flagsp == &negflags) { @@ -6922,7 +7077,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (has_charset_modifier || flagsp == &negflags) { goto fail_modifiers; } - cs = REGEX_ASCII_RESTRICTED_CHARSET; + if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) { + /* Doubled modifier implies more restricted */ + cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET; + RExC_parse++; + } + else { + cs = REGEX_ASCII_RESTRICTED_CHARSET; + } has_charset_modifier = 1; break; case DEPENDS_PAT_MOD: @@ -7055,12 +7217,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); - if (freeze_paren) { - if (RExC_npar > after_freeze) - after_freeze = RExC_npar; - RExC_npar = freeze_paren; - } - /* branch_len = (paren != 0); */ if (br == NULL) @@ -7204,7 +7360,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (RExC_in_lookbehind) { RExC_in_lookbehind--; } - if (after_freeze) + if (after_freeze > RExC_npar) RExC_npar = after_freeze; return(ret); } @@ -7477,7 +7633,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } -/* reg_namedseq(pRExC_state,UVp) +/* reg_namedseq(pRExC_state,UVp, UV depth) This is expected to be called by a parser routine that has recognized '\N' and needs to handle the rest. RExC_parse is @@ -7520,13 +7676,10 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) Parsing failures will generate a fatal error via vFAIL(...) */ STATIC regnode * -S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) +S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth) { char * endbrace; /* '}' following the name */ regnode *ret = NULL; -#ifdef DEBUGGING - char* parse_start = RExC_parse - 2; /* points to the '\N' */ -#endif char* p; GET_RE_DEBUG_FLAGS_DECL; @@ -7639,117 +7792,55 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp) ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */ } else { /* Not a char class */ - char *s; /* String to put in generated EXACT node */ - STRLEN len = 0; /* Its current byte length */ + + /* What is done here is to convert this to a sub-pattern of the form + * (?:\x{char1}\x{char2}...) + * and then call reg recursively. That way, it retains its atomicness, + * while not having to worry about special handling that some code + * points may have. toke.c has converted the original Unicode values + * to native, so that we can just pass on the hex values unchanged. We + * do have to set a flag to keep recoding from happening in the + * recursion */ + + SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP); + STRLEN len; char *endchar; /* Points to '.' or '}' ending cur char in the input stream */ + char *orig_end = RExC_end; - ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : UNI_SEMANTICS - ? EXACTFU - : EXACTF)); - s= STRING(ret); - - /* Exact nodes can hold only a U8 length's of text = 255. Loop through - * the input which is of the form now 'c1.c2.c3...}' until find the - * ending brace or exceed length 255. The characters that exceed this - * limit are dropped. The limit could be relaxed should it become - * desirable by reparsing this as (?:\N{NAME}), so could generate - * multiple EXACT nodes, as is done for just regular input. But this - * is primarily a named character, and not intended to be a huge long - * string, so 255 bytes should be good enough */ - while (1) { - STRLEN length_of_hex; - I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX - | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); - UV cp; /* Ord of current character */ + while (RExC_parse < endbrace) { /* Code points are separated by dots. If none, there is only one * code point, and is terminated by the brace */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); - /* The values are Unicode even on EBCDIC machines */ - length_of_hex = (STRLEN)(endchar - RExC_parse); - cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL); - if ( length_of_hex == 0 - || length_of_hex != (STRLEN)(endchar - RExC_parse) ) - { - RExC_parse += length_of_hex; /* Includes all the valid */ - RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */ - ? UTF8SKIP(RExC_parse) - : 1; - /* Guard against malformed utf8 */ - if (RExC_parse >= endchar) RExC_parse = endchar; - vFAIL("Invalid hexadecimal number in \\N{U+...}"); - } - - if (! FOLD) { /* Not folding, just append to the string */ - STRLEN unilen; - - /* Quit before adding this character if would exceed limit */ - if (len + UNISKIP(cp) > U8_MAX) break; - - unilen = reguni(pRExC_state, cp, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } else { /* Folding, output the folded equivalent */ - STRLEN foldlen,numlen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; - cp = toFOLD_uni(cp, tmpbuf, &foldlen); - - /* Quit before exceeding size limit */ - if (len + foldlen > U8_MAX) break; - - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) - { - cp = utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, cp, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } + /* Convert to notation the rest of the code understands */ + sv_catpv(substitute_parse, "\\x{"); + sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse); + sv_catpv(substitute_parse, "}"); /* Point to the beginning of the next character in the sequence. */ RExC_parse = endchar + 1; - - /* Quit if no more characters */ - if (RExC_parse >= endbrace) break; } + sv_catpv(substitute_parse, ")"); + RExC_parse = SvPV(substitute_parse, len); - if (SIZE_ONLY) { - if (RExC_parse < endbrace) { - ckWARNreg(RExC_parse - 1, - "Using just the first characters returned by \\N{}"); - } - - RExC_size += STR_SZ(len); - } else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); + /* Don't allow empty number */ + if (len < 8) { + vFAIL("Invalid hexadecimal number in \\N{U+...}"); } + RExC_end = RExC_parse + len; - RExC_parse = endbrace + 1; + /* The values are Unicode, and therefore not subject to recoding */ + RExC_override_recoding = 1; + + ret = reg(pRExC_state, 1, flagp, depth+1); + + RExC_parse = endbrace; + RExC_end = orig_end; + RExC_override_recoding = 0; - *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail - with malformed in t/re/pat_advanced.t */ - RExC_parse --; - Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); } @@ -7912,27 +8003,6 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): -#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T) -#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below. - case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T): -#endif - do_foldchar: - if (!LOC && FOLD) { - U32 len,cp; - len=0; /* silence a spurious compiler warning */ - if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { - *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ - RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ - ret = reganode(pRExC_state, FOLDCHAR, cp); - Set_Node_Length(ret, 1); /* MJD */ - nextchar(pRExC_state); /* kill whitespace under /x */ - return ret; - } - } - goto outer_default; case '\\': /* Special Escapes @@ -7947,10 +8017,6 @@ tryagain: literal text handling code. */ switch ((U8)*++RExC_parse) { - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - goto do_foldchar; /* Special Escapes */ case 'A': RExC_seen_zerolen++; @@ -8000,6 +8066,7 @@ tryagain: op = ALNUMU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = ALNUMA; break; case REGEX_DEPENDS_CHARSET: @@ -8020,6 +8087,7 @@ tryagain: op = NALNUMU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = NALNUMA; break; case REGEX_DEPENDS_CHARSET: @@ -8042,6 +8110,7 @@ tryagain: op = BOUNDU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = BOUNDA; break; case REGEX_DEPENDS_CHARSET: @@ -8053,6 +8122,9 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead"); + } goto finish_meta_pat; case 'B': RExC_seen_zerolen++; @@ -8065,6 +8137,7 @@ tryagain: op = NBOUNDU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = NBOUNDA; break; case REGEX_DEPENDS_CHARSET: @@ -8076,6 +8149,9 @@ tryagain: ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; + if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') { + ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead"); + } goto finish_meta_pat; case 's': switch (get_regex_charset(RExC_flags)) { @@ -8086,6 +8162,7 @@ tryagain: op = SPACEU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = SPACEA; break; case REGEX_DEPENDS_CHARSET: @@ -8106,6 +8183,7 @@ tryagain: op = NSPACEU; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = NSPACEA; break; case REGEX_DEPENDS_CHARSET: @@ -8123,6 +8201,7 @@ tryagain: op = DIGITL; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = DIGITA; break; case REGEX_DEPENDS_CHARSET: /* No difference between these */ @@ -8141,6 +8220,7 @@ tryagain: op = NDIGITL; break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: op = NDIGITA; break; case REGEX_DEPENDS_CHARSET: /* No difference between these */ @@ -8219,7 +8299,7 @@ tryagain: Also this makes sure that things like /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq*/ ++RExC_parse; - ret= reg_namedseq(pRExC_state, NULL, flagp); + ret= reg_namedseq(pRExC_state, NULL, flagp, depth); break; case 'k': /* Handle \k and \k'NAME' */ parse_named_seq: @@ -8249,11 +8329,13 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (AT_LEAST_UNI_SEMANTICS) - ? NREFFU - : (LOC) - ? NREFFL - : NREFF), + : (MORE_ASCII_RESTRICTED) + ? NREFFA + : (AT_LEAST_UNI_SEMANTICS) + ? NREFFU + : (LOC) + ? NREFFL + : NREFF), num); *flagp |= HASWIDTH; @@ -8317,11 +8399,13 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (AT_LEAST_UNI_SEMANTICS) - ? REFFU - : (LOC) - ? REFFL - : REFF), + : (MORE_ASCII_RESTRICTED) + ? REFFA + : (AT_LEAST_UNI_SEMANTICS) + ? REFFU + : (LOC) + ? REFFL + : REFF), num); *flagp |= HASWIDTH; @@ -8353,27 +8437,42 @@ tryagain: /* FALL THROUGH */ default: - outer_default:{ + + parse_start = RExC_parse - 1; + + RExC_parse++; + + defchar: { + typedef enum { + generic_char = 0, + char_s, + upsilon_1, + upsilon_2, + iota_1, + iota_2, + } char_state; + char_state latest_char_state = generic_char; register STRLEN len; register UV ender; register char *p; char *s; STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; + regnode * orig_emit; - parse_start = RExC_parse - 1; - - RExC_parse++; - - defchar: ender = 0; + orig_emit = RExC_emit; /* Save the original output node position in + case we need to output a different node + type */ ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT : (LOC) ? EXACTFL - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF) + : (MORE_ASCII_RESTRICTED) + ? EXACTFA + : (AT_LEAST_UNI_SEMANTICS) + ? EXACTFU + : EXACTF) ); s = STRING(ret); for (len = 0, p = RExC_parse - 1; @@ -8385,11 +8484,6 @@ tryagain: if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); switch ((U8)*p) { - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) - goto normal_default; case '^': case '$': case '.': @@ -8414,11 +8508,6 @@ tryagain: switch ((U8)*++p) { /* These are all the special escapes. */ - case LATIN_SMALL_LETTER_SHARP_S: - case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S): - case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T): - if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) - goto normal_default; case 'A': /* Start assertion */ case 'b': case 'B': /* Word-boundary assertion*/ case 'C': /* Single char !DANGEROUS! */ @@ -8545,7 +8634,7 @@ tryagain: goto recode_encoding; break; recode_encoding: - { + if (! RExC_override_recoding) { SV* enc = PL_encoding; ender = reg_recode((const char)(U8)ender, &enc); if (!enc && SIZE_ONLY) @@ -8577,14 +8666,272 @@ tryagain: p += numlen; } else - ender = *p++; + ender = (U8) *p++; break; + } /* End of switch on the literal */ + + /* Certain characters are problematic because their folded + * length is so different from their original length that it + * isn't handleable by the optimizer. They are therefore not + * placed in an EXACTish node; and are here handled specially. + * (Even if the optimizer handled LATIN_SMALL_LETTER_SHARP_S, + * putting it in a special node keeps regexec from having to + * deal with a non-utf8 multi-char fold */ + if (FOLD + && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC))) + { + /* We look for either side of the fold. For example \xDF + * folds to 'ss'. We look for both the single character + * \xDF and the sequence 'ss'. When we find something that + * could be one of those, we stop and flush whatever we + * have output so far into the EXACTish node that was being + * built. Then restore the input pointer to what it was. + * regatom will return that EXACT node, and will be called + * again, positioned so the first character is the one in + * question, which we return in a different node type. + * The multi-char folds are a sequence, so the occurrence + * of the first character in that sequence doesn't + * necessarily mean that what follows is the rest of the + * sequence. We keep track of that with a state machine, + * with the state being set to the latest character + * processed before the current one. Most characters will + * set the state to 0, but if one occurs that is part of a + * potential tricky fold sequence, the state is set to that + * character, and the next loop iteration sees if the state + * should progress towards the final folded-from character, + * or if it was a false alarm. If it turns out to be a + * false alarm, the character(s) will be output in a new + * EXACTish node, and join_exact() will later combine them. + * In the case of the 'ss' sequence, which is more common + * and more easily checked, some look-ahead is done to + * save time by ruling-out some false alarms */ + switch (ender) { + default: + latest_char_state = generic_char; + break; + case 's': + case 'S': + if (AT_LEAST_UNI_SEMANTICS) { + if (latest_char_state == char_s) { /* 'ss' */ + ender = LATIN_SMALL_LETTER_SHARP_S; + goto do_tricky; + } + else if (p < RExC_end) { + + /* Look-ahead at the next character. If it + * is also an s, we handle as a sharp s + * tricky regnode. */ + if (*p == 's' || *p == 'S') { + + /* But first flush anything in the + * EXACTish buffer */ + if (len != 0) { + p = oldp; + goto loopdone; + } + p++; /* Account for swallowing this + 's' up */ + ender = LATIN_SMALL_LETTER_SHARP_S; + goto do_tricky; + } + /* Here, the next character is not a + * literal 's', but still could + * evaluate to one if part of a \o{}, + * \x or \OCTAL-DIGIT. The minimum + * length required for that is 4, eg + * \x53 or \123 */ + else if (*p == '\\' + && p < RExC_end - 4 + && (isDIGIT(*(p + 1)) + || *(p + 1) == 'x' + || *(p + 1) == 'o' )) + { + + /* Here, it could be an 's', too much + * bother to figure it out here. Flush + * the buffer if any; when come back + * here, set the state so know that the + * previous char was an 's' */ + if (len != 0) { + latest_char_state = generic_char; + p = oldp; + goto loopdone; + } + latest_char_state = char_s; + break; + } + } + } + + /* Here, can't be an 'ss' sequence, or at least not + * one that could fold to/from the sharp ss */ + latest_char_state = generic_char; + break; + case 0x03C5: /* First char in upsilon series */ + if (p < RExC_end - 4) { /* Need >= 4 bytes left */ + latest_char_state = upsilon_1; + if (len != 0) { + p = oldp; + goto loopdone; + } + } + else { + latest_char_state = generic_char; + } + break; + case 0x03B9: /* First char in iota series */ + if (p < RExC_end - 4) { + latest_char_state = iota_1; + if (len != 0) { + p = oldp; + goto loopdone; + } + } + else { + latest_char_state = generic_char; + } + break; + case 0x0308: + if (latest_char_state == upsilon_1) { + latest_char_state = upsilon_2; + } + else if (latest_char_state == iota_1) { + latest_char_state = iota_2; + } + else { + latest_char_state = generic_char; + } + break; + case 0x301: + if (latest_char_state == upsilon_2) { + ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; + goto do_tricky; + } + else if (latest_char_state == iota_2) { + ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS; + goto do_tricky; + } + latest_char_state = generic_char; + break; + + /* These are the tricky fold characters. Flush any + * buffer first. */ + case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: + case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS: + case LATIN_SMALL_LETTER_SHARP_S: + case LATIN_CAPITAL_LETTER_SHARP_S: + case 0x1FD3: + case 0x1FE3: + if (len != 0) { + p = oldp; + goto loopdone; + } + /* FALL THROUGH */ + do_tricky: { + char* const oldregxend = RExC_end; + U8 tmpbuf[UTF8_MAXBYTES+1]; + + /* Here, we know we need to generate a special + * regnode, and 'ender' contains the tricky + * character. What's done is to pretend it's in a + * [bracketed] class, and let the code that deals + * with those handle it, as that code has all the + * intelligence necessary. First save the current + * parse state, get rid of the already allocated + * but empty EXACT node that the ANYOFV node will + * replace, and point the parse to a buffer which + * we fill with the character we want the regclass + * code to think is being parsed */ + RExC_emit = orig_emit; + RExC_parse = (char *) tmpbuf; + if (UTF) { + U8 *d = uvchr_to_utf8(tmpbuf, ender); + *d = '\0'; + RExC_end = (char *) d; + } + else { /* ender above 255 already excluded */ + tmpbuf[0] = (U8) ender; + tmpbuf[1] = '\0'; + RExC_end = RExC_parse + 1; + } + + ret = regclass(pRExC_state,depth+1); + + /* Here, have parsed the buffer. Reset the parse to + * the actual input, and return */ + RExC_end = oldregxend; + RExC_parse = p - 1; + + Set_Node_Offset(ret, RExC_parse); + Set_Node_Cur_Length(ret); + nextchar(pRExC_state); + *flagp |= HASWIDTH|SIMPLE; + return ret; + } + } } + if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); if (UTF && FOLD) { - /* Prime the casefolded buffer. */ - ender = toFOLD_uni(ender, tmpbuf, &foldlen); + /* Prime the casefolded buffer. Locale rules, which apply + * only to code points < 256, aren't known until execution, + * so for them, just output the original character using + * utf8 */ + if (LOC && ender < 256) { + if (UNI_IS_INVARIANT(ender)) { + *tmpbuf = (U8) ender; + foldlen = 1; + } else { + *tmpbuf = UTF8_TWO_BYTE_HI(ender); + *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender); + foldlen = 2; + } + } + else if (isASCII(ender)) { /* Note: Here can't also be LOC + */ + ender = toLOWER(ender); + *tmpbuf = (U8) ender; + foldlen = 1; + } + else if (! MORE_ASCII_RESTRICTED && ! LOC) { + + /* Locale and /aa require more selectivity about the + * fold, so are handled below. Otherwise, here, just + * use the fold */ + ender = toFOLD_uni(ender, tmpbuf, &foldlen); + } + else { + /* Under locale rules or /aa we are not to mix, + * respectively, ords < 256 or ASCII with non-. So + * reject folds that mix them, using only the + * non-folded code point. So do the fold to a + * temporary, and inspect each character in it. */ + U8 trialbuf[UTF8_MAXBYTES_CASE+1]; + U8* s = trialbuf; + UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); + U8* e = s + foldlen; + bool fold_ok = TRUE; + + while (s < e) { + if (isASCII(*s) + || (LOC && (UTF8_IS_INVARIANT(*s) + || UTF8_IS_DOWNGRADEABLE_START(*s)))) + { + fold_ok = FALSE; + break; + } + s += UTF8SKIP(s); + } + if (fold_ok) { + Copy(trialbuf, tmpbuf, foldlen, U8); + ender = tmpender; + } + else { + uvuni_to_utf8(tmpbuf, ender); + foldlen = UNISKIP(ender); + } + } } if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) @@ -8656,10 +9003,12 @@ tryagain: } len--; } - else + else { REGC((char)ender, s++); + } } - loopdone: + loopdone: /* Jumped to when encounters something that shouldn't be in + the node */ RExC_parse = p - 1; Set_Node_Cur_Length(ret); /* MJD */ nextchar(pRExC_state); @@ -8882,14 +9231,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) ANYOF_##NAME: \ for (value = 0; value < 256; value++) \ if (TEST) \ - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ + stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ for (value = 0; value < 256; value++) \ if (!TEST) \ - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ + stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ yesno = '!'; \ what = WORD; \ break @@ -8904,14 +9253,14 @@ ANYOF_##NAME: \ else if (UNI_SEMANTICS) { \ for (value = 0; value < 256; value++) { \ if (TEST_8(value)) stored += \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ + set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ } \ } \ else { \ for (value = 0; value < 128; value++) { \ if (TEST_7(UNI_TO_NATIVE(value))) stored += \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, \ - (U8) UNI_TO_NATIVE(value), &nonbitmap); \ + set_regclass_bit(pRExC_state, ret, \ + (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ } \ } \ yesno = '+'; \ @@ -8922,20 +9271,20 @@ case ANYOF_N##NAME: \ else if (UNI_SEMANTICS) { \ for (value = 0; value < 256; value++) { \ if (! TEST_8(value)) stored += \ - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); \ + set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \ } \ } \ else { \ for (value = 0; value < 128; value++) { \ - if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit( \ - aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ + if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \ + pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ } \ - if (ASCII_RESTRICTED) { \ + if (AT_LEAST_ASCII_RESTRICTED) { \ for (value = 128; value < 256; value++) { \ - stored += S_set_regclass_bit( \ - aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \ + stored += set_regclass_bit( \ + pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \ } \ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \ + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \ } \ else { \ /* For a non-ut8 target string with DEPENDS semantics, all above \ @@ -8943,69 +9292,151 @@ case ANYOF_N##NAME: \ * classes. But in utf8, they have their Unicode semantics, so \ * can't just set them in the bitmap, or else regexec.c will think \ * they matched when they shouldn't. */ \ - ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \ + ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \ } \ } \ yesno = '!'; \ what = WORD; \ break -/* - We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test - so that it is possible to override the option here without having to - rebuild the entire core. as we are required to do if we change regcomp.h - which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. -*/ -#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS -#define BROKEN_UNICODE_CHARCLASS_MAPPINGS -#endif - -#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS -#define POSIX_CC_UNI_NAME(CCNAME) CCNAME -#else -#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME -#endif - STATIC U8 -S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) +S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr) { /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. * Locale folding is done at run-time, so this function should not be * called for nodes that are for locales. * - * This function simply sets the bit corresponding to the fold of the input + * This function sets the bit corresponding to the fold of the input * 'value', if not already set. The fold of 'f' is 'F', and the fold of * 'F' is 'f'. * - * It also sets any necessary flags, and returns the number of bits that - * actually changed from 0 to 1 */ + * It also knows about the characters that are in the bitmap that have + * folds that are matchable only outside it, and sets the appropriate lists + * and flags. + * + * It returns the number of bits that actually changed from 0 to 1 */ U8 stored = 0; U8 fold; + PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD; + fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value] - : PL_fold[value]; + : PL_fold[value]; /* It assumes the bit for 'value' has already been set */ if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) { ANYOF_BITMAP_SET(node, fold); stored++; } - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) - || (! UNI_SEMANTICS - && ! isASCII(value) - && PL_fold_latin1[value] != value)) - { /* A character that has a fold outside of Latin1 matches outside the - bitmap, but only when the target string is utf8. Similarly when we - don't have unicode semantics for the above ASCII Latin-1 characters, - and they have a fold, they should match if the target is utf8, and - not otherwise */ - if (! *nonbitmap_ptr) { - *nonbitmap_ptr = _new_invlist(2); + if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) { + /* Certain Latin1 characters have matches outside the bitmap. To get + * here, 'value' is one of those characters. None of these matches is + * valid for ASCII characters under /aa, which have been excluded by + * the 'if' above. The matches fall into three categories: + * 1) They are singly folded-to or -from an above 255 character, as + * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y + * WITH DIAERESIS; + * 2) They are part of a multi-char fold with another character in the + * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill; + * 3) They are part of a multi-char fold with a character not in the + * bitmap, such as various ligatures. + * We aren't dealing fully with multi-char folds, except we do deal + * with the pattern containing a character that has a multi-char fold + * (not so much the inverse). + * For types 1) and 3), the matches only happen when the target string + * is utf8; that's not true for 2), and we set a flag for it. + * + * The code below adds to the passed in inversion list the single fold + * closures for 'value'. The values are hard-coded here so that an + * innocent-looking character class, like /[ks]/i won't have to go out + * to disk to find the possible matches. XXX It would be better to + * generate these via regen, in case a new version of the Unicode + * standard adds new mappings, though that is not really likely. */ + switch (value) { + case 'k': + case 'K': + /* KELVIN SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A); + break; + case 's': + case 'S': + /* LATIN SMALL LETTER LONG S */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F); + break; + case MICRO_SIGN: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_SMALL_LETTER_MU); + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + GREEK_CAPITAL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + /* ANGSTROM SIGN */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B); + if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + PL_fold_latin1[value]); + } + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, + LATIN_CAPITAL_LETTER_SHARP_S); + + /* Under /a, /d, and /u, this can match the two chars "ss" */ + if (! MORE_ASCII_RESTRICTED) { + add_alternate(alternate_ptr, (U8 *) "ss", 2); + + /* And under /u or /a, it can match even if the target is + * not utf8 */ + if (AT_LEAST_UNI_SEMANTICS) { + ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; + } + } + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + /* These all are targets of multi-character folds, which can + * occur with only non-Latin1 characters in the fold, so they + * can match if the target string isn't UTF-8 */ + ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; + break; + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds, which occur + * only with a non-Latin1 character as part of the fold, so + * they can't match unless the target string is in UTF-8, so no + * action here is necessary */ + break; + default: + /* Use deprecated warning to increase the chances of this + * being output */ + ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value); + break; } - *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value); - ANYOF_FLAGS(node) |= ANYOF_UTF8; + } + else if (DEPENDS_SEMANTICS + && ! isASCII(value) + && PL_fold_latin1[value] != value) + { + /* Under DEPENDS rules, non-ASCII Latin1 characters match their + * folds only when the target string is in UTF-8. We add the fold + * here to the list of things to match outside the bitmap, which + * won't be looked at unless it is UTF8 (or else if something else + * says to look even if not utf8, but those things better not happen + * under DEPENDS semantics. */ + *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]); } return stored; @@ -9013,7 +9444,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr) +S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** invlist_ptr, AV** alternate_ptr) { /* This inline function sets a bit in the bitmap if not already set, and if * appropriate, its fold, returning the number of bits that actually @@ -9021,6 +9452,8 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu U8 stored; + PERL_ARGS_ASSERT_SET_REGCLASS_BIT; + if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */ return 0; } @@ -9029,18 +9462,36 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu stored = 1; if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ - stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr); + stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr); } return stored; } +STATIC void +S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) +{ + /* Adds input 'string' with length 'len' to the ANYOF node's unicode + * alternate list, pointed to by 'alternate_ptr'. This is an array of + * the multi-character folds of characters in the node */ + SV *sv; + + PERL_ARGS_ASSERT_ADD_ALTERNATE; + + if (! *alternate_ptr) { + *alternate_ptr = newAV(); + } + sv = newSVpvn_utf8((char*)string, len, TRUE); + av_push(*alternate_ptr, sv); + return; +} + /* parse a class specification and produce either an ANYOF node that - matches the pattern or if the pattern matches a single char only and - that char is < 256 and we are case insensitive then we produce an - EXACT node instead. -*/ + matches the pattern or perhaps will be optimized into an EXACTish node + instead. The node contains a bit map for the first 256 characters, with the + corresponding bit set if that character is in the list. For characters + above 255, a range list is used */ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) @@ -9056,8 +9507,30 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) char *rangebegin = NULL; bool need_class = 0; SV *listsv = NULL; + STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more + than just initialized. */ UV n; + + /* code points this node matches that can't be stored in the bitmap */ HV* nonbitmap = NULL; + + /* The items that are to match that aren't stored in the bitmap, but are a + * result of things that are stored there. This is the fold closure of + * such a character, either because it has DEPENDS semantics and shouldn't + * be matched unless the target string is utf8, or is a code point that is + * too large for the bit map, as for example, the fold of the MICRO SIGN is + * above 255. This all is solely for performance reasons. By having this + * code know the outside-the-bitmap folds that the bitmapped characters are + * involved with, we don't have to go out to disk to find the list of + * matches, unless the character class includes code points that aren't + * storable in the bit map. That means that a character class with an 's' + * in it, for example, doesn't need to go out to disk to find everything + * that matches. A 2nd list is used so that the 'nonbitmap' list is kept + * empty unless there is something whose fold we don't know about, and will + * have to go out to the disk to find. */ + HV* l1_fold_invlist = NULL; + + /* List of multi-character folds that are matched by this node */ AV* unicode_alternate = NULL; #ifdef EBCDIC UV literal_endpoint = 0; @@ -9093,23 +9566,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (SIZE_ONLY) { RExC_size += ANYOF_SKIP; -#ifdef ANYOF_ADD_LOC_SKIP - if (LOC) { - RExC_size += ANYOF_ADD_LOC_SKIP; - } -#endif listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */ } else { RExC_emit += ANYOF_SKIP; if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; -#ifdef ANYOF_ADD_LOC_SKIP - RExC_emit += ANYOF_ADD_LOC_SKIP; -#endif } ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); + initial_listsv_len = SvCUR(listsv); } nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0; @@ -9175,7 +9641,7 @@ parseit: from earlier versions, OTOH that behaviour was broken as well. */ UV v; /* value is register so we cant & it /grrr */ - if (reg_namedseq(pRExC_state, &v, NULL)) { + if (reg_namedseq(pRExC_state, &v, NULL, depth)) { goto parseit; } value= v; @@ -9204,14 +9670,7 @@ parseit: e = RExC_parse; n = 1; } - if (SIZE_ONLY) { - if (LOC) { - ckWARN2reg(RExC_parse, - "\\%c uses Unicode rules, not locale rules", - (int) value); - } - } - else { + if (!SIZE_ONLY) { if (UCHARAT(RExC_parse) == '^') { RExC_parse++; n--; @@ -9238,7 +9697,7 @@ parseit: /* The \p could match something in the Latin1 range, hence * something that isn't utf8 */ - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP; + ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; namedclass = ANYOF_MAX; /* no official name, but it's named */ /* \p means they want Unicode semantics */ @@ -9307,7 +9766,7 @@ parseit: break; } recode_encoding: - { + if (! RExC_override_recoding) { SV* enc = PL_encoding; value = reg_recode((const char)(U8)value, &enc); if (!enc && SIZE_ONLY) @@ -9341,21 +9800,18 @@ parseit: if (LOC && namedclass < ANYOF_MAX && ! need_class) { need_class = 1; if (SIZE_ONLY) { -#ifdef ANYOF_CLASS_ADD_SKIP - RExC_size += ANYOF_CLASS_ADD_SKIP; -#endif + RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; } else { -#ifdef ANYOF_CLASS_ADD_SKIP - RExC_emit += ANYOF_CLASS_ADD_SKIP; -#endif + RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP; ANYOF_CLASS_ZERO(ret); } ANYOF_FLAGS(ret) |= ANYOF_CLASS; } /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a - * literal */ + * literal, as is the character that began the false range, i.e. + * the 'a' in the examples */ if (range) { if (!SIZE_ONLY) { const int w = @@ -9365,16 +9821,14 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); + stored += + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); if (prevvalue < 256) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap); - stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate); } else { - ANYOF_FLAGS(ret) |= ANYOF_UTF8; - Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-'); + nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue); } } @@ -9403,15 +9857,9 @@ parseit: case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace"); case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct"); case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper"); -#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS /* \s, \w match all unicode if utf8. */ case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl"); case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word"); -#else - /* \s, \w match ascii and locale only */ - case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace"); - case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord"); -#endif case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); @@ -9421,7 +9869,7 @@ parseit: else { for (value = 0; value < 128; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); } yesno = '+'; what = NULL; /* Doesn't match outside ascii, so @@ -9433,7 +9881,7 @@ parseit: else { for (value = 128; value < 256; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); } ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; yesno = '!'; @@ -9446,10 +9894,10 @@ parseit: /* consecutive digits assumed */ for (value = '0'; value <= '9'; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); } yesno = '+'; - what = POSIX_CC_UNI_NAME("Digit"); + what = "Digit"; break; case ANYOF_NDIGIT: if (LOC) @@ -9458,14 +9906,14 @@ parseit: /* consecutive digits assumed */ for (value = 0; value < '0'; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); for (value = '9' + 1; value < 256; value++) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); } yesno = '!'; - what = POSIX_CC_UNI_NAME("Digit"); - if (ASCII_RESTRICTED ) { + what = "Digit"; + if (AT_LEAST_ASCII_RESTRICTED ) { ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; } break; @@ -9476,10 +9924,9 @@ parseit: vFAIL("Invalid [::] class"); break; } - if (what && ! (ASCII_RESTRICTED)) { + if (what && ! (AT_LEAST_ASCII_RESTRICTED)) { /* Strings such as "+utf8::isWord\n" */ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); - ANYOF_FLAGS(ret) |= ANYOF_UTF8; } continue; @@ -9495,8 +9942,10 @@ parseit: } else { prevvalue = value; /* save the beginning of the range */ - if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && - RExC_parse[1] != ']') { + if (RExC_parse+1 < RExC_end + && *RExC_parse == '-' + && RExC_parse[1] != ']') + { RExC_parse++; /* a bad range like \w-, [:word:]- ? */ @@ -9511,13 +9960,15 @@ parseit: } if (!SIZE_ONLY) stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap); + set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ } } + /* non-Latin1 code point implies unicode semantics. Must be set in + * pass1 so is there for the whole of pass 2 */ if (value > 255) { RExC_uni_semantics = 1; } @@ -9538,129 +9989,27 @@ parseit: for (i = prevvalue; i <= ceilvalue; i++) if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } else { for (i = prevvalue; i <= ceilvalue; i++) if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { stored += - S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); + set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } } else #endif for (i = prevvalue; i <= ceilvalue; i++) { - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap); + stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); } } if (value > 255) { const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); const UV natvalue = NATIVE_TO_UNI(value); - if (! nonbitmap) { - nonbitmap = _new_invlist(2); - } nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); - ANYOF_FLAGS(ret) |= ANYOF_UTF8; } -#if 0 - - /* If the code point requires utf8 to represent, and we are not - * folding, it can't match unless the target is in utf8. Only - * a few code points above 255 fold to below it, so XXX an - * optimization would be to know which ones and set the flag - * appropriately. */ - ANYOF_FLAGS(ret) |= (FOLD || value < 256) - ? ANYOF_NONBITMAP - : ANYOF_UTF8; - if (prevnatvalue < natvalue) { /* '>' case is fatal error above */ - - /* The \t sets the whole range */ - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - prevnatvalue, natvalue); - - /* Currently, we don't look at every value in the range. - * Therefore we have to assume the worst case: that if - * folding, it will match more than one character. But in - * lookbehind patterns, can only be single character - * length, so disallow those folds */ - if (FOLD && ! RExC_in_lookbehind) { - OP(ret) = ANYOFV; - } - } - else if (prevnatvalue == natvalue) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue); - if (FOLD) { - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const UV f = to_uni_fold(natvalue, foldbuf, &foldlen); - -#ifdef EBCDIC /* RD t/uni/fold ff and 6b */ - if (RExC_precomp[0] == ':' && - RExC_precomp[1] == '[' && - (f == 0xDF || f == 0x92)) { - f = NATIVE_TO_UNI(f); - } -#endif - /* If folding and foldable and a single - * character, insert also the folded version - * to the charclass. */ - if (f != value) { -#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */ - if ((RExC_precomp[0] == ':' && - RExC_precomp[1] == '[' && - (f == 0xA2 && - (value == 0xFB05 || value == 0xFB06))) ? - foldlen == ((STRLEN)UNISKIP(f) - 1) : - foldlen == (STRLEN)UNISKIP(f) ) -#else - if (foldlen == (STRLEN)UNISKIP(f)) -#endif - Perl_sv_catpvf(aTHX_ listsv, - "%04"UVxf"\n", f); - else if (! RExC_in_lookbehind) { - /* Any multicharacter foldings - * (disallowed in lookbehind patterns) - * require the following transform: - * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) - * where E folds into "pq" and F folds - * into "rst", all other characters - * fold to single characters. We save - * away these multicharacter foldings, - * to be later saved as part of the - * additional "s" data. */ - SV *sv; - - if (!unicode_alternate) - unicode_alternate = newAV(); - sv = newSVpvn_utf8((char*)foldbuf, foldlen, - TRUE); - av_push(unicode_alternate, sv); - OP(ret) = ANYOFV; - } - } - - /* If folding and the value is one of the Greek - * sigmas insert a few more sigmas to make the - * folding rules of the sigmas to work right. - * Note that not all the possible combinations - * are handled here: some of them are handled - * by the standard folding rules, and some of - * them (literal or EXACTF cases) are handled - * during runtime in regexec.c:S_find_byclass(). */ - if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); - } - else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", - (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA); - } - } - } -#endif #ifdef EBCDIC literal_endpoint = 0; #endif @@ -9675,227 +10024,229 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - /* Finish up the non-bitmap entries */ - if (nonbitmap) { - UV* nonbitmap_array; + /* If folding and there are code points above 255, we calculate all + * characters that could fold to or from the ones already on the list */ + if (FOLD && nonbitmap) { UV i; - /* If folding, we add to the list all characters that could fold to or - * from the ones already on the list */ - if (FOLD) { - HV* fold_intersection; - UV* fold_list; - - /* This is a list of all the characters that participate in folds - * (except marks, etc in multi-char folds */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); - } + HV* fold_intersection; + UV* fold_list; - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { - - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode - * rules hard-coded into Perl. (This case happens legitimately - * during compilation of Perl itself before the Unicode tables - * are generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = _new_invlist(0); - } else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - to_utf8_fold((U8*) "A", dummy, &dummy_len); - } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + /* This is a list of all the characters that participate in folds + * (except marks, etc in multi-char folds */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); + PL_utf8_foldable = _swash_to_invlist(swash); + } + + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If we were unable to find any folds, then we likely won't be + * able to find the closures. So just create an empty list. + * Folding will effectively be restricted to the non-Unicode rules + * hard-coded into Perl. (This case happens legitimately during + * compilation of Perl itself before the Unicode tables are + * generated) */ + if (invlist_len(PL_utf8_foldable) == 0) { + PL_utf8_foldclosures = _new_invlist(0); + } else { + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); } + } - /* Only the characters in this class that participate in folds need - * be checked. Get the intersection of this class and all the - * possible characters that are foldable. This can quickly narrow - * down a large class */ - fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap); - - /* Now look at the foldable characters in this class individually */ - fold_list = invlist_array(fold_intersection); - for (i = 0; i < invlist_len(fold_intersection); i++) { - UV j; - - /* The next entry is the beginning of the range that is in the - * class */ - UV start = fold_list[i++]; - - - /* The next entry is the beginning of the next range, which - * isn't in the class, so the end of the current range is one - * less than that */ - UV end = fold_list[i] - 1; - - /* Look at every character in the range */ - for (j = start; j <= end; j++) { - - /* Get its fold */ - U8 foldbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const UV f = to_uni_fold(j, foldbuf, &foldlen); - - if (foldlen > (STRLEN)UNISKIP(f)) { - - /* Any multicharacter foldings (disallowed in - * lookbehind patterns) require the following - * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where - * E folds into "pq" and F folds into "rst", all other - * characters fold to single characters. We save away - * these multicharacter foldings, to be later saved as - * part of the additional "s" data. */ - if (! RExC_in_lookbehind) { - /* XXX Discard this fold if any are latin1 and LOC */ - SV *sv; - - if (!unicode_alternate) { - unicode_alternate = newAV(); - } - sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE); - av_push(unicode_alternate, sv); + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap); - /* This node is variable length */ - OP(ret) = ANYOFV; - ANYOF_FLAGS(ret) |= ANYOF_UNICODE; - } - } - else { /* Single character fold */ - SV** listp; - - /* Consider "k" =~ /[K]/i. The line above would have - * just folded the 'k' to itself, and that isn't going - * to match 'K'. So we look through the closure of - * everything that folds to 'k'. That will find the - * 'K'. Initialize the list, if necessary */ - - /* The data structure is a hash with the keys every - * character that is folded to, like 'k', and the - * values each an array of everything that folds to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) foldbuf, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV k; - for (k = 0; k <= av_len(list); k++) { - SV** c_p = av_fetch(list, k, FALSE); - UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - c = SvUV(*c_p); + /* Now look at the foldable characters in this class individually */ + fold_list = invlist_array(fold_intersection); + for (i = 0; i < invlist_len(fold_intersection); i++) { + UV j; - if (c < 256 && AT_LEAST_UNI_SEMANTICS) { - stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap); + /* The next entry is the beginning of the range that is in the + * class */ + UV start = fold_list[i++]; + + + /* The next entry is the beginning of the next range, which + * isn't in the class, so the end of the current range is one + * less than that */ + UV end = fold_list[i] - 1; + + /* Look at every character in the range */ + for (j = start; j <= end; j++) { + + /* Get its fold */ + U8 foldbuf[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen; + const UV f = to_uni_fold(j, foldbuf, &foldlen); + + if (foldlen > (STRLEN)UNISKIP(f)) { + + /* Any multicharacter foldings (disallowed in + * lookbehind patterns) require the following + * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where + * E folds into "pq" and F folds into "rst", all other + * characters fold to single characters. We save away + * these multicharacter foldings, to be later saved as + * part of the additional "s" data. */ + if (! RExC_in_lookbehind) { + U8* loc = foldbuf; + U8* e = foldbuf + foldlen; + + /* If any of the folded characters of this are in + * the Latin1 range, tell the regex engine that + * this can match a non-utf8 target string. The + * only multi-byte fold whose source is in the + * Latin1 range (U+00DF) applies only when the + * target string is utf8, or under unicode rules */ + if (j > 255 || AT_LEAST_UNI_SEMANTICS) { + while (loc < e) { + + /* Can't mix ascii with non- under /aa */ + if (MORE_ASCII_RESTRICTED + && (isASCII(*loc) != isASCII(j))) + { + goto end_multi_fold; } - /* It may be that the code point is already - * in this range or already in the bitmap, - * XXX THink about LOC - * in which case we need do nothing */ - else if ((c < start || c > end) - && (c > 255 - || ! ANYOF_BITMAP_TEST(ret, c))) + if (UTF8_IS_INVARIANT(*loc) + || UTF8_IS_DOWNGRADEABLE_START(*loc)) { - nonbitmap = add_range_to_invlist(nonbitmap, c, c); + /* Can't mix above and below 256 under + * LOC */ + if (LOC) { + goto end_multi_fold; + } + ANYOF_FLAGS(ret) + |= ANYOF_NONBITMAP_NON_UTF8; + break; } + loc += UTF8SKIP(loc); } } - } - } - } - invlist_destroy(fold_intersection); - } /* End of processing all the folds */ - - /* Here have the full list of items to match that aren't in the - * bitmap. Convert to the structure that the rest of the code is - * expecting. XXX That rest of the code should convert to this - * structure */ - nonbitmap_array = invlist_array(nonbitmap); - for (i = 0; i < invlist_len(nonbitmap); i++) { - /* The next entry is the beginning of the range that is in the - * class */ - UV start = nonbitmap_array[i++]; + add_alternate(&unicode_alternate, foldbuf, foldlen); + end_multi_fold: ; + } - /* The next entry is the beginning of the next range, which isn't - * in the class, so the end of the current range is one less than - * that */ - UV end = nonbitmap_array[i] - 1; + /* This is special-cased, as it is the only letter which + * has both a multi-fold and single-fold in Latin1. All + * the other chars that have single and multi-folds are + * always in utf8, and the utf8 folding algorithm catches + * them */ + if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) { + stored += set_regclass_bit(pRExC_state, + ret, + LATIN_SMALL_LETTER_SHARP_S, + &l1_fold_invlist, &unicode_alternate); + } + } + else { + /* Single character fold. Add everything in its fold + * closure to the list that this node should match */ + SV** listp; + + /* The fold closures data structure is a hash with the + * keys being every character that is folded to, like + * 'k', and the values each an array of everything that + * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) foldbuf, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_len(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and + * non-; /l doesn't allow them between above + * and below 256 */ + if ((MORE_ASCII_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) + { + continue; + } - if (start == end) { - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start); - } - else { - /* The \t sets the whole range */ - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - /* XXX EBCDIC */ - start, end); + if (c < 256 && AT_LEAST_UNI_SEMANTICS) { + stored += set_regclass_bit(pRExC_state, + ret, + (U8) c, + &l1_fold_invlist, &unicode_alternate); + } + /* It may be that the code point is already + * in this range or already in the bitmap, + * in which case we need do nothing */ + else if ((c < start || c > end) + && (c > 255 + || ! ANYOF_BITMAP_TEST(ret, c))) + { + nonbitmap = add_cp_to_invlist(nonbitmap, c); + } + } + } + } } } - invlist_destroy(nonbitmap); + invlist_destroy(fold_intersection); + } + + /* Combine the two lists into one. */ + if (l1_fold_invlist) { + if (nonbitmap) { + nonbitmap = invlist_union(nonbitmap, l1_fold_invlist); + } + else { + nonbitmap = l1_fold_invlist; + } } + /* Here, we have calculated what code points should be in the character + * class. Now we can see about various optimizations. Fold calculation + * needs to take place before inversion. Otherwise /[^k]/i would invert to + * include K, which under /i would match k. */ + /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't * set the FOLD flag yet, so this this does optimize those. It doesn't * optimize locale. Doing so perhaps could be done as long as there is * nothing like \w in it; some thought also would have to be given to the * interaction with above 0x100 chars */ - if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + if (! LOC + && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT + && ! unicode_alternate + && ! nonbitmap + && SvCUR(listsv) == initial_listsv_len) + { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= 0xFF; stored = 256 - stored; /* The inversion means that everything above 255 is matched; and at the * same time we clear the invert flag */ - ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL; + ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; } - if (FOLD) { - SV *sv; - - /* This is the one character in the bitmap that needs special handling - * under non-locale folding, as it folds to two characters 'ss'. This - * happens if it is set and not inverting, or isn't set and are - * inverting (disallowed in lookbehind patterns because they can't be - * variable length) */ - if (! LOC - && ! RExC_in_lookbehind - && (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S)) - ^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT))) - { - OP(ret) = ANYOFV; /* Can match more than a single char */ - - /* Under Unicode semantics), it can do this when the target string - * isn't in utf8 */ - if (UNI_SEMANTICS) { - ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; - } - - if (!unicode_alternate) { - unicode_alternate = newAV(); - } - sv = newSVpvn_utf8("ss", 2, TRUE); - av_push(unicode_alternate, sv); - } - - /* Folding in the bitmap is taken care of above, but not for locale - * (for which we have to wait to see what folding is in effect at - * runtime), and for things not in the bitmap. Set run-time fold flag - * for these */ - if ((LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; - } + /* Folding in the bitmap is taken care of above, but not for locale (for + * which we have to wait to see what folding is in effect at runtime), and + * for things not in the bitmap. Set run-time fold flag for these */ + if (FOLD && (LOC || nonbitmap || unicode_alternate)) { + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; } /* A single character class can be "optimized" into an EXACTish node. @@ -9912,7 +10263,10 @@ parseit: * characters which only have the two folds; so things like 'fF' and 'Ii' * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE * FI'. */ - if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL)) + if (! nonbitmap + && ! unicode_alternate + && SvCUR(listsv) == initial_listsv_len + && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) @@ -9979,7 +10333,54 @@ parseit: return ret; } - { + if (nonbitmap) { + UV* nonbitmap_array = invlist_array(nonbitmap); + UV nonbitmap_len = invlist_len(nonbitmap); + UV i; + + /* Here have the full list of items to match that aren't in the + * bitmap. Convert to the structure that the rest of the code is + * expecting. XXX That rest of the code should convert to this + * structure */ + for (i = 0; i < nonbitmap_len; i++) { + + /* The next entry is the beginning of the range that is in the + * class */ + UV start = nonbitmap_array[i++]; + UV end; + + /* The next entry is the beginning of the next range, which isn't + * in the class, so the end of the current range is one less than + * that. But if there is no next range, it means that the range + * begun by 'start' extends to infinity, which for this platform + * ends at UV_MAX */ + if (i == nonbitmap_len) { + end = UV_MAX; + } + else { + end = nonbitmap_array[i] - 1; + } + + if (start == end) { + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start); + } + else { + /* The \t sets the whole range */ + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + /* XXX EBCDIC */ + start, end); + } + } + invlist_destroy(nonbitmap); + } + + if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) { + ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); + SvREFCNT_dec(listsv); + SvREFCNT_dec(unicode_alternate); + } + else { + AV * const av = newAV(); SV *rv; /* The 0th element stores the character class description @@ -9991,6 +10392,9 @@ parseit: av_store(av, 0, listsv); av_store(av, 1, NULL); av_store(av, 2, MUTABLE_SV(unicode_alternate)); + if (unicode_alternate) { /* This node is variable length */ + OP(ret) = ANYOFV; + } rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, 1, "s"); RExC_rxi->data->data[n] = (void*)rv; @@ -10372,6 +10776,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, switch (OP(scan)) { case EXACT: case EXACTF: + case EXACTFA: case EXACTFU: case EXACTFL: if( exact == PSEUDO ) @@ -10453,6 +10858,9 @@ S_regdump_extflags(pTHX_ const char *lead, const U32 flags) case REGEX_ASCII_RESTRICTED_CHARSET: PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED"); break; + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: + PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED"); + break; default: PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET"); break; @@ -10780,12 +11188,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_UNICODE_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (flags & ANYOF_UTF8) + else if (ANYOF_NONBITMAP(o)) sv_catpvs(sv, "{unicode}"); if (flags & ANYOF_NONBITMAP_NON_UTF8) sv_catpvs(sv, "{outside bitmap}"); - { + if (ANYOF_NONBITMAP(o)) { SV *lv; SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);