X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/e1ee396065926b895c7d618e39afbf16cb549f31..5c388b33f99c7a69f32810e1889f45652f531eab:/regexec.c diff --git a/regexec.c b/regexec.c index 33fb5da..6a209ad 100644 --- a/regexec.c +++ b/regexec.c @@ -231,15 +231,15 @@ static const char* const non_utf8_target_but_utf8_required #if 0 /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so - we don't need this definition. */ + we don't need this definition. XXX These are now out-of-sync*/ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) #define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ -#define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) +#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -485,7 +485,7 @@ S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum); } - assert(0); /* NOTREACHED */ + NOT_REACHED; /* NOTREACHED */ return FALSE; } @@ -498,7 +498,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) * '_char_class_number'. * * This just calls isFOO_lc on the code point for the character if it is in - * the range 0-255. Outside that range, all characters avoid Unicode + * the range 0-255. Outside that range, all characters use Unicode * rules, ignoring any locale. So use the Unicode function if this class * requires a swash, and use the Unicode macro otherwise. 
*/ @@ -512,6 +512,8 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character)); + if (classnum < _FIRST_NON_SWASH_CC) { /* Initialize the swash unless done already */ @@ -703,6 +705,7 @@ Perl_re_intuit_start(pTHX_ goto fail; } + RX_MATCH_UTF8_set(rx,utf8_target); reginfo->is_utf8_target = cBOOL(utf8_target); reginfo->info_aux = NULL; reginfo->strbeg = strbeg; @@ -772,7 +775,7 @@ Perl_re_intuit_start(pTHX_ * be too fiddly (e.g. REXEC_IGNOREPOS). */ if ( strpos != strbeg - && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL))) + && (prog->intflags & PREGf_ANCH_SBOL)) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " Not at start...\n")); @@ -896,7 +899,7 @@ Perl_re_intuit_start(pTHX_ /* If the regex is absolutely anchored to either the start of the - * string (BOL,SBOL) or to pos() (ANCH_GPOS), then + * string (SBOL) or to pos() (ANCH_GPOS), then * check_offset_max represents an upper bound on the string where * the substr could start. For the ANCH_GPOS case, we assume that * the caller of intuit will have already set strpos to @@ -1432,23 +1435,39 @@ Perl_re_intuit_start(pTHX_ #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ - trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ - trie_type = ((scan->flags == EXACT) \ - ? (utf8_target ? trie_utf8 : trie_plain) \ - : (scan->flags == EXACTFA) \ - ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \ + trie_utf8l, trie_flu8 } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (scan->flags == EXACTL) \ + ? (utf8_target ? 
trie_utf8l : trie_plain) \ + : (scan->flags == EXACTFA) \ + ? (utf8_target \ + ? trie_utf8_exactfa_fold \ + : trie_latin_utf8_exactfa_fold) \ + : (scan->flags == EXACTFLU8 \ + ? trie_flu8 \ + : (utf8_target \ + ? trie_utf8_fold \ + : trie_latin_utf8_fold))) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ STMT_START { \ STRLEN skiplen; \ U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_flu8: \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \ + } \ + goto do_trie_utf8_fold; \ case trie_utf8_exactfa_fold: \ flags |= FOLD_FLAGS_NOMIX_ASCII; \ - /* FALLTHROUGH */ \ + /* FALLTHROUGH */ \ case trie_utf8_fold: \ + do_trie_utf8_fold: \ if ( foldlen>0 ) { \ uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ @@ -1464,7 +1483,7 @@ STMT_START { break; \ case trie_latin_utf8_exactfa_fold: \ flags |= FOLD_FLAGS_NOMIX_ASCII; \ - /* FALLTHROUGH */ \ + /* FALLTHROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ @@ -1479,6 +1498,12 @@ STMT_START { uscan = foldbuf + skiplen; \ } \ break; \ + case trie_utf8l: \ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \ + } \ + /* FALLTHROUGH */ \ case trie_utf8: \ uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ @@ -1738,6 +1763,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. 
*/ switch (OP(c)) { + case ANYOFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case ANYOF: if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( @@ -1779,6 +1807,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; case EXACTFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; @@ -1793,6 +1822,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; + case EXACTFLU8: + if (! utf8_target) { /* All code points in this node require + UTF-8 to express. */ + break; + } + utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; + goto do_exactf_utf8; + case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1898,9 +1936,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } case BOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; case NBOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8); break; case BOUND: @@ -1935,6 +1975,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALLTHROUGH */ case POSIXL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)), to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); break; @@ -2046,7 +2087,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, default: Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum); - assert(0); /* NOTREACHED */ + NOT_REACHED; /* NOTREACHED */ } } break; @@ -2617,6 +2658,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } RX_MATCH_TAINTED_off(rx); + RX_MATCH_UTF8_set(rx, utf8_target); reginfo->prog = rx; /* Yes, 
sorry that this is confusing. */ reginfo->intuit = 0; @@ -2637,7 +2679,6 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, magic belonging to this SV. Not newSVsv, either, as it does not COW. */ - assert(!IS_PADGV(sv)); reginfo->sv = newSV(0); SvSetSV_nosteal(reginfo->sv, sv); SAVEFREESV(reginfo->sv); @@ -2715,7 +2756,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* Simplest case: anchored match need be tried only once. */ - /* [unless only anchor is BOL and multiline is set] */ + /* [unless only anchor is MBOL - implying multiline is set] */ if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) { if (s == startpos && regtry(reginfo, &s)) goto got_it; @@ -2969,7 +3010,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, c, reginfo); + regprop(prog, prop, c, reginfo, NULL); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); @@ -3075,7 +3116,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, * and replaced it with this one. Yves */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "String does not contain required substring, cannot match.\n" + "%sString does not contain required substring, cannot match.%s\n", + PL_colors[4], PL_colors[5] )); goto phooey; } @@ -3105,7 +3147,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* Failure. 
*/ goto phooey; -got_it: + got_it: /* s/// doesn't like it if $& is earlier than where we asked it to * start searching (which can happen on something like /.\G/) */ if ( (flags & REXEC_FAIL_ON_UNDERFLOW) @@ -3137,8 +3179,6 @@ got_it: if (RXp_PAREN_NAMES(prog)) (void)hv_iterinit(RXp_PAREN_NAMES(prog)); - RX_MATCH_UTF8_set(rx, utf8_target); - /* make sure $`, $&, $', and $digit will work later */ if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, @@ -3147,7 +3187,7 @@ got_it: return 1; -phooey: + phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); @@ -3648,7 +3688,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8 *pat = (U8*)STRING(text_node); U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; - if (OP(text_node) == EXACT) { + if (OP(text_node) == EXACT || OP(text_node) == EXACTL) { /* In an exact node, only one thing can be matched, that first * character. If both the pat and the target are UTF-8, we can just @@ -3844,7 +3884,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - assert(0); /* NOTREACHED */ + NOT_REACHED; /* NOTREACHED */ } } } @@ -3991,7 +4031,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); DUMP_EXEC_POS( locinput, scan, utf8_target ); - regprop(rex, prop, scan, reginfo); + regprop(rex, prop, scan, reginfo, NULL); PerlIO_printf(Perl_debug_log, "%3"IVdf":%*s%s(%"IVdf")\n", @@ -4013,8 +4053,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { - case BOL: /* /^../ */ - case SBOL: /* /^../s */ + case SBOL: /* /^../ and /\A../ */ if (locinput == reginfo->strbeg) break; sayNO; @@ -4038,23 +4077,21 @@ S_regmatch(pTHX_ regmatch_info 
*reginfo, char *startpos, regnode *prog) rex->offs[0].start = locinput - reginfo->strbeg; PUSH_STATE_GOTO(KEEPS_next, next, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; case KEEPS_next_fail: /* rollback the start point change */ rex->offs[0].start = st->u.keeper.val; sayNO_SILENT; /* NOTREACHED */ - assert(0); + NOT_REACHED; case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; - case EOL: /* /..$/ */ - /* FALLTHROUGH */ - case SEOL: /* /..$/s */ + case SEOL: /* /..$/ */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -4097,7 +4134,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ); sayNO_SILENT; /* NOTREACHED */ - assert(0); + NOT_REACHED; } /* FALLTHROUGH */ case TRIE: /* (ab|cd) */ @@ -4155,6 +4192,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; + if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (utf8_target + && UTF8_IS_ABOVE_LATIN1(nextchr) + && scan->flags == EXACTL) + { + /* We only output for EXACTL, as we let the folder + * output this message for EXACTFLU8 to avoid + * duplication */ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, + reginfo->strend); + } + } if ( trie->bitmap && (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr))) { @@ -4285,7 +4335,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto trie_first_try; /* jump into the fail handler */ }} /* NOTREACHED */ - assert(0); + NOT_REACHED; case TRIE_next_fail: /* we failed - try next alternative */ { @@ -4400,7 +4450,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (ST.accepted > 1 || has_cutgroup) { PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc); /* NOTREACHED */ - assert(0); + NOT_REACHED; } /* only one choice left - just continue */ DEBUG_EXECUTE_r({ @@ 
-4425,10 +4475,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput = (char*)uc; continue; /* execute rest of RE */ /* NOTREACHED */ - assert(0); } #undef ST + case EXACTL: /* /abc/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + + /* Complete checking would involve going through every character + * matched by the string to see if any is above latin1. But the + * comparision otherwise might very well be a fast assembly + * language routine, and I (khw) don't think slowing things down + * just to check for this warning is worth it. So this just checks + * the first character */ + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); + } + /* FALLTHROUGH */ case EXACT: { /* /abc/ */ char *s = STRING(scan); ln = STR_LEN(scan); @@ -4515,11 +4577,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const char * s; U32 fold_utf8_flags; + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; fold_utf8_flags = FOLDEQ_LOCALE; goto do_exactf; + case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so + is effectively /u; hence to match, target + must be UTF-8. */ + if (! utf8_target) { + sayNO; + } + fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED + | FOLDEQ_S1_FOLDS_SANE; + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_exactf; + case EXACTFU_SS: /* /\x{df}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; @@ -4582,90 +4657,109 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; } - /* XXX Could improve efficiency by separating these all out using a - * macro or in-line function. 
At that point regcomp.c would no longer - * have to set the FLAGS fields of these */ - case BOUNDL: /* /\b/l */ + /* XXX At that point regcomp.c would no longer * have to set the FLAGS fields of these */ case NBOUNDL: /* /\B/l */ + to_complement = 1; + /* FALLTHROUGH */ + + case BOUNDL: /* /\b/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (utf8_target) { + if (locinput == reginfo->strbeg) + ln = isWORDCHAR_LC('\n'); + else { + ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg))); + } + n = (NEXTCHR_IS_EOS) + ? isWORDCHAR_LC('\n') + : isWORDCHAR_LC_utf8((U8*)locinput); + } + else { /* Here the string isn't utf8 */ + ln = (locinput == reginfo->strbeg) + ? isWORDCHAR_LC('\n') + : isWORDCHAR_LC(UCHARAT(locinput - 1)); + n = (NEXTCHR_IS_EOS) + ? isWORDCHAR_LC('\n') + : isWORDCHAR_LC(nextchr); + } + if (to_complement ^ (ln == n)) { + sayNO; + } + break; + + case NBOUND: /* /\B/ */ + to_complement = 1; + /* FALLTHROUGH */ + case BOUND: /* /\b/ */ - case BOUNDU: /* /\b/u */ + if (utf8_target) { + goto bound_utf8; + } + goto bound_ascii_match_only; + + case NBOUNDA: /* /\B/a */ + to_complement = 1; + /* FALLTHROUGH */ + case BOUNDA: /* /\b/a */ - case NBOUND: /* /\B/ */ + + bound_ascii_match_only: + /* Here the string isn't utf8, or is utf8 and only ascii characters + * are to match \w. In the latter case looking at the byte just + * prior to the current one may be just the final byte of a + * multi-byte character. This is ok. There are two cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final byte is + * never mistakable for ASCII, and so the test will say it is + * not a word character, which is the correct answer. */ + ln = (locinput == reginfo->strbeg) + ? isWORDCHAR_A('\n') + : isWORDCHAR_A(UCHARAT(locinput - 1)); + n = (NEXTCHR_IS_EOS) + ? 
isWORDCHAR_A('\n') + : isWORDCHAR_A(nextchr); + if (to_complement ^ (ln == n)) { + sayNO; + } + break; + case NBOUNDU: /* /\B/u */ - case NBOUNDA: /* /\B/a */ - /* was last char in word? */ - if (utf8_target - && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET - && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) - { - if (locinput == reginfo->strbeg) - ln = '\n'; - else { - const U8 * const r = - reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg)); + to_complement = 1; + /* FALLTHROUGH */ - ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r, - 0, uniflags); - } - if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { - ln = isWORDCHAR_uni(ln); - if (NEXTCHR_IS_EOS) - n = 0; - else { - LOAD_UTF8_CHARCLASS_ALNUM(); - n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput, - utf8_target); - } - } - else { - ln = isWORDCHAR_LC_uvchr(ln); - n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); - } + case BOUNDU: /* /\b/u */ + if (utf8_target) { + + bound_utf8: + ln = (locinput == reginfo->strbeg) + ? isWORDCHAR_L1('\n') + : isWORDCHAR_utf8(reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg))); + n = (NEXTCHR_IS_EOS) + ? isWORDCHAR_L1('\n') + : isWORDCHAR_utf8((U8*)locinput); } else { + ln = (locinput == reginfo->strbeg) + ? isWORDCHAR_L1('\n') + : isWORDCHAR_L1(UCHARAT(locinput - 1)); + n = (NEXTCHR_IS_EOS) + ? isWORDCHAR_L1('\n') + : isWORDCHAR_L1(nextchr); - /* Here the string isn't utf8, or is utf8 and only ascii - * characters are to match \w. In the latter case looking at - * the byte just prior to the current one may be just the final - * byte of a multi-byte character. This is ok. There are two - * cases: - * 1) it is a single byte character, and then the test is doing - * just what it's supposed to. - * 2) it is a multi-byte character, in which case the final - * byte is never mistakable for ASCII, and so the test - * will say it is not a word character, which is the - * correct answer. */ - ln = (locinput != reginfo->strbeg) ? 
- UCHARAT(locinput - 1) : '\n'; - switch (FLAGS(scan)) { - case REGEX_UNICODE_CHARSET: - ln = isWORDCHAR_L1(ln); - n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr); - break; - case REGEX_LOCALE_CHARSET: - ln = isWORDCHAR_LC(ln); - n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr); - break; - case REGEX_DEPENDS_CHARSET: - ln = isWORDCHAR(ln); - n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr); - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - ln = isWORDCHAR_A(ln); - n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr); - break; - default: - Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); - } } - /* Note requires that all BOUNDs be lower than all NBOUNDs in - * regcomp.sym */ - if (((!ln) == (!n)) == (OP(scan) < NBOUND)) - sayNO; + + if (to_complement ^ (ln == n)) { + sayNO; + } break; - case ANYOF: /* /[abc]/ */ + case ANYOFL: /* /[abc]/l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ + case ANYOF: /* /[abc]/ */ if (NEXTCHR_IS_EOS) sayNO; if (utf8_target) { @@ -4689,6 +4783,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case POSIXL: /* \w or [:punct:] etc. under /l */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (NEXTCHR_IS_EOS) sayNO; @@ -4709,7 +4804,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { /* Here, must be an above Latin-1 code point */ - goto utf8_posix_not_eos; + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); + goto utf8_posix_above_latin1; } /* Here, must be utf8 */ @@ -4768,7 +4864,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (NEXTCHR_IS_EOS) { sayNO; } - utf8_posix_not_eos: /* Use _generic_isCC() for characters within Latin1. 
(Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else @@ -4792,6 +4887,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) locinput += 2; } else { /* Handle above Latin-1 code points */ + utf8_posix_above_latin1: classnum = (_char_class_number) FLAGS(scan); if (classnum < _FIRST_NON_SWASH_CC) { @@ -5065,6 +5161,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) const U8 *fold_array; UV utf8_fold_flags; + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; type = REFFL; @@ -5109,6 +5206,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_nref_ref_common; case REFFL: /* /\1/il */ + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; folder = foldEQ_locale; fold_array = PL_fold_locale; utf8_fold_flags = FOLDEQ_LOCALE; @@ -5192,9 +5290,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case TAIL: /* placeholder while compiling (A|B|C) */ break; - case BACK: /* ??? doesn't appear to be used ??? 
*/ - break; - #undef ST #define ST st->u.eval { @@ -5233,9 +5328,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* and then jump to the code we share with EVAL */ goto eval_recurse_doit; - /* NOTREACHED */ - assert(0); case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ if (cur_eval && cur_eval->locinput==locinput) { @@ -5324,7 +5417,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(o->op_targ == OP_LEAVE); o = cUNOPo->op_first; assert(o->op_type == OP_ENTER); - o = OP_SIBLING(o); + o = OpSIBLING(o); } if (o->op_type != OP_STUB) { @@ -5450,8 +5543,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, rex->engine, NULL, NULL, - /* copy /msix etc to inner pattern */ - scan->flags, + /* copy /msixn etc to inner pattern */ + ARG2L(scan), pm_flags); if (!(SvFLAGS(ret) @@ -5517,7 +5610,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* now continue from first node in postoned RE */ PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } case EVAL_AB: /* cleanup after a successful (??{A})B */ @@ -5777,21 +5870,21 @@ NULL PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } case CURLYX_end: /* just finished matching all of A*B */ cur_curlyx = ST.prev_curlyx; sayYES; /* NOTREACHED */ - assert(0); + NOT_REACHED; case CURLYX_end_fail: /* just failed to match all of A*B */ regcpblow(ST.cp); cur_curlyx = ST.prev_curlyx; sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; #undef ST @@ -5830,7 +5923,7 @@ NULL PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } /* If degenerate A matches "", assume A done. 
*/ @@ -5943,7 +6036,7 @@ NULL PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } /* Prefer A over B for maximal matching. */ @@ -5955,19 +6048,19 @@ NULL REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_max, A, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } goto do_whilem_B_max; } /* NOTREACHED */ - assert(0); + NOT_REACHED; case WHILEM_B_min: /* just matched B in a minimal match */ case WHILEM_B_max: /* just matched B in a maximal match */ cur_curlyx = ST.save_curlyx; sayYES; /* NOTREACHED */ - assert(0); + NOT_REACHED; case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ cur_curlyx = ST.save_curlyx; @@ -5975,7 +6068,7 @@ NULL cur_curlyx->u.curlyx.count--; CACHEsayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ /* FALLTHROUGH */ @@ -5986,7 +6079,7 @@ NULL cur_curlyx->u.curlyx.count--; CACHEsayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ REGCP_UNWIND(ST.lastcp); @@ -6013,7 +6106,7 @@ NULL PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ cur_curlyx = ST.save_curlyx; @@ -6048,7 +6141,7 @@ NULL /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; #undef ST #define ST st->u.branch @@ -6074,14 +6167,14 @@ NULL PUSH_STATE_GOTO(BRANCH_next, scan, locinput); } /* NOTREACHED */ - assert(0); + NOT_REACHED; case CUTGROUP: /* /(*THEN)/ */ sv_yes_mark = st->u.mark.mark_name = scan->flags ? 
NULL : MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; case CUTGROUP_next_fail: do_cutgroup = 1; @@ -6090,12 +6183,12 @@ NULL sv_commit = st->u.mark.mark_name; sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; case BRANCH_next: sayYES; /* NOTREACHED */ - assert(0); + NOT_REACHED; case BRANCH_next_fail: /* that branch failed; try the next, if any */ if (do_cutgroup) { @@ -6118,7 +6211,6 @@ NULL } continue; /* execute next BRANCH[J] op */ /* NOTREACHED */ - assert(0); case MINMOD: /* next op will be non-greedy, e.g. A*? */ minmod = 1; @@ -6163,7 +6255,7 @@ NULL curlym_do_A: /* execute the A in /A{m,n}B/ */ PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */ /* NOTREACHED */ - assert(0); + NOT_REACHED; case CURLYM_A: /* we've just matched an A */ ST.count++; @@ -6300,7 +6392,7 @@ NULL PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */ /* NOTREACHED */ - assert(0); + NOT_REACHED; case CURLYM_B_fail: /* just failed to match a B */ REGCP_UNWIND(ST.cp); @@ -6479,7 +6571,7 @@ NULL goto curly_try_B_max; } /* NOTREACHED */ - assert(0); + NOT_REACHED; case CURLY_B_min_known_fail: /* failed to find B in a non-greedy match where c1,c2 valid */ @@ -6555,7 +6647,7 @@ NULL PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput); } /* NOTREACHED */ - assert(0); + NOT_REACHED; case CURLY_B_min_fail: /* failed to find B in a non-greedy match where c1,c2 invalid */ @@ -6588,7 +6680,7 @@ NULL } sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; curly_try_B_max: /* a successful greedy match: now try to match B */ @@ -6619,7 +6711,7 @@ NULL CURLY_SETPAREN(ST.paren, ST.count); PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; } } /* FALLTHROUGH */ @@ -6739,7 +6831,7 @@ NULL /* execute body of (?...A) */ PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart); /* NOTREACHED */ - assert(0); + NOT_REACHED; } case IFMATCH_A_fail: /* body of 
(?...A) failed */ @@ -6780,7 +6872,7 @@ NULL sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(COMMIT_next, next, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; case COMMIT_next_fail: no_final = 1; @@ -6789,7 +6881,7 @@ NULL case OPFAIL: /* (*FAIL) */ sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; #define ST st->u.mark case MARKPOINT: /* (*MARK:foo) */ @@ -6800,13 +6892,13 @@ NULL ST.mark_loc = locinput; PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput); /* NOTREACHED */ - assert(0); + NOT_REACHED; case MARKPOINT_next: mark_state = ST.prev_mark; sayYES; /* NOTREACHED */ - assert(0); + NOT_REACHED; case MARKPOINT_next_fail: if (popmark && sv_eq(ST.mark_name,popmark)) @@ -6828,7 +6920,7 @@ NULL mark_state->u.mark.mark_name : NULL; sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; case SKIP: /* (*SKIP) */ if (scan->flags) { @@ -6874,7 +6966,7 @@ NULL no_final = 1; sayNO; /* NOTREACHED */ - assert(0); + NOT_REACHED; #undef ST case LNBREAK: /* \R */ @@ -6909,7 +7001,6 @@ NULL scan = next; /* prepare to execute the next op and ... */ continue; /* ... 
jump back to the top, reusing st */ /* NOTREACHED */ - assert(0); push_yes_state: /* push a state that backtracks on success */ @@ -6953,7 +7044,6 @@ NULL st = newst; continue; /* NOTREACHED */ - assert(0); } } @@ -6964,8 +7054,9 @@ NULL Perl_croak(aTHX_ "corrupted regexp pointers"); /* NOTREACHED */ sayNO; + NOT_REACHED; -yes: + yes: if (yes_state) { /* we have successfully completed a subexpression, but we must now * pop to the state marked by yes_state and continue from there */ @@ -7026,7 +7117,7 @@ yes: result = 1; goto final_exit; -no: + no: DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, "%*s %sfailed...%s\n", @@ -7034,7 +7125,7 @@ no: PL_colors[4], PL_colors[5]) ); -no_silent: + no_silent: if (no_final) { if (yes_state) { goto yes; @@ -7185,6 +7276,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, scan = loceol; } break; + case EXACTL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); + } + /* FALLTHROUGH */ case EXACT: assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); @@ -7258,6 +7355,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, goto do_exactf; case EXACTFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; utf8_flags = FOLDEQ_LOCALE; goto do_exactf; @@ -7266,6 +7364,14 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, utf8_flags = 0; goto do_exactf; + case EXACTFLU8: + if (! utf8_target) { + break; + } + utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; + goto do_exactf; + case EXACTFU_SS: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? 
FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -7329,6 +7435,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; } + case ANYOFL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case ANYOF: if (utf8_target) { while (hardcount < max @@ -7351,6 +7460,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* FALLTHROUGH */ case POSIXL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (! utf8_target) { while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p), *scan))) @@ -7570,16 +7680,18 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case BOUNDL: + case NBOUNDL: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + /* FALLTHROUGH */ case BOUND: case BOUNDA: - case BOUNDL: case BOUNDU: case EOS: case GPOS: case KEEPS: case NBOUND: case NBOUNDA: - case NBOUNDL: case NBOUNDU: case OPFAIL: case SBOL: @@ -7590,7 +7702,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, default: Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); /* NOTREACHED */ - assert(0); + NOT_REACHED; } @@ -7604,7 +7716,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, GET_RE_DEBUG_FLAGS_DECL; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); - regprop(prog, prop, p, reginfo); + regprop(prog, prop, p, reginfo, NULL); PerlIO_printf(Perl_debug_log, "%*s %s can match %"IVdf" times out of %"IVdf"...\n", REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max); @@ -7630,127 +7742,15 @@ Perl_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, *altsvp = NULL; } - return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL)); + return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL)); } -SV * -Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, - const regnode* node, - bool doinit, - SV** listsvp, - SV** only_utf8_locale_ptr) -{ - /* For internal core use only. 
- * Returns the swash for the input 'node' in the regex 'prog'. - * If <doinit> is 'true', will attempt to create the swash if not already - * done. - * If <listsvp> is non-null, will return the printable contents of the - * swash. This can be used to get debugging information even before the - * swash exists, by calling this function with 'doinit' set to false, in - * which case the components that will be used to eventually create the - * swash are returned (in a printable form). - * Tied intimately to how regcomp.c sets up the data structure */ - - SV *sw = NULL; - SV *si = NULL; /* Input swash initialization string */ - SV* invlist = NULL; - - RXi_GET_DECL(prog,progi); - const struct reg_data * const data = prog ? progi->data : NULL; - - PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - - assert(ANYOF_FLAGS(node) - & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); - - if (data && data->count) { - const U32 n = ARG(node); - - if (data->what[n] == 's') { - SV * const rv = MUTABLE_SV(data->data[n]); - AV * const av = MUTABLE_AV(SvRV(rv)); - SV **const ary = AvARRAY(av); - U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - - si = *ary; /* ary[0] = the string to initialize the swash with */ - - /* Elements 3 and 4 are either both present or both absent. [3] is - * any inversion list generated at compile time; [4] indicates if - * that inversion list has any user-defined properties in it. */ - if (av_tindex(av) >= 2) { - if (only_utf8_locale_ptr - && ary[2] - && ary[2] != &PL_sv_undef) - { - *only_utf8_locale_ptr = ary[2]; - } - else { - assert(only_utf8_locale_ptr); - *only_utf8_locale_ptr = NULL; - } - - if (av_tindex(av) >= 3) { - invlist = ary[3]; - if (SvUV(ary[4])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; - } - } - else { - invlist = NULL; - } - } - - /* Element [1] is reserved for the set-up swash. 
If already there, - * return it; if not, create it and store it there */ - if (ary[1] && SvROK(ary[1])) { - sw = ary[1]; - } - else if (doinit && ((si && si != &PL_sv_undef) - || (invlist && invlist != &PL_sv_undef))) { - assert(si); - sw = _core_swash_init("utf8", /* the utf8 package */ - "", /* nameless */ - si, - 1, /* binary */ - 0, /* not from tr/// */ - invlist, - &swash_init_flags); - (void)av_store(av, 1, sw); - } - } - } - - /* If requested, return a printable version of what this swash matches */ - if (listsvp) { - SV* matches_string = newSVpvs(""); - - /* The swash should be used, if possible, to get the data, as it - * contains the resolved data. But this function can be called at - * compile-time, before everything gets resolved, in which case we - * return the currently best available information, which is the string - * that will eventually be used to do that resolving, 'si' */ - if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) - && (si && si != &PL_sv_undef)) - { - sv_catsv(matches_string, si); - } - - /* Add the inversion list to whatever we have. This may have come from - * the swash, or from an input parameter */ - if (invlist) { - sv_catsv(matches_string, _invlist_contents(invlist)); - } - *listsvp = matches_string; - } - - return sw; -} #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ /* - reginclass - determine if a character falls into a character class - n is the ANYOF regnode + n is the ANYOF-type regnode p is the target string p_end points to one byte beyond the end of the target string utf8_target tells whether p is in UTF-8. @@ -7784,25 +7784,31 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); + if (c > 255 && OP(n) == ANYOFL && ! 
is_ANYOF_SYNTHETIC(n)) { + _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); + } } /* If this character is potentially in the bitmap, check it */ - if (c < 256) { + if (c < NUM_ANYOF_CODE_POINTS) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL - && ! utf8_target - && ! isASCII(c)) + else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) + && ! utf8_target + && ! isASCII(c)) { match = TRUE; } else if (flags & ANYOF_LOCALE_FLAGS) { - if (flags & ANYOF_LOC_FOLD) { - if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { - match = TRUE; - } + if ((flags & ANYOF_LOC_FOLD) + && c < 256 + && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) + { + match = TRUE; } - if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) { + else if (ANYOF_POSIXL_TEST_ANY_SET(n) + && c < 256 + ) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7855,18 +7861,20 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. */ if (!match) { - if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) { - match = TRUE; /* Everything above 255 matches */ + if (c >= NUM_ANYOF_CODE_POINTS + && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP)) + { + match = TRUE; /* Everything above the bitmap matches */ } - else if ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target && (flags & ANYOF_UTF8)) + else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) + || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES)) || ((flags & ANYOF_LOC_FOLD) && IN_UTF8_CTYPE_LOCALE - && ARG(n) != ANYOF_NONBITMAP_EMPTY)) + && ARG(n) != ANYOF_ONLY_HAS_BITMAP)) { SV* only_utf8_locale = NULL; SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, - &only_utf8_locale); + &only_utf8_locale, NULL); if (sw) { U8 utf8_buffer[2]; U8 * utf8_p;