From 1ee208c4824a2a3a18e979873141161c149a57a3 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 18 Feb 2014 10:24:31 -0700 Subject: [PATCH] regcomp.c: Fix more alignment problems I believe this will fix the remaining alignment problems recently being shown on gcc on HP-UX. It works on the procura machine. regnodes should not have stricter alignment than required by U32, for reasons given in the comments this commit adds to the beginning of regcomp.h. Commit 31f05a37 added a new ANYOF regnode struct with a pointer field. This requires stricter alignment on some 64-bit platforms, and hence doesn't work on those platforms. This commit removes that regnode struct type, and instead stores the pointer it used via a more indirect, but already existing mechanism that stores other data. The function that returns that other data is enlarged to return this new field as well. It now needs to be called from regcomp.c, so the previous commit had renamed and made it accessible from there. The "public" function that wraps this one is unchanged. (I put "public" in quotes here, because I don't think anyone outside core is or should be using it, but since it has been publicly available for a long time, I'm treating the API as unchangeable. regcomp.c called this public function before this commit, but needs the additional data returned by the inner one). 
--- embed.fnc | 3 +- embed.h | 2 +- perl.h | 1 - proto.h | 4 +-- regcomp.c | 118 +++++++++++++++++++++++++++++++++++--------------------------- regcomp.h | 36 +++++++++---------- regexec.c | 46 +++++++++++++++--------- 7 files changed, 117 insertions(+), 93 deletions(-) diff --git a/embed.fnc b/embed.fnc index c90bf6c..eae081e 100644 --- a/embed.fnc +++ b/embed.fnc @@ -2041,6 +2041,7 @@ Es |void |set_ANYOF_arg |NN RExC_state_t* const pRExC_state \ |NN regnode* const node \ |NULLOK SV* const cp_list \ |NULLOK SV* const runtime_defns \ + |NULLOK SV* const only_utf8_locale_list \ |NULLOK SV* const swash \ |const bool has_user_defined_property Es |regnode*|regclass |NN RExC_state_t *pRExC_state \ @@ -2102,7 +2103,7 @@ Es |void |ssc_or |NN const RExC_state_t *pRExC_state \ |NN const regnode_charclass *or_with Es |SV* |get_ANYOF_cp_list_for_ssc \ |NN const RExC_state_t *pRExC_state \ - |NN const regnode_charclass_posixl_fold* const node + |NN const regnode_charclass_posixl* const node Ei |void |ssc_intersection|NN regnode_ssc *ssc \ |NN SV* const invlist|const bool invert_2nd Ei |void |ssc_union |NN regnode_ssc *ssc \ diff --git a/embed.h b/embed.h index 0574367..65b9c1c 100644 --- a/embed.h +++ b/embed.h @@ -953,7 +953,7 @@ #define reguni(a,b,c) S_reguni(aTHX_ a,b,c) #define regwhite S_regwhite #define scan_commit(a,b,c,d) S_scan_commit(aTHX_ a,b,c,d) -#define set_ANYOF_arg(a,b,c,d,e,f) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f) +#define set_ANYOF_arg(a,b,c,d,e,f,g) S_set_ANYOF_arg(aTHX_ a,b,c,d,e,f,g) #define ssc_add_range(a,b,c) S_ssc_add_range(aTHX_ a,b,c) #define ssc_and(a,b,c) S_ssc_and(aTHX_ a,b,c) #define ssc_anything(a) S_ssc_anything(aTHX_ a) diff --git a/perl.h b/perl.h index 9b6e3fe..a6e4dbb 100644 --- a/perl.h +++ b/perl.h @@ -3337,7 +3337,6 @@ struct regnode_charclass_class; /* A hopefully less confusing name. 
The sub-classes are all Posix classes only * used under /l matching */ typedef struct regnode_charclass_class regnode_charclass_posixl; -typedef struct regnode_charclass_posixl_fold regnode_charclass_posixl_fold; typedef struct regnode_ssc regnode_ssc; typedef struct RExC_state_t RExC_state_t; diff --git a/proto.h b/proto.h index 4a7533d..f77debe 100644 --- a/proto.h +++ b/proto.h @@ -6624,7 +6624,7 @@ STATIC bool S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state) #define PERL_ARGS_ASSERT_COULD_IT_BE_A_POSIX_CLASS \ assert(pRExC_state) -STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass_posixl_fold* const node) +STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass_posixl* const node) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC \ @@ -6876,7 +6876,7 @@ STATIC void S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, struct scan_dat #define PERL_ARGS_ASSERT_SCAN_COMMIT \ assert(pRExC_state); assert(data); assert(minlenp) -STATIC void S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, SV* const swash, const bool has_user_defined_property) +STATIC void S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, SV* const only_utf8_locale_list, SV* const swash, const bool has_user_defined_property) __attribute__nonnull__(pTHX_1) __attribute__nonnull__(pTHX_2); #define PERL_ARGS_ASSERT_SET_ANYOF_ARG \ diff --git a/regcomp.c b/regcomp.c index f7bac3d..3371d5d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -999,7 +999,7 @@ S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state, STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, - const regnode_charclass_posixl_fold* const node) + const regnode_charclass_posixl* const node) { /* Returns a 
mortal inversion list defining which code points are matched * by 'node', which is of type ANYOF. Handles complementing the result if @@ -1008,6 +1008,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * possibility. */ SV* invlist = sv_2mortal(_new_invlist(0)); + SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); bool new_node_has_latin1 = FALSE; @@ -1030,11 +1031,18 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * known until runtime -- we have to assume it could be anything */ return _add_range_to_invlist(invlist, 0, UV_MAX); } - else { + else if (ary[3] && ary[3] != &PL_sv_undef) { /* Here no compile-time swash, and no run-time only data. Use the * node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[2])); + invlist = sv_2mortal(invlist_clone(ary[3])); + } + + /* Get the code points valid only under UTF-8 locales */ + if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) + && ary[2] && ary[2] != &PL_sv_undef) + { + only_utf8_locale_invlist = ary[2]; } } @@ -1082,11 +1090,12 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, _invlist_union(invlist, PL_Latin1, &invlist); } - /* Similarly add the UTF-8 locale possible matches */ - if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node)) - { + /* Similarly add the UTF-8 locale possible matches. 
These have to be + * deferred until after the non-UTF-8 locale ones are taken care of just + * above, or it leads to wrong results under ANYOF_INVERT */ + if (only_utf8_locale_invlist) { _invlist_union_maybe_complement_2nd(invlist, - ANYOF_UTF8_LOCALE_INVLIST(node), + only_utf8_locale_invlist, ANYOF_FLAGS(node) & ANYOF_INVERT, &invlist); } @@ -1162,7 +1171,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl_fold*) and_with); + (regnode_charclass_posixl*) and_with); anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; } @@ -1239,7 +1248,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * standard, in particular almost everything by Microsoft. * The loop below just changes e.g., \w into \W and vice versa */ - regnode_charclass_posixl_fold temp; + regnode_charclass_posixl temp; int add = 1; /* To calculate the index of the complement */ ANYOF_POSIXL_ZERO(&temp); @@ -1311,7 +1320,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, } else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, - (regnode_charclass_posixl_fold*) or_with); + (regnode_charclass_posixl*) or_with); ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; } @@ -1450,7 +1459,8 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE); + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, + NULL, NULL, NULL, FALSE); if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { ANYOF_FLAGS(ssc) |= ANYOF_POSIXL; @@ -13188,6 +13198,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * that fold to/from them under /i */ SV* cp_foldable_list = NULL; + /* Like cp_list, but code points on this list are valid only when the + * runtime locale is UTF-8 */ + SV* only_utf8_locale_list = NULL; + #ifdef EBCDIC /* 
In a range, counts how many 0-2 of the ends of it came from literals, * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ @@ -13638,20 +13652,6 @@ parseit: * against. This isn't needed for \p{} and pseudo-classes, as they are * not affected by locale, and hence are dealt with separately */ if (LOC) { - if (FOLD && ! need_class) { - need_class = 1; - if (SIZE_ONLY) { - RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; - } - else { - RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP; - } - - /* We need to initialize this here because this node type has - * this field, and will skip getting initialized when we get to - * a posix class since are doing it here */ - ANYOF_POSIXL_ZERO(ret); - } if (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_POSIXL_MAX) { if (! need_class) { need_class = 1; @@ -14264,8 +14264,7 @@ parseit: * runtime only when the locale indicates Unicode rules. For * non-locale, we just use to the general list */ if (LOC) { - use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret); - *use_list = NULL; + use_list = &only_utf8_locale_list; } else { use_list = &cp_list; @@ -14596,7 +14595,7 @@ parseit: * fetching). We know to set the flag if we have a non-NULL list for UTF-8 * locales, or the class matches at least one 0-255 range code point */ if (LOC && FOLD) { - if (ANYOF_UTF8_LOCALE_INVLIST(ret)) { + if (only_utf8_locale_list) { ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD; } else if (cp_list) { /* Look to see if there a 0-255 code point is in @@ -14808,6 +14807,7 @@ parseit: set_ANYOF_arg(pRExC_state, ret, cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? 
listsv : NULL, + only_utf8_locale_list, swash, has_user_defined_property); *flagp |= HASWIDTH|SIMPLE; @@ -14821,6 +14821,7 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, + SV* const only_utf8_locale_list, SV* const swash, const bool has_user_defined_property) { @@ -14838,25 +14839,29 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, * av[1] if &PL_sv_undef, is a placeholder to later contain the swash * computed from av[0]. But if no further computation need be done, * the swash is stored here now (and av[0] is &PL_sv_undef). - * av[2] stores the cp_list inversion list for use in addition or instead + * av[2] stores the inversion list of code points that match only if the + * current locale is UTF-8 + * av[3] stores the cp_list inversion list for use in addition or instead * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. * (Otherwise everything needed is already in av[0] and av[1]) - * av[3] is set if any component of the class is from a user-defined - * property; used only if av[2] exists */ + * av[4] is set if any component of the class is from a user-defined + * property; used only if av[3] exists */ UV n; PERL_ARGS_ASSERT_SET_ANYOF_ARG; - if (! cp_list && ! runtime_defns) { - assert(! (ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); + if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) { + assert(! (ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8))); ARG_SET(node, ANYOF_NONBITMAP_EMPTY); } else { AV * const av = newAV(); SV *rv; - assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); av_store(av, 0, (runtime_defns) ? 
SvREFCNT_inc(runtime_defns) : &PL_sv_undef); @@ -14867,11 +14872,18 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, else { av_store(av, 1, &PL_sv_undef); if (cp_list) { - av_store(av, 2, cp_list); - av_store(av, 3, newSVuv(has_user_defined_property)); + av_store(av, 3, cp_list); + av_store(av, 4, newSVuv(has_user_defined_property)); } } + if (only_utf8_locale_list) { + av_store(av, 2, only_utf8_locale_list); + } + else { + av_store(av, 2, &PL_sv_undef); + } + rv = newRV_noinc(MUTABLE_SV(av)); n = add_data(pRExC_state, STR_WITH_LEN("s")); RExC_rxi->data->data[n] = (void*)rv; @@ -15672,8 +15684,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } } - if ((flags & (ANYOF_ABOVE_LATIN1_ALL|ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) - || ANYOF_UTF8_LOCALE_INVLIST(o)) + if ((flags & (ANYOF_ABOVE_LATIN1_ALL + |ANYOF_UTF8 + |ANYOF_NONBITMAP_NON_UTF8 + |ANYOF_LOC_FOLD))) { if (do_sep) { Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); @@ -15689,13 +15703,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* output information about the unicode matching */ if (flags & ANYOF_ABOVE_LATIN1_ALL) sv_catpvs(sv, "{unicode_all}"); - else if (FLAGS(o) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)) { + else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) { SV *lv; /* Set if there is something outside the bit map. 
*/ bool byte_output = FALSE; /* If something in the bitmap has been output */ + SV *only_utf8_locale; /* Get the stuff that wasn't in the bitmap */ - (void) regclass_swash(prog, o, FALSE, &lv, NULL); + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &lv, &only_utf8_locale); if (lv && lv != &PL_sv_undef) { char *s = savesvpv(lv); char * const origs = s; @@ -15746,16 +15762,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Safefree(origs); SvREFCNT_dec_NN(lv); } - } - /* Output any UTF-8 locale code points */ - if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) { + if ((flags & ANYOF_LOC_FOLD) + && only_utf8_locale + && only_utf8_locale != &PL_sv_undef) + { UV start, end; int max_entries = 256; sv_catpvs(sv, "{utf8 locale}"); - invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o)); - while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o), + invlist_iterinit(only_utf8_locale); + while (invlist_iternext(only_utf8_locale, &start, &end)) { put_range(sv, start, end); max_entries --; @@ -15764,7 +15781,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) break; } } - invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o)); + invlist_iterfinish(only_utf8_locale); + } } } @@ -16629,11 +16647,9 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } else if (PL_regkind[(U8)op] == ANYOF) { /* arglen 1 + class block */ - node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) - ? ANYOF_POSIXL_FOLD_SKIP - : (ANYOF_FLAGS(node) & ANYOF_POSIXL) - ? ANYOF_POSIXL_SKIP - : ANYOF_SKIP); + node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL) + ? ANYOF_POSIXL_SKIP + : ANYOF_SKIP); node = NEXTOPER(node); } else if (PL_regkind[(U8)op] == EXACT) { diff --git a/regcomp.h b/regcomp.h index 744f361..700d6c1 100644 --- a/regcomp.h +++ b/regcomp.h @@ -187,6 +187,17 @@ struct regnode_2 { #define ANYOF_BITMAP_SIZE (256 / 8) /* 8 bits/Byte */ +/* Note that these form structs which are supersets of the next smaller one, by + * appending fields. 
Alignment problems can occur if one of those optional + * fields requires stricter alignment than the base struct. And formal + * parameters that can really be two or more of the structs should be + * declared as the smallest one it could be. See commit message for + * 7dcac5f6a5195002b55c935ee1d67f67e1df280b. Regnode allocation is done + * without regard to alignment, and changing it to would also require changing + * the code that inserts and deletes regnodes. The basic single-argument + * regnode has a U32, which is what reganode() allocates as a unit. Therefore + * no field can require stricter alignment than U32. */ + /* also used by trie */ struct regnode_charclass { U8 flags; @@ -206,23 +217,14 @@ struct regnode_charclass_class { U32 classflags; /* and run-time */ }; -/* like above, but also has folds that are used only if the runtime locale is - * UTF-8. */ -struct regnode_charclass_posixl_fold { - U8 flags; /* ANYOF_POSIXL bit must go here */ - U8 type; - U16 next_off; - U32 arg1; - char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ - U32 classflags; /* and run-time */ - SV* utf8_locale_list; /* list of code points matched by folds - in a UTF-8 locale */ -}; - /* A synthetic start class; is a regnode_charclass_posixl_fold, plus an extra * SV*, used only during its construction and which is not used by regexec.c. * Note that the 'next_off' field is unused, as the SSC stands alone, so there - * is never a next node. */ + * is never a next node. Also, there is no alignment issue, becase these are + * declared or allocated as a complete unit so the compiler takes care of + * alignment. This is unlike ithe other regnodes which are allocated in terms + * of multiples of a single-argument regnode. 
Because there is no alignment + * issue, these can have a pointer field */ struct regnode_ssc { U8 flags; /* ANYOF_POSIXL bit must go here */ U8 type; @@ -230,8 +232,6 @@ struct regnode_ssc { U32 arg1; char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ U32 classflags; /* and run-time */ - SV* utf8_locale_list; /* list of code points matched by folds - in a UTF-8 locale */ SV* invlist; /* list of code points matched */ }; @@ -492,7 +492,6 @@ struct regnode_ssc { #define ANYOF_SIZE (sizeof(struct regnode_charclass)) #define ANYOF_POSIXL_SIZE (sizeof(regnode_charclass_posixl)) #define ANYOF_CLASS_SIZE ANYOF_POSIXL_SIZE -#define ANYOF_POSIXL_FOLD_SIZE (sizeof(regnode_charclass_posixl_fold)) #define ANYOF_FLAGS(p) ((p)->flags) @@ -554,11 +553,8 @@ struct regnode_ssc { #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) #define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode)) -#define ANYOF_POSIXL_FOLD_SKIP ((ANYOF_POSIXL_FOLD_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_SKIP ANYOF_POSIXL_SKIP -#define ANYOF_UTF8_LOCALE_INVLIST(node) (((regnode_charclass_posixl_fold*) (node))->utf8_locale_list) - /* * Utility definitions. */ diff --git a/regexec.c b/regexec.c index d1a6dcc..e1e840d 100644 --- a/regexec.c +++ b/regexec.c @@ -7537,7 +7537,8 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; - assert(ANYOF_FLAGS(node) & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)); + assert(ANYOF_FLAGS(node) + & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD)); if (data && data->count) { const U32 n = ARG(node); @@ -7550,18 +7551,30 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 2 and 3 are either both present or both absent. [2] is - * any inversion list generated at compile time; [3] indicates if + /* Elements 3 and 4 are either both present or both absent. 
[3] is + * any inversion list generated at compile time; [4] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 2) { - invlist = ary[2]; - if (SvUV(ary[3])) { + if (av_tindex(av) >= 2) { + if (only_utf8_locale_ptr + && ary[2] + && ary[2] != &PL_sv_undef) + { + *only_utf8_locale_ptr = ary[2]; + } + else { + *only_utf8_locale_ptr = NULL; + } + + if (av_len(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; } } else { invlist = NULL; } + } /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ @@ -7715,15 +7728,6 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } } - /* For /li matching and the current locale is a UTF-8 one, look at the - * special list, valid for just these circumstances. */ - if (! match - && (flags & ANYOF_LOC_FOLD) - && IN_UTF8_CTYPE_LOCALE - && ANYOF_UTF8_LOCALE_INVLIST(n)) - { - match = _invlist_contains_cp(ANYOF_UTF8_LOCALE_INVLIST(n), c); - } /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. */ @@ -7732,9 +7736,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const match = TRUE; /* Everything above 255 matches */ } else if ((flags & ANYOF_NONBITMAP_NON_UTF8) - || (utf8_target && (flags & ANYOF_UTF8))) + || (utf8_target && (flags & ANYOF_UTF8)) + || ((flags & ANYOF_LOC_FOLD) + && IN_UTF8_CTYPE_LOCALE + && ARG(n) != ANYOF_NONBITMAP_EMPTY)) { - SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, NULL); + SV* only_utf8_locale = NULL; + SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale); if (sw) { U8 * utf8_p; if (utf8_target) { @@ -7751,6 +7760,9 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const /* If we allocated a string above, free it */ if (! 
utf8_target) Safefree(utf8_p); } + if (! match && only_utf8_locale && IN_UTF8_CTYPE_LOCALE) { + match = _invlist_contains_cp(only_utf8_locale, c); + } } if (UNICODE_IS_SUPER(c) -- 1.8.3.1