Revamp qr/[...]/ optimizations
author Karl Williamson <khw@cpan.org>
Wed, 26 Dec 2018 05:56:48 +0000 (22:56 -0700)
committer Karl Williamson <khw@cpan.org>
Wed, 26 Dec 2018 19:50:38 +0000 (12:50 -0700)
This commit extensively changes the optimizations for ANYOF regnodes
that represent bracketed character classes.

The removal of the regex compilation pass now makes these feasible and
desirable.  Compilation now tries hard to optimize an ANYOF node into
something smaller and/or faster when feasible.

Now, qr/[X]/ for any single character or POSIX class X, and any
modifiers like /d, /i, etc., should compile to the same thing as qr/X/
with the same modifiers.  The exception is when doing so would require
upgrading the pattern from non-UTF-8 to UTF-8; in that case the
optimization is skipped, unless skipping it could introduce bugs.

These changes fix some issues with multi-character /i folding.

regcomp.c
t/re/anyof.t
t/re/re_tests
t/uni/fold.t

index 50e98bd..b3c348d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -16478,7 +16478,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     STRLEN numlen;
     int namedclass = OOB_NAMEDCLASS;
     char *rangebegin = NULL;
-    bool need_class = 0;
     SV *listsv = NULL;
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                      than just initialized.  */
@@ -16549,7 +16548,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     bool warn_super = ALWAYS_WARN_SUPER;
 
     const char * orig_parse = RExC_parse;
-    bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */
 
     /* This variable is used to mark where the end in the input is of something
      * that looks like a POSIX construct but isn't.  During the parse, when
@@ -17037,8 +17035,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                                         (FOLD) ? "__" : "",
                                         UTF8fARG(UTF, n, name),
                                         (FOLD) ? "_i" : "");
-                        optimizable = FALSE;    /* Will have to leave this an
-                                                   ANYOF node */
                         has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY;
 
                         /* We don't know yet what this matches, so have to flag
@@ -17240,39 +17236,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  * be matched against.  This isn't needed for \p{} and
                  * pseudo-classes, as they are not affected by locale, and
                  * hence are dealt with separately */
-                if (! need_class) {
-                    need_class = 1;
-                    anyof_flags |= ANYOF_MATCHES_POSIXL;
-                    has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
-
-                    /* We can't change this into some other type of node
-                     * (unless this is the only element, in which case there
-                     * are nodes that mean exactly this) as has runtime
-                     * dependencies */
-                    optimizable = FALSE;
-                }
-
-                /* Coverity thinks it is possible for this to be negative; both
-                 * jhi and khw think it's not, but be safer */
-                assert(! (anyof_flags & ANYOF_MATCHES_POSIXL)
-                       || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
-
-                /* See if it already matches the complement of this POSIX
-                 * class */
-                if (  (anyof_flags & ANYOF_MATCHES_POSIXL)
-                    && POSIXL_TEST(posixl, namedclass + ((namedclass % 2)
-                                                         ? -1
-                                                         : 1)))
-                {
-                    posixl_matches_all = TRUE;
-                    break;  /* No need to continue.  Since it matches both
-                               e.g., \w and \W, it matches everything, and the
-                               bracketed class can be optimized into qr/./s */
-                }
-
-                /* Add this class to those that should be checked at runtime */
                 POSIXL_SET(posixl, namedclass);
                 has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY;
+                anyof_flags |= ANYOF_MATCHES_POSIXL;
 
                 /* The above-Latin1 characters are not subject to locale rules.
                  * Just add them to the unconditionally-matched list */
@@ -18146,7 +18112,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                  || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)))
     {
         RExC_seen_d_op = TRUE;
-        optimizable = FALSE;
         has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY;
     }
 
@@ -18179,106 +18144,484 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
      * routine) */
     *flagp |= HASWIDTH|SIMPLE;
 
+    if (anyof_flags & ANYOF_LOCALE_FLAGS) {
+        RExC_contains_locale = 1;
+    }
+
     /* Some character classes are equivalent to other nodes.  Such nodes take
      * up less room, and some nodes require fewer operations to execute, than
      * ANYOF nodes.  EXACTish nodes may be joinable with adjacent nodes to
      * improve efficiency. */
 
     if (optimizable) {
-        int posix_class = -1;   /* Illegal value */
-        UV start, end;
+        PERL_UINT_FAST8_T i;
+        Size_t partial_cp_count = 0;
+        UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */
+        UV   end[MAX_FOLD_FROMS+1] = { 0 };
+
+        if (cp_list) { /* Count the code points in enough ranges that we would
+                          see all the ones possible in any fold in this version
+                          of Unicode */
 
-        if (UNLIKELY(posixl_matches_all)) {
-            ret = reg_node(pRExC_state, SANY);
+            invlist_iterinit(cp_list);
+            for (i = 0; i <= MAX_FOLD_FROMS; i++) {
+                if (invlist_iternext(cp_list, &start[i], &end[i])) {
+                    partial_cp_count += end[i] - start[i] + 1;
+                }
+            }
+
+            invlist_iterfinish(cp_list);
+        }
+
+        /* If we know at compile time that this matches every possible code
+         * point, any run-time dependencies don't matter */
+        if (start[0] == 0 && end[0] == UV_MAX) {
+            if (invert) {
+                ret = reganode(pRExC_state, OPFAIL, 0);
+            }
+            else {
+                ret = reg_node(pRExC_state, SANY);
+                MARK_NAUGHTY(1);
+            }
             goto not_anyof;
         }
 
-        if (cp_list && ! invert) {
-            invlist_iterinit(cp_list);
-            if (! invlist_iternext(cp_list, &start, &end)) {
+        /* Similarly, for /l posix classes, if both a class and its
+         * complement match, any run-time dependencies don't matter */
+        if (posixl) {
+            for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX;
+                                                        namedclass += 2)
+            {
+                if (   POSIXL_TEST(posixl, namedclass)      /* class */
+                    && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */
+                {
+                    if (invert) {
+                        ret = reganode(pRExC_state, OPFAIL, 0);
+                    }
+                    else {
+                        ret = reg_node(pRExC_state, SANY);
+                        MARK_NAUGHTY(1);
+                    }
+                    goto not_anyof;
+                }
+            }
+            /* For well-behaved locales, some classes are subsets of others,
+             * so complementing the subset and including the non-complemented
+             * superset should match everything, like [\D[:alnum:]], and
+             * [[:^alpha:][:alnum:]], but some implementations of locales are
+             * buggy, and khw thinks it's a bad idea to have optimization change
+             * behavior, even if it avoids an OS bug in a given case */
+
+#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n)
+
+            /* If is a single posix /l class, can optimize to just that op.
+             * Such a node will not match anything in the Latin1 range, as that
+             * is not determinable until runtime, but will match whatever the
+             * class does outside that range.  (Note that some classes won't
+             * match anything outside the range, like [:ascii:]) */
+            if (    isSINGLE_BIT_SET(posixl)
+                && (partial_cp_count == 0 || start[0] > 255))
+            {
+                U8 classnum;
+                SV * class_above_latin1 = NULL;
+                bool already_inverted;
+                bool are_equivalent;
+
+                /* Compute which bit is set, which is the same thing as, e.g.,
+                 * ANYOF_CNTRL.  From
+                 * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn
+                 * */
+                static const int MultiplyDeBruijnBitPosition2[32] =
+                    {
+                    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
+                    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
+                    };
+
+                namedclass = MultiplyDeBruijnBitPosition2[(posixl
+                                                          * 0x077CB531U) >> 27];
+                classnum = namedclass_to_classnum(namedclass);
+
+                /* The named classes are such that the inverted number is one
+                 * larger than the non-inverted one */
+                already_inverted = namedclass
+                                 - classnum_to_namedclass(classnum);
+
+                /* Create an inversion list of the official property, inverted
+                 * if the constructed node list is inverted, and restricted to
+                 * only the above latin1 code points, which are the only ones
+                 * known at compile time */
+                _invlist_intersection_maybe_complement_2nd(
+                                                    PL_AboveLatin1,
+                                                    PL_XPosix_ptrs[classnum],
+                                                    already_inverted,
+                                                    &class_above_latin1);
+                are_equivalent = _invlistEQ(class_above_latin1, cp_list,
+                                                                        FALSE);
+                SvREFCNT_dec_NN(class_above_latin1);
+
+                if (are_equivalent) {
+
+                    /* Resolve the run-time inversion flag with this possibly
+                     * inverted class */
+                    invert = invert ^ already_inverted;
+
+                    ret = reg_node(pRExC_state,
+                                   POSIXL + invert * (NPOSIXL - POSIXL));
+                    FLAGS(REGNODE_p(ret)) = classnum;
+                    goto not_anyof;
+                }
+            }
+        }
+
+        /* khw can't think of any other possible transformation involving
+         * these. */
+        if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) {
+            goto is_anyof;
+        }
+
+        if (! has_runtime_dependency) {
 
             /* If the list is empty, nothing matches.  This happens, for
              * example, when a Unicode property that doesn't match anything is
              * the only element in the character class (perluniprops.pod notes
              * such properties). */
+            if (partial_cp_count == 0) {
+                assert (! invert);
                 ret = reganode(pRExC_state, OPFAIL, 0);
                 goto not_anyof;
             }
 
-            if (start == end) {    /* The range is a single code point */
-                if (! invlist_iternext(cp_list, &start, &end)
+            /* If matches everything but \n */
+            if (   start[0] == 0 && end[0] == '\n' - 1
+                && start[1] == '\n' + 1 && end[1] == UV_MAX)
+            {
+                assert (! invert);
+                ret = reg_node(pRExC_state, REG_ANY);
+                MARK_NAUGHTY(1);
+                goto not_anyof;
+            }
+        }
 
-                        /* Don't do this optimization if it would require
-                         * changing the pattern to UTF-8 */
-                    && (start < 256 || UTF))
-                {
-                    /* Here, the list contains a single code point.  Can
-                     * optimize into an EXACTish node */
+        /* Next see if can optimize classes that contain just a few code points
+         * into an EXACTish node.  The reason to do this is to let the
+         * optimizer join this node with adjacent EXACTish ones.
+         *
+         * An EXACTFish node can be generated even if not under /i, and vice
+         * versa.  But care must be taken.  An EXACTFish node has to be such
+         * that it only matches precisely the code points in the class, but we
+         * want to generate the least restrictive one that does that, to
+         * increase the odds of being able to join with an adjacent node.  For
+         * example, if the class contains [kK], we have to make it an EXACTFAA
+         * node to prevent the KELVIN SIGN from matching.  Whether we are under
+         * /i or not is irrelevant in this case.  Less obvious is the pattern
+         * qr/[\x{02BC}]n/i.  U+02BC is MODIFIER LETTER APOSTROPHE. That is
+         * supposed to match the single character U+0149 LATIN SMALL LETTER N
+         * PRECEDED BY APOSTROPHE.  And so even though there is no simple fold
+         * that includes \X{02BC}, there is a multi-char fold that does, and so
+         * the node generated for it must be an EXACTFish one.  On the other
+         * hand qr/:/i should generate a plain EXACT node since the colon
+         * participates in no fold whatsoever, and having it EXACT tells the
+         * optimizer the target string cannot match unless it has a colon in
+         * it.
+         *
+         * We don't typically generate an EXACTish node if doing so would
+         * require changing the pattern to UTF-8, as that affects /d and
+         * otherwise is slower.  However, under /i, not changing to UTF-8 can
+         * miss some potential multi-character folds.  We calculate the
+         * EXACTish node, and then decide if something would be missed if we
+         * don't upgrade */
+        if (   ! posixl
+            && ! invert
+
+                /* Only try if there are no more code points in the class than
+                 * in the max possible fold */
+            &&   partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1
+
+            && (start[0] < 256 || UTF || FOLD))
+        {
+            if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches)
+            {
+                /* We can always make a single code point class into an
+                 * EXACTish node. */
+
+                if (LOC) {
+
+                    /* Here is /l:  Use EXACTL, except /li indicates EXACTFL,
+                     * as that means there is a fold not known until runtime so
+                     * shows as only a single code point here. */
+                    op = (FOLD) ? EXACTFL : EXACTL;
+                }
+                else if (! FOLD) { /* Not /l and not /i */
+                    op = (start[0] < 256) ? EXACT : EXACT_ONLY8;
+                }
+                else if (start[0] < 256) { /* /i, not /l, and the code point is
+                                              small */
+
+                    /* Under /i, it gets a little tricky.  A code point that
+                     * doesn't participate in a fold should be an EXACT node.
+                     * We know this one isn't the result of a simple fold, or
+                     * there'd be more than one code point in the list, but it
+                     * could be part of a multi- character fold.  In that case
+                     * we better not create an EXACT node, as we would wrongly
+                     * be telling the optimizer that this code point must be in
+                     * the target string, and that is wrong.  This is because
+                     * if the sequence around this code point forms a
+                     * multi-char fold, what needs to be in the string could be
+                     * the code point that folds to the sequence.
+                     *
+                     * This handles the case of below-256 code points, as we
+                     * have an easy look up for those.  The next clause handles
+                     * the above-255 ones */
+                    op = IS_IN_SOME_FOLD_L1(start[0])
+                         ? EXACTFU
+                         : EXACT;
+                }
+                else {  /* /i, larger code point.  Since we are under /i, and
+                           have just this code point, we know that it can't
+                           fold to something else, so PL_InMultiCharFold
+                           applies to it */
+                    op = _invlist_contains_cp(PL_InMultiCharFold,
+                                              start[0])
+                         ? EXACTFU_ONLY8
+                         : EXACT_ONLY8;
+                }
+
+                value = start[0];
+            }
+            else if (  ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY)
+                     && _invlist_contains_cp(PL_in_some_fold, start[0]))
+            {
+                /* Here, the only runtime dependency, if any, is from /d, and
+                 * the class matches more than one code point, and the lowest
+                 * code point participates in some fold.  It might be that the
+                 * other code points are /i equivalent to this one, and hence
+                 * they would be representable by an EXACTFish node.  Above, we
+                 * eliminated classes that contain too many code points to be
+                 * EXACTFish, with the test for MAX_FOLD_FROMS
+                 *
+                 * First, special case the ASCII fold pairs, like 'B' and 'b'.
+                 * We do this because we have EXACTFAA at our disposal for the
+                 * ASCII range */
+                if (partial_cp_count == 2 && isASCII(start[0])) {
+
+                    /* The only ASCII characters that participate in folds are
+                     * alphabetics */
+                    assert(isALPHA(start[0]));
+                    if (   end[0] == start[0]   /* First range is a single
+                                                   character, so 2nd exists */
+                        && isALPHA_FOLD_EQ(start[0], start[1]))
+                    {
+
+                        /* Here, is part of an ASCII fold pair */
 
-                    value = start;
+                        if (   ASCII_FOLD_RESTRICTED
+                            || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0]))
+                        {
+                            /* If the second clause just above was true, it
+                             * means we can't be under /i, or else the list
+                             * would have included more than this fold pair.
+                             * Therefore we have to exclude the possibility of
+                             * whatever else it is that folds to these, by
+                             * using EXACTFAA */
+                            op = EXACTFAA;
+                        }
+                        else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) {
 
-                    if (! FOLD) {
-                        op = (LOC)
-                             ? EXACTL
-                             : EXACT;
+                            /* Here, there's no simple fold that start[0] is part
+                             * of, but there is a multi-character one.  If we
+                             * are not under /i, we want to exclude that
+                             * possibility; if under /i, we want to include it
+                             * */
+                            op = (FOLD) ? EXACTFU : EXACTFAA;
+                        }
+                        else {
+
+                            /* Here, the only possible fold start[0] participates
+                             * in is with start[1].  /i or not isn't relevant */
+                            op = EXACTFU;
+                        }
+
+                        value = toFOLD(start[0]);
+                    }
+                }
+                else if (  ! upper_latin1_only_utf8_matches
+                         || (   _invlist_len(upper_latin1_only_utf8_matches)
+                                                                          == 2
+                             && PL_fold_latin1[
+                               invlist_highest(upper_latin1_only_utf8_matches)]
+                             == start[0]))
+                {
+                    /* Here, the smallest character is non-ascii or there are
+                     * more than 2 code points matched by this node.  Also, we
+                     * either don't have /d UTF-8 dependent matches, or if we
+                     * do, they look like they could be a single character that
+                     * is the fold of the lowest one in the always-match list.
+                     * This test quickly excludes most of the false positives
+                     * when there are /d UTF-8 dependent matches.  These are
+                     * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN
+                     * SMALL LETTER A WITH GRAVE iff the target string is
+                     * UTF-8.  (We don't have to worry above about exceeding
+                     * the array bounds of PL_fold_latin1[] because any code
+                     * point in 'upper_latin1_only_utf8_matches' is below 256.)
+                     *
+                     * EXACTFAA would apply only to pairs (hence exactly 2 code
+                     * points) in the ASCII range, so we can't use it here to
+                     * artificially restrict the fold domain, so we check if
+                     * the class does or does not match some EXACTFish node.
+                     * Further, if we aren't under /i, and the folded-to
+                     * character is part of a multi-character fold, we can't do
+                     * this optimization, as the sequence around it could be
+                     * that multi-character fold, and we don't here know the
+                     * context, so we have to assume it is that multi-char
+                     * fold, to prevent potential bugs.
+                     *
+                     * To do the general case, we first find the fold of the
+                     * lowest code point (which may be higher than the lowest
+                     * one), then find everything that folds to it.  (The data
+                     * structure we have only maps from the folded code points,
+                     * so we have to do the earlier step.) */
+
+                    Size_t foldlen;
+                    U8 foldbuf[UTF8_MAXBYTES_CASE];
+                    UV folded = _to_uni_fold_flags(start[0],
+                                                        foldbuf, &foldlen, 0);
+                    unsigned int first_fold;
+                    const unsigned int * remaining_folds;
+                    Size_t folds_to_this_cp_count = _inverse_folds(
+                                                            folded,
+                                                            &first_fold,
+                                                            &remaining_folds);
+                    Size_t folds_count = folds_to_this_cp_count + 1;
+                    SV * fold_list = _new_invlist(folds_count);
+                    unsigned int i;
+
+                    /* If there are UTF-8 dependent matches, create a temporary
+                     * list of what this node matches, including them. */
+                    SV * all_cp_list = NULL;
+                    SV ** use_this_list = &cp_list;
+
+                    if (upper_latin1_only_utf8_matches) {
+                        all_cp_list = _new_invlist(0);
+                        use_this_list = &all_cp_list;
+                        _invlist_union(cp_list,
+                                       upper_latin1_only_utf8_matches,
+                                       use_this_list);
                     }
-                    else if (LOC) {
 
-                        /* A locale node under folding with one code point can
-                         * be an EXACTFL, as its fold won't be calculated until
-                         * runtime */
-                        op = EXACTFL;
+                    /* Having gotten everything that participates in the fold
+                     * containing the lowest code point, we turn that into an
+                     * inversion list, making sure everything is included. */
+                    fold_list = add_cp_to_invlist(fold_list, start[0]);
+                    fold_list = add_cp_to_invlist(fold_list, folded);
+                    fold_list = add_cp_to_invlist(fold_list, first_fold);
+                    for (i = 0; i < folds_to_this_cp_count - 1; i++) {
+                        fold_list = add_cp_to_invlist(fold_list,
+                                                        remaining_folds[i]);
                     }
-                    else {
 
-                        /* Here, we are generally folding, but there is only
-                         * one code point to match.  If we have to, we use an
-                         * EXACT node, but it would be better for joining with
-                         * adjacent nodes in the optimization phase if we used
-                         * the same EXACTFish node that any such are likely to
-                         * be.  We can do this iff the code point doesn't
-                         * participate in any folds.  For example, an EXACTF of
-                         * a colon is the same as an EXACT one, since nothing
-                         * folds to or from a colon. */
-                        if (value < 256) {
-                            if (IS_IN_SOME_FOLD_L1(value)) {
-                                op = EXACT;
-                            }
-                        }
-                        else {
-                            if (_invlist_contains_cp(PL_in_some_fold, value)) {
-                                op = EXACT;
+                    /* If the fold list is identical to what's in this ANYOF
+                     * node, the node can be represented by an EXACTFish one
+                     * instead */
+                    if (_invlistEQ(*use_this_list, fold_list,
+                                   0 /* Don't complement */ )
+                    ) {
+
+                        /* But, we have to be careful, as mentioned above.
+                         * Just the right sequence of characters could match
+                         * this if it is part of a multi-character fold.  That
+                         * IS what we want if we are under /i.  But it ISN'T
+                         * what we want if not under /i, as it could match when
+                         * it shouldn't.  So, when we aren't under /i and this
+                         * character participates in a multi-char fold, we
+                         * don't optimize into an EXACTFish node.  So, for each
+                         * case below we have to check if we are folding
+                         * and if not, if it is not part of a multi-char fold.
+                         * */
+                        if (start[0] > 255) {    /* Highish code point */
+                            if (FOLD || ! _invlist_contains_cp(
+                                            PL_InMultiCharFold, folded))
+                            {
+                                op = (LOC)
+                                     ? EXACTFLU8
+                                     : (ASCII_FOLD_RESTRICTED)
+                                       ? EXACTFAA
+                                       : EXACTFU_ONLY8;
+                                value = folded;
                             }
+                        }   /* Below, the lowest code point < 256 */
+                        else if (    FOLD
+                                 &&  folded == 's'
+                                 &&  DEPENDS_SEMANTICS)
+                        {   /* An EXACTF node containing a single character
+                                's', can be an EXACTFU if it doesn't get
+                                joined with an adjacent 's' */
+                            op = EXACTFU_S_EDGE;
+                            value = folded;
                         }
+                        else if (    FOLD
+                                || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0]))
+                        {
+                            if (upper_latin1_only_utf8_matches) {
+                                op = EXACTF;
 
-                        /* If we haven't found the node type, above, it means
-                         * we can use the prevailing one */
-                        if (op == END) {
-                            op = compute_EXACTish(pRExC_state);
+                                /* We can't use the fold, as that only matches
+                                 * under UTF-8 */
+                                value = start[0];
+                            }
+                            else if (     UNLIKELY(start[0] == MICRO_SIGN)
+                                     && ! UTF)
+                            {   /* EXACTFUP is a special node for this
+                                   character */
+                                op = (ASCII_FOLD_RESTRICTED)
+                                     ? EXACTFAA
+                                     : EXACTFUP;
+                                value = MICRO_SIGN;
+                            }
+                            else if (     ASCII_FOLD_RESTRICTED
+                                     && ! isASCII(start[0]))
+                            {   /* For ASCII under /iaa, we can use EXACTFU
+                                   below */
+                                op = EXACTFAA;
+                                value = folded;
+                            }
+                            else {
+                                op = EXACTFU;
+                                value = folded;
+                            }
                         }
                     }
-                }
-            }   /* End of first range contains just a single code point */
-            else if (start == 0) {
-                if (end == UV_MAX) {
-                    op = SANY;
-                    MARK_NAUGHTY(1);
-                }
-                else if (end == '\n' - 1
-                        && invlist_iternext(cp_list, &start, &end)
-                        && start == '\n' + 1 && end == UV_MAX)
-                {
-                    op = REG_ANY;
-                    MARK_NAUGHTY(1);
+
+                    SvREFCNT_dec_NN(fold_list);
+                    SvREFCNT_dec(all_cp_list);
                 }
             }
-            invlist_iterfinish(cp_list);
 
             if (op != END) {
-                if (PL_regkind[op] != EXACT) {
-                    ret = reg_node(pRExC_state, op);
+
+                /* Here, we have calculated what EXACTish node we would use.
+                 * But we don't use it if it would require converting the
+                 * pattern to UTF-8, unless not using it could cause us to miss
+                 * some folds (hence be buggy) */
+
+                if (! UTF && value > 255) {
+                    SV * in_multis = NULL;
+
+                    assert(FOLD);
+
+                    /* If there is no code point that is part of a multi-char
+                     * fold, then there aren't any matches, so we don't do this
+                     * optimization.  Otherwise, it could match depending on
+                     * the context around us, so we do upgrade */
+                    _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis);
+                    if (UNLIKELY(_invlist_len(in_multis) != 0)) {
+                        REQUIRE_UTF8(flagp);
+                    }
+                    else {
+                        op = END;
+                    }
                 }
-                else {
+
+                if (op != END) {
                     U8 len = (UTF) ? UVCHR_SKIP(value) : 1;
 
                     ret = regnode_guts(pRExC_state, op, len, "exact");
@@ -18291,11 +18634,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                     else {
                         uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value);
                     }
+                    goto not_anyof;
                 }
-                goto not_anyof;
             }
+        }
 
-            {
+        if (! has_runtime_dependency) {
 
             /* See if this can be turned into an ANYOFM node.  Think about the
              * bit patterns in two different bytes.  In some positions, the
@@ -18333,7 +18677,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             }
 
             if (invlist_highest(cp_list) <= max_permissible) {
-                UV this_start, this_end, lowest_cp;
+                UV this_start, this_end;
+                UV lowest_cp = UV_MAX;  /* inited to suppress compiler warn */
                 U8 bits_differing = 0;
                 Size_t full_cp_count = 0;
                 bool first_time = TRUE;
@@ -18409,18 +18754,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
             if (op != END) {
                 goto not_anyof;
             }
-            }
+        }
 
-        {
+        if (! posixl) {
             PERL_UINT_FAST8_T type;
+            SV * intersection = NULL;
+            SV* d_invlist = NULL;
 
-            /* Here, didn't find an optimization.  See if this matches any
-             * of the POSIX classes.  The POSIXA ones are about the same speed
-             * as ANYOF ops, but take less room; the ones that have
-             * above-Latin1 code point matches are somewhat faster than ANYOF.
-             * */
+            /* See if this matches any of the POSIX classes.  The POSIXA and
+             * POSIXD ones are about the same speed as ANYOF ops, but take less
+             * room; the ones that have above-Latin1 code point matches are
+             * somewhat faster than ANYOF.  */
 
-            for (type = POSIXU; type <= POSIXA; type++) {
+            for (type = POSIXA; type >= POSIXD; type--) {
+                int posix_class;
+
+                if (type == POSIXL) {   /* But not /l posix classes */
+                    continue;
+                }
 
                 for (posix_class = 0;
                      posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC;
@@ -18437,30 +18788,90 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                         official_code_points = &PL_XPosix_ptrs[posix_class];
                     }
 
+                    /* Skip non-existent classes of this type.  e.g. \v only
+                     * has an entry in PL_XPosix_ptrs */
+                    if (! *official_code_points) {
+                        continue;
+                    }
+
                     /* Try both the regular class, and its inversion */
                     for (try_inverted = 0; try_inverted < 2; try_inverted++) {
-                        /* Check if matches, normal or inverted */
-                        if (*official_code_points) {
-                            if (_invlistEQ(*our_code_points,
-                                        *official_code_points,
-                                        try_inverted))
+                        bool this_inverted = invert ^ try_inverted;
+
+                        if (type != POSIXD) {
+
+                            /* This class that isn't /d can't match if we have
+                             * /d dependencies */
+                            if (has_runtime_dependency
+                                                    & HAS_D_RUNTIME_DEPENDENCY)
                             {
-                                ret = reg_node(pRExC_state, (try_inverted)
-                                                            ? type + NPOSIXA
-                                                                   - POSIXA
-                                                            : type);
-                                FLAGS(REGNODE_p(ret)) = posix_class;
-                                goto not_anyof;
+                                continue;
                             }
                         }
+                        else /* is /d */ if (! this_inverted) {
+
+                            /* /d classes don't match anything non-ASCII below
+                             * 256 unconditionally (which cp_list contains) */
+                            _invlist_intersection(cp_list, PL_UpperLatin1,
+                                                           &intersection);
+                            if (_invlist_len(intersection) != 0) {
+                                continue;
+                            }
+
+                            SvREFCNT_dec(d_invlist);
+                            d_invlist = invlist_clone(cp_list, NULL);
+
+                            /* But under UTF-8 it turns into using /u rules.
+                             * Add the things it matches under these conditions
+                             * so that we check below that these are identical
+                             * to what the tested class should match */
+                            if (upper_latin1_only_utf8_matches) {
+                                _invlist_union(
+                                            d_invlist,
+                                            upper_latin1_only_utf8_matches,
+                                            &d_invlist);
+                            }
+                            our_code_points = &d_invlist;
+                        }
+                        else {  /* POSIXD, inverted.  If this doesn't have this
+                                   flag set, it isn't /d. */
+                            if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
+                            {
+                                continue;
+                            }
+                            our_code_points = &cp_list;
+                        }
+
+                        /* Here, have weeded out some things.  We want to see
+                         * if the list of characters this node contains
+                         * ('*our_code_points') precisely matches those of the
+                         * class we are currently checking against
+                         * ('*official_code_points'). */
+                        if (_invlistEQ(*our_code_points,
+                                       *official_code_points,
+                                       try_inverted))
+                        {
+                            /* Here, they precisely match.  Optimize this ANYOF
+                             * node into its equivalent POSIX one of the
+                             * correct type, possibly inverted */
+                            ret = reg_node(pRExC_state, (try_inverted)
+                                                        ? type + NPOSIXA
+                                                                - POSIXA
+                                                        : type);
+                            FLAGS(REGNODE_p(ret)) = posix_class;
+                            SvREFCNT_dec(d_invlist);
+                            SvREFCNT_dec(intersection);
+                            goto not_anyof;
+                        }
                     }
                 }
             }
-        }
+            SvREFCNT_dec(d_invlist);
+            SvREFCNT_dec(intersection);
         }
     }   /* End of seeing if can optimize it into a different node */
 
-    /* It's going to be an ANYOF node. */
+  is_anyof: /* It's going to be an ANYOF node. */
     op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY)
          ? ANYOFD
          : ((posixl)
@@ -18523,11 +18934,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
                   only_utf8_locale_list,
                   swash, cBOOL(has_runtime_dependency
                                                 & HAS_USER_DEFINED_PROPERTY));
-
-    if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) {
-        RExC_contains_locale = 1;
-    }
-
     return ret;
 
   not_anyof:
@@ -18537,7 +18943,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
 
     Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start,
                                            RExC_parse - orig_parse);;
-    SvREFCNT_dec_NN(cp_list);;
+    SvREFCNT_dec(cp_list);;
     return ret;
 }
 
index d114e6a..493358c 100644 (file)
@@ -84,28 +84,387 @@ sub  get_compiled ($) {
     return $_;
 }
 
+# Note: EXACTish lowercases the hex; ANYOF uppercases, without braces
+
 my @tests = (
+    '[\xe0\xc0]' => 'EXACTFU <\\x{e0}>',
+    '[\xe1\xc1]' => 'EXACTFU <\\x{e1}>',
+    '[\xe2\xc2]' => 'EXACTFU <\\x{e2}>',
+    '[\xe3\xc3]' => 'EXACTFU <\\x{e3}>',
+    '[\xe4\xc4]' => 'EXACTFU <\\x{e4}>',
+    '[\xc5\xe5]' => 'ANYOF[\\xC5\\xE5]',
+    '[\xe6\xc6]' => 'EXACTFU <\\x{e6}>',
+    '[\xe7\xc7]' => 'EXACTFU <\\x{e7}>',
+    '[\xe8\xc8]' => 'EXACTFU <\\x{e8}>',
+    '[\xe9\xc9]' => 'EXACTFU <\\x{e9}>',
+    '[\xea\xca]' => 'EXACTFU <\\x{ea}>',
+    '[\xeb\xcb]' => 'EXACTFU <\\x{eb}>',
+    '[\xec\xcc]' => 'EXACTFU <\\x{ec}>',
+    '[\xee\xce]' => 'EXACTFU <\\x{ee}>',
+    '[\xef\xcf]' => 'EXACTFU <\\x{ef}>',
+    '[\xf0\xd0]' => 'EXACTFU <\\x{f0}>',
+    '[\xf1\xd1]' => 'EXACTFU <\\x{f1}>',
+    '[\xf2\xd2]' => 'EXACTFU <\\x{f2}>',
+    '[\xf3\xd3]' => 'EXACTFU <\\x{f3}>',
+    '[\xf4\xd4]' => 'EXACTFU <\\x{f4}>',
+    '[\xf5\xd5]' => 'EXACTFU <\\x{f5}>',
+    '[\xf6\xd6]' => 'EXACTFU <\\x{f6}>',
+    '[\xf8\xd8]' => 'EXACTFU <\\x{f8}>',
+    '[\xf9\xd9]' => 'EXACTFU <\\x{f9}>',
+    '[\xfa\xda]' => 'EXACTFU <\\x{fa}>',
+    '[\xfb\xdb]' => 'EXACTFU <\\x{fb}>',
+    '[\xfc\xdc]' => 'EXACTFU <\\x{fc}>',
+    '[\xfd\xdd]' => 'EXACTFU <\\x{fd}>',
+    '[\xfe\xde]' => 'EXACTFU <\\x{fe}>',
+
     '[[{]' => 'ANYOFM[\[\{]',
     '[^\S ]' => 'ANYOFD[\t\n\x0B\f\r{utf8}\x85\xA0][1680 2000-200A 2028-2029 202F 205F 3000]',
     '[^\n\r]' => 'ANYOF[^\n\r][0100-INFTY]',
     '[^\/\|,\$\%%\@\ \%"\<\>\:\#\&\*\{\}\[\]\(\)]' => 'ANYOF[^ "#$%&()*,/:<>@\[\]\{|\}][0100-INFTY]',
+    '[[:ascii:]]' => 'ANYOFM[\x00-\x7F]',
+    '[[:^ascii:]]' => 'NANYOFM[\x00-\x7F]',
+    '[[:^ascii:]\x{2C2}]' => 'NANYOFM[\x00-\x7F]',
+    '(?u)[[:ascii:]]' => 'ANYOFM[\x00-\x7F]',
+    '(?u)[[:^ascii:]]' => 'NANYOFM[\x00-\x7F]',
+    '(?a)[[:ascii:]]' => 'ANYOFM[\x00-\x7F]',
+    '(?a)[[:^ascii:]]' => 'NANYOFM[\x00-\x7F]',
+    '(?a)[[:^ascii:]\x{2C2}]' => 'NANYOFM[\x00-\x7F]',
+    '[[:cntrl:]]' => 'POSIXD[:cntrl:]',
     '[^[:^print:][:^ascii:]]' => 'POSIXA[:print:]',
-    '[ [:blank:]]' => 'ANYOFD[\t {utf8}\xA0][1680 2000-200A 202F 205F 3000]',
-    '[_[:^blank:]]' => 'ANYOFD[^\t {utf8}\xA0][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]',
+    '[[:blank:]]' => 'POSIXD[:blank:]',
+    '[ [:blank:]]' => 'POSIXD[:blank:]',
+    '[_[:blank:]]' => 'ANYOFD[\t _{utf8}\xA0][1680 2000-200A 202F 205F 3000]',
+    '[_[:^blank:]]' => 'NPOSIXD[:blank:]',
     '[\xA0[:^blank:]]' => 'ANYOF[^\t ][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]',
-    '[ [:blank:]]' => 'ANYOFD[\t {utf8}\xA0][1680 2000-200A 202F 205F 3000]',
-    '[_[:^blank:]]' => 'ANYOFD[^\t {utf8}\xA0][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]',
-    '(?d:[_[:^blank:]])' => 'ANYOFD[^\t {utf8}\xA0][0100-167F 1681-1FFF 200B-202E 2030-205E 2060-2FFF 3001-INFTY]',
+    '(?d:[_[:^blank:]])' => 'NPOSIXD[:blank:]',
     '[\x{07}-\x{0B}]' => 'ANYOF[\a\b\t\n\x0B]',
-    '(?il:[\x{212A}])' => 'ANYOFL{i}[{utf8 locale}Kk][212A]',
-    '(?il:(?[\x{212A}]))' => 'ANYOFL{utf8-locale-reqd}[Kk][212A]',
+    '(?il)[\x{212A}]' => 'ANYOFL{i}[{utf8 locale}Kk][212A]',
+    '(?il)(?[\x{212A}])' => 'ANYOFL{utf8-locale-reqd}[Kk][212A]',
+
+    '(?i)b[s]\xe0' => 'EXACTFU <b>',    # The s goes into a 2nd node
 
     'ebcdic_ok_below_this_marker',
 
+    '[aA]' => 'EXACTFAA <a>',
+    '[bB]' => 'EXACTFU <b>',
+    '[kK]' => 'EXACTFAA <k>',
+
+    '(?i:[^:])' => 'NANYOFM[:]',
+
+    '[^\n]' => 'REG_ANY',
+
+    '[[:alpha:]]' => 'POSIXD[:alpha:]',
+    '[[:^alpha:]]' => 'NPOSIXD[:alpha:]',
+    '[[:^alpha:]\x{2C2}]' => 'NPOSIXU[:alpha:]',
+    '(?l)[[:alpha:]]' => 'POSIXL[:alpha:]',
+    '(?l)[[:^alpha:]]' => 'NPOSIXL[:alpha:]',
+    '(?l)[[:^alpha:]\x{2C2}]' => 'NPOSIXL[:alpha:]',
+    '(?u)[[:alpha:]]' => 'POSIXU[:alpha:]',
+    '(?u)[[:^alpha:]]' => 'NPOSIXU[:alpha:]',
+    '(?a)[[:alpha:]]' => 'POSIXA[:alpha:]',
+    '(?a)[[:^alpha:]]' => 'NPOSIXA[:alpha:]',
+    '(?a)[[:^alpha:]\x{2C2}]' => 'NPOSIXA[:alpha:]',
+    '[[:alpha:][:^alpha:]]' => 'SANY',
+    '[^[:alpha:][:^alpha:]]' => 'OPFAIL',
+    '(?l)[[:alpha:][:^alpha:]]' => 'SANY',
+    '(?l)[^[:alpha:][:^alpha:]]' => 'OPFAIL',
+    '(?u)[[:alpha:][:^alpha:]]' => 'SANY',
+    '(?u)[^[:alpha:][:^alpha:]]' => 'OPFAIL',
+    '(?a)[[:alpha:][:^alpha:]]' => 'SANY',
+    '(?a)[^[:alpha:][:^alpha:]]' => 'OPFAIL',
+    '[[:alnum:]]' => 'POSIXD[:alnum:]',
+    '[[:^alnum:]]' => 'NPOSIXD[:alnum:]',
+    '[[:^alnum:]\x{2C2}]' => 'NPOSIXU[:alnum:]',
+    '(?l)[[:alnum:]]' => 'POSIXL[:alnum:]',
+    '(?l)[[:^alnum:]]' => 'NPOSIXL[:alnum:]',
+    '(?l)[[:^alnum:]\x{2C2}]' => 'NPOSIXL[:alnum:]',
+    '(?u)[[:alnum:]]' => 'POSIXU[:alnum:]',
+    '(?u)[[:^alnum:]]' => 'NPOSIXU[:alnum:]',
+    '(?a)[[:alnum:]]' => 'POSIXA[:alnum:]',
+    '(?a)[[:^alnum:]]' => 'NPOSIXA[:alnum:]',
+    '(?a)[[:^alnum:]\x{2C2}]' => 'NPOSIXA[:alnum:]',
+    '[[:alnum:][:^alnum:]]' => 'SANY',
+    '[^[:alnum:][:^alnum:]]' => 'OPFAIL',
+    '(?l)[[:alnum:][:^alnum:]]' => 'SANY',
+    '(?l)[^[:alnum:][:^alnum:]]' => 'OPFAIL',
+    '(?u)[[:alnum:][:^alnum:]]' => 'SANY',
+    '(?u)[^[:alnum:][:^alnum:]]' => 'OPFAIL',
+    '(?a)[[:alnum:][:^alnum:]]' => 'SANY',
+    '(?a)[^[:alnum:][:^alnum:]]' => 'OPFAIL',
+    '(?l)[[:ascii:]]' => 'POSIXL[:ascii:]',
+    '(?l)[[:^ascii:]]' => 'NPOSIXL[:ascii:]',
+    '(?l)[[:^ascii:]\x{2C2}]' => 'NPOSIXL[:ascii:]',
+    '[[:ascii:][:^ascii:]]' => 'SANY',
+    '[^[:ascii:][:^ascii:]]' => 'OPFAIL',
+    '(?l)[[:ascii:][:^ascii:]]' => 'SANY',
+    '(?l)[^[:ascii:][:^ascii:]]' => 'OPFAIL',
+    '(?u)[[:ascii:][:^ascii:]]' => 'SANY',
+    '(?u)[^[:ascii:][:^ascii:]]' => 'OPFAIL',
+    '(?a)[[:ascii:][:^ascii:]]' => 'SANY',
+    '(?a)[^[:ascii:][:^ascii:]]' => 'OPFAIL',
+    '[[:^blank:]]' => 'NPOSIXD[:blank:]',
+    '[[:^blank:]\x{2C2}]' => 'NPOSIXU[:blank:]',
+    '(?l)[[:blank:]]' => 'POSIXL[:blank:]',
+    '(?l)[[:^blank:]]' => 'NPOSIXL[:blank:]',
+    '(?l)[[:^blank:]\x{2C2}]' => 'NPOSIXL[:blank:]',
+    '(?u)[[:blank:]]' => 'POSIXU[:blank:]',
+    '(?u)[[:^blank:]]' => 'NPOSIXU[:blank:]',
+    '(?a)[[:blank:]]' => 'POSIXA[:blank:]',
+    '(?a)[[:^blank:]]' => 'NPOSIXA[:blank:]',
+    '(?a)[[:^blank:]\x{2C2}]' => 'NPOSIXA[:blank:]',
+    '[[:blank:]]' => 'POSIXD[:blank:]',
+    '[[:blank:][:^blank:]]' => 'SANY',
+    '[^[:blank:][:^blank:]]' => 'OPFAIL',
+    '(?l)[[:blank:][:^blank:]]' => 'SANY',
+    '(?l)[^[:blank:][:^blank:]]' => 'OPFAIL',
+    '(?u)[[:blank:][:^blank:]]' => 'SANY',
+    '(?u)[^[:blank:][:^blank:]]' => 'OPFAIL',
+    '(?a)[[:blank:][:^blank:]]' => 'SANY',
+    '(?a)[^[:blank:][:^blank:]]' => 'OPFAIL',
+    '[[:^cntrl:]]' => 'NPOSIXD[:cntrl:]',
+    '[[:^cntrl:]\x{2C2}]' => 'NPOSIXU[:cntrl:]',
+    '(?l)[[:cntrl:]]' => 'POSIXL[:cntrl:]',
+    '(?l)[[:^cntrl:]]' => 'NPOSIXL[:cntrl:]',
+    '(?l)[[:^cntrl:]\x{2C2}]' => 'NPOSIXL[:cntrl:]',
+    '(?u)[[:cntrl:]]' => 'POSIXU[:cntrl:]',
+    '(?u)[[:^cntrl:]]' => 'NPOSIXU[:cntrl:]',
+    '(?a)[[:cntrl:]]' => 'POSIXA[:cntrl:]',
+    '(?a)[[:^cntrl:]]' => 'NPOSIXA[:cntrl:]',
+    '(?a)[[:^cntrl:]\x{2C2}]' => 'NPOSIXA[:cntrl:]',
+    '[[:cntrl:][:^cntrl:]]' => 'SANY',
+    '[^[:cntrl:][:^cntrl:]]' => 'OPFAIL',
+    '(?l)[[:cntrl:][:^cntrl:]]' => 'SANY',
+    '(?l)[^[:cntrl:][:^cntrl:]]' => 'OPFAIL',
+    '(?u)[[:cntrl:][:^cntrl:]]' => 'SANY',
+    '(?u)[^[:cntrl:][:^cntrl:]]' => 'OPFAIL',
+    '(?a)[[:cntrl:][:^cntrl:]]' => 'SANY',
+    '(?a)[^[:cntrl:][:^cntrl:]]' => 'OPFAIL',
+    '[[:digit:]]' => 'POSIXU[\d]',
+    '[[:^digit:]]' => 'NPOSIXU[\d]',
+    '[[:^digit:]\x{2C2}]' => 'NPOSIXU[\d]',
+    '(?l)[[:digit:]]' => 'POSIXL[\d]',
+    '(?l)[[:^digit:]]' => 'NPOSIXL[\d]',
+    '(?l)[[:^digit:]\x{2C2}]' => 'NPOSIXL[\d]',
+    '(?u)[[:digit:]]' => 'POSIXU[\d]',
+    '(?u)[[:^digit:]]' => 'NPOSIXU[\d]',
+    '(?a)[[:digit:]]' => 'POSIXA[\d]',
+    '(?a)[[:^digit:]]' => 'NPOSIXA[\d]',
+    '(?a)[[:^digit:]\x{2C2}]' => 'NPOSIXA[\d]',
+    '[[:digit:][:^digit:]]' => 'SANY',
+    '[^[:digit:][:^digit:]]' => 'OPFAIL',
+    '(?l)[[:digit:][:^digit:]]' => 'SANY',
+    '(?l)[^[:digit:][:^digit:]]' => 'OPFAIL',
+    '(?u)[[:digit:][:^digit:]]' => 'SANY',
+    '(?u)[^[:digit:][:^digit:]]' => 'OPFAIL',
+    '(?a)[[:digit:][:^digit:]]' => 'SANY',
+    '(?a)[^[:digit:][:^digit:]]' => 'OPFAIL',
+    '[[:graph:]]' => 'POSIXD[:graph:]',
+    '[[:^graph:]]' => 'NPOSIXD[:graph:]',
+    '[[:^graph:]\x{FFFF}]' => 'NPOSIXU[:graph:]',
+    '(?l)[[:graph:]]' => 'POSIXL[:graph:]',
+    '(?l)[[:^graph:]]' => 'NPOSIXL[:graph:]',
+    '(?l)[[:^graph:]\x{FFFF}]' => 'NPOSIXL[:graph:]',
+    '(?u)[[:graph:]]' => 'POSIXU[:graph:]',
+    '(?u)[[:^graph:]]' => 'NPOSIXU[:graph:]',
+    '(?a)[[:graph:]]' => 'POSIXA[:graph:]',
+    '(?a)[[:^graph:]]' => 'NPOSIXA[:graph:]',
+    '(?a)[[:^graph:]\x{FFFF}]' => 'NPOSIXA[:graph:]',
+    '[[:graph:][:^graph:]]' => 'SANY',
+    '[^[:graph:][:^graph:]]' => 'OPFAIL',
+    '(?l)[[:graph:][:^graph:]]' => 'SANY',
+    '(?l)[^[:graph:][:^graph:]]' => 'OPFAIL',
+    '(?u)[[:graph:][:^graph:]]' => 'SANY',
+    '(?u)[^[:graph:][:^graph:]]' => 'OPFAIL',
+    '(?a)[[:graph:][:^graph:]]' => 'SANY',
+    '(?a)[^[:graph:][:^graph:]]' => 'OPFAIL',
+    '[[:lower:]]' => 'POSIXD[:lower:]',
+    '[[:^lower:]]' => 'NPOSIXD[:lower:]',
+    '[[:^lower:]\x{2C2}]' => 'NPOSIXU[:lower:]',
+    '(?l)[[:lower:]]' => 'POSIXL[:lower:]',
+    '(?l)[[:^lower:]]' => 'NPOSIXL[:lower:]',
+    '(?l)[[:^lower:]\x{2C2}]' => 'NPOSIXL[:lower:]',
+    '(?u)[[:lower:]]' => 'POSIXU[:lower:]',
+    '(?u)[[:^lower:]]' => 'NPOSIXU[:lower:]',
+    '(?a)[[:lower:]]' => 'POSIXA[:lower:]',
+    '(?a)[[:^lower:]]' => 'NPOSIXA[:lower:]',
+    '(?a)[[:^lower:]\x{2C2}]' => 'NPOSIXA[:lower:]',
+    '[[:lower:][:^lower:]]' => 'SANY',
+    '[^[:lower:][:^lower:]]' => 'OPFAIL',
+    '(?l)[[:lower:][:^lower:]]' => 'SANY',
+    '(?l)[^[:lower:][:^lower:]]' => 'OPFAIL',
+    '(?u)[[:lower:][:^lower:]]' => 'SANY',
+    '(?u)[^[:lower:][:^lower:]]' => 'OPFAIL',
+    '(?a)[[:lower:][:^lower:]]' => 'SANY',
+    '(?a)[^[:lower:][:^lower:]]' => 'OPFAIL',
+    '[[:print:]]' => 'POSIXD[:print:]',
+    '[[:^print:]]' => 'NPOSIXD[:print:]',
+    '[[:^print:]\x{FFFF}]' => 'NPOSIXU[:print:]',
+    '(?l)[[:print:]]' => 'POSIXL[:print:]',
+    '(?l)[[:^print:]]' => 'NPOSIXL[:print:]',
+    '(?l)[[:^print:]\x{FFFF}]' => 'NPOSIXL[:print:]',
+    '(?u)[[:print:]]' => 'POSIXU[:print:]',
+    '(?u)[[:^print:]]' => 'NPOSIXU[:print:]',
+    '(?a)[[:print:]]' => 'POSIXA[:print:]',
+    '(?a)[[:^print:]]' => 'NPOSIXA[:print:]',
+    '(?a)[[:^print:]\x{FFFF}]' => 'NPOSIXA[:print:]',
+    '[[:print:][:^print:]]' => 'SANY',
+    '[^[:print:][:^print:]]' => 'OPFAIL',
+    '(?l)[[:print:][:^print:]]' => 'SANY',
+    '(?l)[^[:print:][:^print:]]' => 'OPFAIL',
+    '(?u)[[:print:][:^print:]]' => 'SANY',
+    '(?u)[^[:print:][:^print:]]' => 'OPFAIL',
+    '(?a)[[:print:][:^print:]]' => 'SANY',
+    '(?a)[^[:print:][:^print:]]' => 'OPFAIL',
+    '[[:punct:]]' => 'POSIXD[:punct:]',
+    '[[:^punct:]]' => 'NPOSIXD[:punct:]',
+    '[[:^punct:]\x{2C2}]' => 'NPOSIXU[:punct:]',
+    '(?l)[[:punct:]]' => 'POSIXL[:punct:]',
+    '(?l)[[:^punct:]]' => 'NPOSIXL[:punct:]',
+    '(?l)[[:^punct:]\x{2C2}]' => 'NPOSIXL[:punct:]',
+    '(?u)[[:punct:]]' => 'POSIXU[:punct:]',
+    '(?u)[[:^punct:]]' => 'NPOSIXU[:punct:]',
+    '(?a)[[:punct:]]' => 'POSIXA[:punct:]',
+    '(?a)[[:^punct:]]' => 'NPOSIXA[:punct:]',
+    '(?a)[[:^punct:]\x{2C2}]' => 'NPOSIXA[:punct:]',
+    '[[:punct:][:^punct:]]' => 'SANY',
+    '[^[:punct:][:^punct:]]' => 'OPFAIL',
+    '(?l)[[:punct:][:^punct:]]' => 'SANY',
+    '(?l)[^[:punct:][:^punct:]]' => 'OPFAIL',
+    '(?u)[[:punct:][:^punct:]]' => 'SANY',
+    '(?u)[^[:punct:][:^punct:]]' => 'OPFAIL',
+    '(?a)[[:punct:][:^punct:]]' => 'SANY',
+    '(?a)[^[:punct:][:^punct:]]' => 'OPFAIL',
+    '[[:space:]]' => 'POSIXD[\s]',
+    '[[:^space:]]' => 'NPOSIXD[\s]',
+    '[[:^space:]\x{2C2}]' => 'NPOSIXU[\s]',
+    '(?l)[[:space:]]' => 'POSIXL[\s]',
+    '(?l)[[:^space:]]' => 'NPOSIXL[\s]',
+    '(?l)[[:^space:]\x{2C2}]' => 'NPOSIXL[\s]',
+    '(?u)[[:space:]]' => 'POSIXU[\s]',
+    '(?u)[[:^space:]]' => 'NPOSIXU[\s]',
+    '(?a)[[:space:]]' => 'POSIXA[\s]',
+    '(?a)[[:^space:]]' => 'NPOSIXA[\s]',
+    '(?a)[[:^space:]\x{2C2}]' => 'NPOSIXA[\s]',
+    '[[:space:][:^space:]]' => 'SANY',
+    '[^[:space:][:^space:]]' => 'OPFAIL',
+    '(?l)[[:space:][:^space:]]' => 'SANY',
+    '(?l)[^[:space:][:^space:]]' => 'OPFAIL',
+    '(?u)[[:space:][:^space:]]' => 'SANY',
+    '(?u)[^[:space:][:^space:]]' => 'OPFAIL',
+    '(?a)[[:space:][:^space:]]' => 'SANY',
+    '(?a)[^[:space:][:^space:]]' => 'OPFAIL',
+    '[[:upper:]]' => 'POSIXD[:upper:]',
+    '[[:^upper:]]' => 'NPOSIXD[:upper:]',
+    '[[:^upper:]\x{2C2}]' => 'NPOSIXU[:upper:]',
+    '(?l)[[:upper:]]' => 'POSIXL[:upper:]',
+    '(?l)[[:^upper:]]' => 'NPOSIXL[:upper:]',
+    '(?l)[[:^upper:]\x{2C2}]' => 'NPOSIXL[:upper:]',
+    '(?u)[[:upper:]]' => 'POSIXU[:upper:]',
+    '(?u)[[:^upper:]]' => 'NPOSIXU[:upper:]',
+    '(?a)[[:upper:]]' => 'POSIXA[:upper:]',
+    '(?a)[[:^upper:]]' => 'NPOSIXA[:upper:]',
+    '(?a)[[:^upper:]\x{2C2}]' => 'NPOSIXA[:upper:]',
+    '[[:upper:][:^upper:]]' => 'SANY',
+    '[^[:upper:][:^upper:]]' => 'OPFAIL',
+    '(?l)[[:upper:][:^upper:]]' => 'SANY',
+    '(?l)[^[:upper:][:^upper:]]' => 'OPFAIL',
+    '(?u)[[:upper:][:^upper:]]' => 'SANY',
+    '(?u)[^[:upper:][:^upper:]]' => 'OPFAIL',
+    '(?a)[[:upper:][:^upper:]]' => 'SANY',
+    '(?a)[^[:upper:][:^upper:]]' => 'OPFAIL',
+    '[\v]' => 'POSIXU[\v]',
+    '[^\v]' => 'NPOSIXU[\v]',
+    '[\V\x{2C2}]' => 'NPOSIXU[\v]',
+    '(?l)[\v]' => 'POSIXU[\v]',
+    '(?l)[^\v]' => 'NPOSIXU[\v]',
+    '(?l)[\V\x{2C2}]' => 'NPOSIXU[\v]',
+    '(?u)[\v]' => 'POSIXU[\v]',
+    '(?u)[^\v]' => 'NPOSIXU[\v]',
+    '(?a)[\v]' => 'POSIXU[\v]',
+    '(?a)[^\v]' => 'NPOSIXU[\v]',
+    '(?a)[\V\x{2C2}]' => 'NPOSIXU[\v]',
+    '[\v\V]' => 'SANY',
+    '[^\v\V]' => 'OPFAIL',
+    '(?l)[\v\V]' => 'SANY',
+    '(?l)[^\v\V]' => 'OPFAIL',
+    '(?u)[\v\V]' => 'SANY',
+    '(?u)[^\v\V]' => 'OPFAIL',
+    '(?a)[\v\V]' => 'SANY',
+    '(?a)[^\v\V]' => 'OPFAIL',
+    '[[:word:]]' => 'POSIXD[\w]',
+    '[[:^word:]]' => 'NPOSIXD[\w]',
+    '[[:^word:]\x{2C2}]' => 'NPOSIXU[\w]',
+    '(?l)[[:word:]]' => 'POSIXL[\w]',
+    '(?l)[[:^word:]]' => 'NPOSIXL[\w]',
+    '(?l)[[:^word:]\x{2C2}]' => 'NPOSIXL[\w]',
+    '(?u)[[:word:]]' => 'POSIXU[\w]',
+    '(?u)[[:^word:]]' => 'NPOSIXU[\w]',
+    '(?a)[[:word:]]' => 'POSIXA[\w]',
+    '(?a)[[:^word:]]' => 'NPOSIXA[\w]',
+    '(?a)[[:^word:]\x{2C2}]' => 'NPOSIXA[\w]',
+    '[[:word:][:^word:]]' => 'SANY',
+    '[^[:word:][:^word:]]' => 'OPFAIL',
+    '(?l)[[:word:][:^word:]]' => 'SANY',
+    '(?l)[^[:word:][:^word:]]' => 'OPFAIL',
+    '(?u)[[:word:][:^word:]]' => 'SANY',
+    '(?u)[^[:word:][:^word:]]' => 'OPFAIL',
+    '(?a)[[:word:][:^word:]]' => 'SANY',
+    '(?a)[^[:word:][:^word:]]' => 'OPFAIL',
+    '[[:xdigit:]]' => 'POSIXU[:xdigit:]',
+    '[[:^xdigit:]]' => 'NPOSIXU[:xdigit:]',
+    '[[:^xdigit:]\x{2C2}]' => 'NPOSIXU[:xdigit:]',
+    '(?l)[[:xdigit:]]' => 'POSIXL[:xdigit:]',
+    '(?l)[[:^xdigit:]]' => 'NPOSIXL[:xdigit:]',
+    '(?l)[[:^xdigit:]\x{2C2}]' => 'NPOSIXL[:xdigit:]',
+    '(?u)[[:xdigit:]]' => 'POSIXU[:xdigit:]',
+    '(?u)[[:^xdigit:]]' => 'NPOSIXU[:xdigit:]',
+    '(?a)[[:xdigit:]]' => 'POSIXA[:xdigit:]',
+    '(?a)[[:^xdigit:]]' => 'NPOSIXA[:xdigit:]',
+    '(?a)[[:^xdigit:]\x{2C2}]' => 'NPOSIXA[:xdigit:]',
+    '[[:xdigit:][:^xdigit:]]' => 'SANY',
+    '[^[:xdigit:][:^xdigit:]]' => 'OPFAIL',
+    '(?l)[[:xdigit:][:^xdigit:]]' => 'SANY',
+    '(?l)[^[:xdigit:][:^xdigit:]]' => 'OPFAIL',
+    '(?u)[[:xdigit:][:^xdigit:]]' => 'SANY',
+    '(?u)[^[:xdigit:][:^xdigit:]]' => 'OPFAIL',
+    '(?a)[[:xdigit:][:^xdigit:]]' => 'SANY',
+    '(?a)[^[:xdigit:][:^xdigit:]]' => 'OPFAIL',
+    '(?i)[[:lower:]]' => 'POSIXD[:cased:]',
+    '(?i)[[:^lower:]]' => 'NPOSIXD[:cased:]',
+    '(?i)[[:^lower:]\x{2C2}]' => 'NPOSIXU[:cased:]',
+    '(?i)(?l)[[:lower:]]' => 'POSIXL[:cased:]',
+    '(?i)(?l)[[:^lower:]]' => 'NPOSIXL[:cased:]',
+    '(?i)(?l)[[:^lower:]\x{2C2}]' => 'NPOSIXL[:cased:]',
+    '(?i)(?u)[[:lower:]]' => 'POSIXU[:cased:]',
+    '(?i)(?u)[[:^lower:]]' => 'NPOSIXU[:cased:]',
+    '(?i)(?a)[[:lower:]]' => 'POSIXA[:alpha:]',
+    '(?i)(?a)[[:^lower:]]' => 'NPOSIXA[:alpha:]',
+    '(?i)(?a)[[:^lower:]\x{2C2}]' => 'NPOSIXA[:alpha:]',
+    '(?i)[[:upper:]]' => 'POSIXD[:cased:]',
+    '(?i)[[:^upper:]]' => 'NPOSIXD[:cased:]',
+    '(?i)[[:^upper:]\x{2C2}]' => 'NPOSIXU[:cased:]',
+    '(?i)(?l)[[:upper:]]' => 'POSIXL[:cased:]',
+    '(?i)(?l)[[:^upper:]]' => 'NPOSIXL[:cased:]',
+    '(?i)(?l)[[:^upper:]\x{2C2}]' => 'NPOSIXL[:cased:]',
+    '(?i)(?u)[[:upper:]]' => 'POSIXU[:cased:]',
+    '(?i)(?u)[[:^upper:]]' => 'NPOSIXU[:cased:]',
+    '(?i)(?a)[[:upper:]]' => 'POSIXA[:alpha:]',
+    '(?i)(?a)[[:^upper:]]' => 'NPOSIXA[:alpha:]',
+    '(?i)(?a)[[:^upper:]\x{2C2}]' => 'NPOSIXA[:alpha:]',
+    '(?i)[\d\w]' => 'POSIXD[\w]',
+    '(?i)[\D\w]' => 'SANY',
+    #'(?i)(?l)[\d\w]' => varies depending on Unicode release
+    '(?i)(?l)[\D\w]' => 'ANYOFPOSIXL[\\w\\D][0100-INFTY]',
+    '(?i)(?u)[\d\w]' => 'POSIXU[\w]',
+    '(?i)(?u)[\D\w]' => 'SANY',
+    '(?i)(?a)[\d\w]' => 'POSIXA[\w]',
+    '(?i)(?a)[\D\w]' => 'SANY',
     '(?l:[\x{212A}])' => 'ANYOFL[212A]',
     '(?l:[\s\x{212A}])' => 'ANYOFPOSIXL[\s][1680 2000-200A 2028-2029 202F 205F 212A 3000]',
     '(?l:[^\S\x{202F}])' => 'ANYOFPOSIXL[^\\S][1680 2000-200A 2028-2029 205F 3000]',
-    '(?i:[^:])' => 'NANYOFM[:]',
+
+    '\p{All}' => 'SANY',
+    '\P{All}' => 'OPFAIL',
     '[\p{Any}]' => 'ANYOF[\x00-\xFF][0100-10FFFF]',
 
     '[\p{IsMyRuntimeProperty}]' => 'ANYOF[+utf8::IsMyRuntimeProperty]',
@@ -113,9 +472,12 @@ my @tests = (
     '[a\p{IsMyRuntimeProperty}]' => 'ANYOF[a][+utf8::IsMyRuntimeProperty]',
     '[^a\p{IsMyRuntimeProperty}]' => 'ANYOF[^a{+utf8::IsMyRuntimeProperty}]',
     '[^a\x{100}\p{IsMyRuntimeProperty}]' => 'ANYOF[^a{+utf8::IsMyRuntimeProperty}0100]',
+    '[^\p{All}\p{IsMyRuntimeProperty}]' => 'OPFAIL',
+    '[\p{All}\p{IsMyRuntimeProperty}]' => 'SANY',
 
     '[\x{00}-{INFTY_minus_1}]' => 'ANYOF[\x00-\xFF][0100-INFTY_minus_1]',
     '[\x{00}-{INFTY}]' => 'SANY',
+    '(?i)[\x{100}]' => 'ANYOF[0100-0101]',
     '[\x{101}-{INFTY}]' => 'ANYOF[0101-INFTY]',
     '[\x{101}-{INFTY_minus_1}]' => 'ANYOF[0101-INFTY_minus_1]',
     '[\x{102}\x{104}]' => 'ANYOF[0102 0104]',
@@ -346,8 +708,148 @@ my @tests = (
     '[\x{10C}-{INFTY}\x{104}\x{102}\x{103}]' => 'ANYOF[0102-0104 010C-INFTY]',
     '[{INFTY_minus_1}]' => 'ANYOF[INFTY_minus_1]',
     '[{INFTY}]' => 'ANYOF[INFTY-INFTY]',
+
+    '(?8)(?i)[\x{100}]' => 'EXACTFU_ONLY8 <\x{101}>',
+    '(?8)(?i)[\x{399}]' => 'EXACTFU_ONLY8 <\x{3b9}>',
+    '(?8)(?i)[\x{345}\x{399}\x{3B9}\x{1FBE}]' => 'EXACTFU_ONLY8 <\x{3b9}>',
+    '(?i)[\x{2b9}]' => 'ANYOF[02B9]',           # Doesn't participate in a fold
+    '(?8)(?i)[\x{2b9}]' => 'EXACT_ONLY8 <\x{2b9}>',
+    '(?i)[\x{2bc}]' => 'EXACTFU_ONLY8 <\x{2bc}>', # Part of a multi-char fold, ASCII component
+    '(?i)[\x{390}]' => 'EXACTFU_ONLY8 <\x{3b9}\x{308}\x{301}>', # Part of a multi-char fold, no ASCII component
+
+    '(?i)[\x{1E9E}]' => 'EXACTFU <ss>',
+    '(?iaa)[\x{1E9E}]' => 'EXACTFAA <\x{17f}\x{17f}>',
+    '(?i)[\x{FB00}]' => 'EXACTFU <ff>',
+    '(?iaa)[\x{FB00}]' => 'ANYOF[FB00]',
+    '(?i)[\x{FB00}]' => 'EXACTFU <ff>',
+    '(?i)[\x{FB01}]' => 'EXACTFU <fi>',
+    '(?i)[\x{FB02}]' => 'EXACTFU <fl>',
+    '(?i)[\x{FB03}]' => 'EXACTFU <ffi>',
+    '(?i)[\x{FB04}]' => 'EXACTFU <ffl>',
+    '(?i)[\x{FB05}]' => 'EXACTFU <st>',
+    '(?i)[\x{FB06}]' => 'EXACTFU <st>',
+
+    '[a][b]' => 'EXACT <ab>',
+    '[a]\x{100}' => 'EXACT_ONLY8 <a\x{100}>',
+    '(?8)[\x{100}]a' => 'EXACT_ONLY8 <\x{100}a>',
+    '(?i)[b][c]' => 'EXACTFU <bc>',
+    '(?i)[b]\x{100}' => 'EXACTFU_ONLY8 <b\x{101}>',
+    '(?8)(?i)[\x{100}]b' => 'EXACTFU_ONLY8 <\x{101}b>',
+    '(?i)b[s]' => 'EXACTFU <bs>',
+    '(?i)b[s]c' => 'EXACTFU <bsc>',
+    '(?i)bs[s]c' => 'EXACTF <bss>',  # The c goes into a 2nd node
+    '(?iu)bs[s]c' => 'EXACTFUP <bssc>',
+    '(?i)b[s]sc' => 'EXACTF <bssc>',
+    '(?iu)b[s]sc' => 'EXACTFUP <bssc>',
+    '(?i)[b]st' => 'EXACTFU <bst>',
+    '(?i)[b]st[s]' => 'EXACTFU <bsts>',
+    '(?i)[b]st[s]st' => 'EXACTF <bstsst>',
+    '(?iu)[b]st[s]st' => 'EXACTFUP <bstsst>',
+    '(?i)[s][s]' => 'EXACTF <ss>',
+    '(?iu)[s][s]' => 'EXACTFUP <ss>',
 );
 
+my @single_chars_to_test =   # Each is tried alone inside a [...] class below
+(
+    "\x00",     # Always potentially problematic
+    "\x01",     # Unnamed control
+    "\b",       # Named control
+    "\n",       # Potentially special
+    "\r",       # Potentially special
+    "\cK",      # Potentially special
+    "0",        # Digit
+    ":",        # Not in any fold
+    "A",        # ASCII capital, participates in multi-char fold
+    "a",        # ASCII small, participates in multi-char fold
+    "B",        # ASCII capital, participates only in case-pair fold
+    "b",        # ASCII small, participates only in case-pair fold
+    "K",        # ASCII capital, folded to from above Latin1
+    "k",        # ASCII small, folded to from above Latin1
+    "\c?",      # Potentially special
+    "\x80",     # Latin1 control
+    "\xB5",     # Micro sign, folds to above Latin1
+    "\xC0",     # Latin1 capital, participates only in case-pair fold
+    "\xE0",     # Latin1 small, participates only in case-pair fold
+    "\xC5",     # Latin1 capital, folded to from above Latin1
+    "\xE5",     # Latin1 small, folded to from above Latin1
+    "\xDF",     # Small sharp S; folds to 'ss'
+    "\xF7",     # Doesn't participate in any fold
+    "\xFF",     # Folded to by above Latin1
+    "\x{100}",  # First few above Latin1 characters (through \x{108})
+    "\x{101}",
+    "\x{102}",
+    "\x{103}",
+    "\x{104}",
+    "\x{105}",
+    "\x{106}",
+    "\x{107}",
+    "\x{108}",
+);
+
+# Build a (pattern, expected dump) pair for every character in
+# @single_chars_to_test under each combination of /i, charset modifier, and
+# the '(?8)' upgrade prefix; the pairs are then put at the front of @tests.
+my @single_tests;
+for my $char (@single_chars_to_test) {
+    my $cp = ord $char;
+    my $hex = sprintf "%02x", $cp;
+
+    for my $fold ("", "i") {
+        for my $charset ("", "u", "l", "aa") {
+            my $modifiers = $fold . $charset;
+            $modifiers = "(?$modifiers)" if $modifiers;
+
+            for my $upgrade ("", "(?8)") {
+
+                # The pattern under test is the bracketed form of the character
+                push @single_tests, "$upgrade$modifiers\[\\x{$hex}\]";
+
+                if ($cp < 256 || $upgrade) {
+
+                    # Expect whatever the unbracketed character compiles to
+                    push @single_tests, get_compiled("$upgrade$modifiers\\x{$hex}");
+                }
+                else {
+
+                    # Above Latin1 without the upgrade prefix: expect an
+                    # ANYOF-type node listing the code point plus, under /i,
+                    # its lowercase (or else uppercase) counterpart
+                    my $interior = "";
+                    my @list = $cp;
+                    if ($fold) {
+                        if (lc $char ne $char) {
+                            push @list, ord lc $char;
+                        }
+                        elsif (uc $char ne $char) {
+                            push @list, ord uc $char;
+                        }
+                    }
+                    @list = sort { $a <=> $b } @list;
+                    if (@list == 1) {
+                        $interior = sprintf "%04X", $list[0];
+                    }
+                    elsif (@list == 2) {
+
+                        # Adjacent code points are displayed as a range;
+                        # otherwise they are separated by a single blank, as
+                        # in the 'ANYOF[0102 0104]' expectation in @tests
+                        my $separator = ($list[1] == $list[0] + 1) ? '-' : ' ';
+                        $interior = sprintf "%04X$separator%04X", $list[0], $list[1];
+                    }
+                    else {
+                        # Not reachable: at most one counterpart is added above
+                        die join ", ", @list;
+                    }
+                    my $anyof = ($charset eq "l") ? "ANYOFL" : "ANYOF";
+                    push @single_tests, "$anyof\[$interior\]";
+                }
+            }
+        }
+    }
+}
+
+unshift @tests, @single_tests;
+
 plan(scalar (@tests - 1) / 2);  # -1 because of the marker.
 
 my $skip_ebcdic = $::IS_EBCDIC;
index 48cfb77..a8b6748 100644 (file)
@@ -1675,8 +1675,6 @@ ab[c\\\](??{"x"})]{3}d    ab\\](d y       -       -
 
 /st/i  \x{DF}\x{FB05}  y       $&      \x{FB05}
 /ssst/i        \x{DF}\x{FB05}  y       $&      \x{DF}\x{FB05}
-/[s]s/i        \x{DF}  n       -       -
-/s[s]/i        \x{DF}  n       -       -
 
 # [perl #101970]
 /[[:lower:]]/i \x{100} y       $&      \x{100}
index 949ed97..a557fe3 100644 (file)
@@ -155,6 +155,21 @@ foreach my $test_ref (@CF) {
         # since they use '$u', they are left out of the main loop
         $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
         ok eval $test, "$code - $name - $mapping - $type - $test";
+
+        my $bracketed_f = ($f =~ s/(.)/[$1]/gr);
+        $test = qq[":$c:" =~ /:$bracketed_f:/iu];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
+
+        my @f_chars = ($f =~ / (.) (.) (.?) /x);
+        my $every_other_bracketed_f = "[$f_chars[0]]$f_chars[1]";
+        $every_other_bracketed_f .= "[$f_chars[2]]" if $f_chars[2];
+        $test = qq[":$c:" =~ /:$every_other_bracketed_f:/iu];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
+
+        my $other_every_bracketed_f = "$f_chars[0]\[$f_chars[1]]";
+        $other_every_bracketed_f .= "$f_chars[2]" if $f_chars[2];
+        $test = qq[":$c:" =~ /:$other_every_bracketed_f:/iu];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
     }
 }