This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re.t: Avoid encoding issues by using hex chars
[perl5.git] / regcomp.c
index b970ab8..fb9c606 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -763,8 +763,9 @@ S_cl_and(struct regnode_charclass_class *cl,
     PERL_ARGS_ASSERT_CL_AND;
 
     assert(and_with->type == ANYOF);
-    if (!(and_with->flags & ANYOF_CLASS)
-       && !(cl->flags & ANYOF_CLASS)
+
+    if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+       && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
        && !(and_with->flags & ANYOF_FOLD)
        && !(cl->flags & ANYOF_FOLD)) {
@@ -837,7 +838,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
            /* OR char bitmap and class bitmap separately */
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] |= or_with->bitmap[i];
-           if (or_with->flags & ANYOF_CLASS) {
+           if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
                for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
                    cl->classflags[i] |= or_with->classflags[i];
                cl->flags |= ANYOF_CLASS;
@@ -1358,13 +1359,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     regnode *convert = NULL;
     U32 *prev_states; /* temp array mapping each state to previous one */
     /* we just use folder as a flag in utf8 */
-    const U8 * const folder = ( flags == EXACTF
-                       ? PL_fold
-                       : ( flags == EXACTFL
-                           ? PL_fold_locale
-                           : NULL
-                         )
-                     );
+    const U8 * folder = NULL;
 
 #ifdef DEBUGGING
     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
@@ -1384,6 +1379,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     PERL_UNUSED_ARG(depth);
 #endif
 
+    switch (flags) {
+       case EXACTFU: folder = PL_fold_latin1; break;
+       case EXACTF:  folder = PL_fold; break;
+       case EXACTFL: folder = PL_fold_locale; break;
+    }
+
     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
     trie->refcount = 1;
     trie->startstate = 1;
@@ -1484,12 +1485,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
-                          codepoints in the 127 < uvc < 256 range */
-                       if (127 < uvc && uvc < 192) {
-                           TRIE_BITMAP_SET(trie,194);
-                       } else if (191 < uvc ) {
-                           TRIE_BITMAP_SET(trie,195);
-                       /* && uvc < 256 -- we know uvc is < 256 already */
+                          variant codepoints */
+                       if (! UNI_IS_INVARIANT(uvc)) {
+                           TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                        }
                    }
                     set_bit = 0; /* We've done our bit :-) */
@@ -2451,6 +2449,10 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
        }
 #endif
     }
+#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
+#define IOTA_D_T       GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
+#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS    0x03B0
+#define UPSILON_D_T    GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
 
     if (UTF
        && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
@@ -3073,11 +3075,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
 
+
+               /* If compatible, we or it in below.  It is compatible if it is
+                * in the bitmap and either 1) its bit or its fold is set, or 2)
+                * it's for a locale.  Even if there isn't unicode semantics
+                * here, at runtime there may be because of matching against a
+                * utf8 string, so accept a possible false positive for
+                * latin1-range folds */
                if (uc >= 0x100 ||
                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
                    && (!(data->start_class->flags & ANYOF_FOLD)
-                       || !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
+                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                     )
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
@@ -3119,12 +3128,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            if (flags & SCF_DO_STCLASS_AND) {
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
-
                if (uc >= 0x100 ||
-                   (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
-                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                    && !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
+                (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+                 && !ANYOF_BITMAP_TEST(data->start_class, uc)
+                 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
+               {
                    compat = 0;
+               }
                ANYOF_CLASS_ZERO(data->start_class);
                ANYOF_BITMAP_ZERO(data->start_class);
                if (compat) {
@@ -3136,13 +3146,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    }
                    else {
 
-                       /* Also set the other member of the fold pair.  Can't
-                        * do this for locale, because not known until runtime
-                        */
-                       ANYOF_BITMAP_SET(data->start_class,
-                                        (OP(scan) == EXACTFU)
-                                                   ? PL_fold_latin1[uc]
-                                                   : PL_fold[uc]);
+                       /* Also set the other member of the fold pair.  In case
+                        * that unicode semantics is called for at runtime, use
+                        * the full latin1 fold.  (Can't do this for locale,
+                        * because not known until runtime) */
+                       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
                    }
                }
            }
@@ -3158,9 +3166,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              * can't do that in locale because not known until
                              * run-time */
                             ANYOF_BITMAP_SET(data->start_class,
-                                            (OP(scan) == EXACTFU)
-                                                        ? PL_fold_latin1[uc]
-                                                        : PL_fold[uc]);
+                                            PL_fold_latin1[uc]);
                         }
                    }
                    data->start_class->flags &= ~ANYOF_EOS;
@@ -3567,7 +3573,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
        }
        else if (OP(scan) == FOLDCHAR) {
-           int d = ARG(scan)==0xDF ? 1 : 2;
+           int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
            flags &= ~SCF_DO_STCLASS;
             min += 1;
             delta += d;
@@ -3604,8 +3610,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        goto do_default;
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ((data->start_class->flags & ANYOF_CLASS)
-                                     && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+                                || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                        cl_anything(pRExC_state, data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
@@ -4375,7 +4380,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
 #endif
 
 REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
 {
     dVAR;
     REGEXP *rx;
@@ -4387,12 +4392,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
+    U32 pm_flags;
 
     /* these are all flags - maybe they should be turned
      * into a single int with different bit masks */
     I32 sawlookahead = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
+    bool used_setjump = FALSE;
 
     U8 jump_ret = 0;
     dJMPENV;
@@ -4411,15 +4418,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
-
+    /****************** LONG JUMP TARGET HERE***********************/
     /* Longjmp back to here if have to switch in midstream to utf8 */
     if (! RExC_orig_utf8) {
        JMPENV_PUSH(jump_ret);
+       used_setjump = TRUE;
     }
 
     if (jump_ret == 0) {    /* First time through */
-        exp = SvPV(pattern, plen);
-        xend = exp + plen;
+       exp = SvPV(pattern, plen);
+       xend = exp + plen;
+       /* ignore the utf8ness if the pattern is 0 length */
+       if (plen == 0) {
+           RExC_utf8 = RExC_orig_utf8 = 0;
+       }
 
         DEBUG_COMPILE_r({
             SV *dsv= sv_newmortal();
@@ -4461,6 +4473,13 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     restudied = 0;
 #endif
 
+    /* Set to use unicode semantics if the pattern is in utf8 and has the
+     * 'dual' charset specified, as it means unicode when utf8  */
+    pm_flags = orig_pm_flags;
+    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
+       pm_flags |= RXf_PMf_UNICODE;
+    }
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -4500,12 +4519,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
        return(NULL);
     }
 
-    /* Here, finished first pass.  Get rid of our setjmp, which we added for
-     * efficiency only if the passed-in string wasn't in utf8, as shown by
-     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
-     * 1 here even though the original string wasn't utf8, but in this case
-     * there will have been a long jump */
-    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+    /* Here, finished first pass.  Get rid of any added setjmp */
+    if (used_setjump) {
        JMPENV_POP;
     }
     DEBUG_PARSE_r({
@@ -5839,9 +5854,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         SvREFCNT_inc_simple_void(sv_dat);
                     }
                     RExC_sawback = 1;
-                    ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                   ret = reganode(pRExC_state,
+                                  ((! FOLD)
+                                    ? NREF
+                                    : (UNI_SEMANTICS)
+                                      ? NREFFU
+                                      : (LOC)
+                                        ? NREFFL
+                                        : NREFF),
+                                   num);
                     *flagp |= HASWIDTH;
 
                     Set_Node_Offset(ret, parse_start+1);
@@ -6268,6 +6289,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                       that follow */
                 has_use_defaults = TRUE;
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
+               if (RExC_utf8) {    /* But the default for a utf8 pattern is
+                                      unicode semantics */
+                   RExC_flags |= RXf_PMf_UNICODE;
+               }
                 goto parse_flags;
            default:
                --RExC_parse;
@@ -6306,7 +6331,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         {
                             goto fail_modifiers;
                         }
-                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+                       /* The dual charset means unicode semantics if the
+                        * pattern (or target, not known until runtime) is
+                        * utf8 */
+                       if (RExC_utf8) {
+                           posflags |= RXf_PMf_UNICODE;
+                           negflags |= RXf_PMf_LOCALE;
+                       }
+                       else {
+                           negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                       }
                         has_charset_modifier = 1;
                         break;
                     case ONCE_PAT_MOD: /* 'o' */
@@ -7272,9 +7307,13 @@ tryagain:
        RExC_parse++;
        vFAIL("Quantifier follows nothing");
        break;
-    case 0xDF:
-    case 0xC3:
-    case 0xCE:
+    case LATIN_SMALL_LETTER_SHARP_S:
+    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
+#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
+#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
+    case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
+#endif
         do_foldchar:
         if (!LOC && FOLD) {
             U32 len,cp;
@@ -7303,9 +7342,9 @@ tryagain:
           literal text handling code.
        */
        switch ((U8)*++RExC_parse) {
-       case 0xDF:
-       case 0xC3:
-       case 0xCE:
+       case LATIN_SMALL_LETTER_SHARP_S:
+       case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+       case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                   goto do_foldchar;        
        /* Special Escapes */
        case 'A':
@@ -7406,11 +7445,19 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
-           ret = reg_node(pRExC_state, DIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(DIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(DIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'D':
-           ret = reg_node(pRExC_state, NDIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NDIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NDIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'R':
@@ -7507,8 +7554,14 @@ tryagain:
 
                 RExC_sawback = 1;
                 ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                               ((! FOLD)
+                                 ? NREF
+                                 : (UNI_SEMANTICS)
+                                   ? NREFFU
+                                   : (LOC)
+                                     ? NREFFL
+                                     : NREFF),
+                                num);
                 *flagp |= HASWIDTH;
 
                 /* override incorrect value set in reganode MJD */
@@ -7569,8 +7622,14 @@ tryagain:
                    }
                    RExC_sawback = 1;
                    ret = reganode(pRExC_state,
-                                  (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
-                                  num);
+                                  ((! FOLD)
+                                    ? REF
+                                    : (UNI_SEMANTICS)
+                                      ? REFFU
+                                      : (LOC)
+                                        ? REFFL
+                                        : REFF),
+                                   num);
                    *flagp |= HASWIDTH;
 
                     /* override incorrect value set in reganode MJD */
@@ -7633,9 +7692,9 @@ tryagain:
                if (RExC_flags & RXf_PMf_EXTENDED)
                    p = regwhite( pRExC_state, p );
                switch ((U8)*p) {
-               case 0xDF:
-               case 0xC3:
-               case 0xCE:
+               case LATIN_SMALL_LETTER_SHARP_S:
+               case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+               case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                goto normal_default;
                case '^':
@@ -7662,9 +7721,9 @@ tryagain:
 
                    switch ((U8)*++p) {
                    /* These are all the special escapes. */
-                   case 0xDF:
-                   case 0xC3:
-                   case 0xCE:
+                    case LATIN_SMALL_LETTER_SHARP_S:
+                    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+                    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                goto normal_default;                
                    case 'A':             /* Start assertion */
@@ -8120,14 +8179,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -8142,13 +8201,13 @@ ANYOF_##NAME:                                           \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (TEST_8) stored +=                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
             if (TEST_7) stored +=                       \
-                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
         }                                               \
     }                                                   \
     yesno = '+';                                        \
@@ -8159,13 +8218,16 @@ case ANYOF_N##NAME:                                     \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (! TEST_8) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
             if (! TEST_7) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
+        }                                               \
+        for (value = 128; value < 256; value++) {         \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     yesno = '!';                                        \
@@ -8238,7 +8300,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -8282,12 +8344,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
-    bool optimize_invert   = TRUE;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
-    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
+    UV stored = 0;  /* how many chars stored in the bitmap */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
@@ -8316,14 +8377,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
     if (SIZE_ONLY) {
        RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+       if (LOC) {
+           RExC_size += ANYOF_ADD_LOC_SKIP;
+       }
+#endif
        listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
     }
     else {
        RExC_emit += ANYOF_SKIP;
-       if (FOLD)
-           ANYOF_FLAGS(ret) |= ANYOF_FOLD;
-       if (LOC)
+       if (LOC) {
            ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+           RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+       }
        ANYOF_BITMAP_ZERO(ret);
        listsv = newSVpvs("# comment\n");
     }
@@ -8537,10 +8605,14 @@ parseit:
            if (LOC && namedclass < ANYOF_MAX && ! need_class) {
                need_class = 1;
                if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
                }
                else {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
                    ANYOF_CLASS_ZERO(ret);
                }
                ANYOF_FLAGS(ret) |= ANYOF_CLASS;
@@ -8558,14 +8630,14 @@ parseit:
 
                    if (prevvalue < 256) {
                        stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
                        stored +=
                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                        Perl_sv_catpvf(aTHX_ listsv,
-                                      "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+                          "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
                    }
                }
 
@@ -8578,8 +8650,6 @@ parseit:
                const char *what = NULL;
                char yesno = 0;
 
-               if (namedclass > OOB_NAMEDCLASS)
-                   optimize_invert = FALSE;
                /* Possible truncation here but in some 64-bit environments
                 * the compiler gets heartburn about switch on 64-bit values.
                 * A similar issue a little earlier when switching on value.
@@ -8612,34 +8682,21 @@ parseit:
                    if (LOC)
                        ANYOF_CLASS_SET(ret, ANYOF_ASCII);
                    else {
-#ifndef EBCDIC
                        for (value = 0; value < 128; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '+';
-                   what = "ASCII";
+                   what = NULL;        /* Doesn't match outside ascii, so
+                                          don't want to add +utf8:: */
                    break;
                case ANYOF_NASCII:
                    if (LOC)
                        ANYOF_CLASS_SET(ret, ANYOF_NASCII);
                    else {
-#ifndef EBCDIC
                        for (value = 128; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (!isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '!';
                    what = "ASCII";
@@ -8651,7 +8708,7 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8663,10 +8720,10 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                        for (value = '9' + 1; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8681,14 +8738,9 @@ parseit:
                if (what) {
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
-               }
-               stored+=2; /* can't optimize this class */
-
-               /* All but ASCII can match Unicode characters, but all the ones
-                * that aren't in utf8 are in the bitmap */
-               if (namedclass != ANYOF_ASCII) {
                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                }
+
                continue;
            }
        } /* end of namedclass \blah */
@@ -8726,7 +8778,6 @@ parseit:
        }
 
        /* now is the next time */
-        /*stored += (value - prevvalue + 1);*/
        if (!SIZE_ONLY) {
            if (prevvalue < 256) {
                const IV ceilvalue = value < 256 ? value : 255;
@@ -8742,29 +8793,25 @@ parseit:
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    }
                }
                else
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
-                       if (!ANYOF_BITMAP_TEST(ret,i)) {
-                           stored +=
-                                S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
-                       }
+                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                      }
          }
          if (value > 255 || UTF) {
                const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
                const UV natvalue      = NATIVE_TO_UNI(value);
-                stored+=2; /* can't optimize this class */
 
                /* If the code point requires utf8 to represent, and we are not
                 * folding, it can't match unless the target is in utf8.  Only
@@ -8862,52 +8909,107 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
-    if( stored == 1 && (value < 128 || (value < 256 && !UTF))
-        && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
-    ) {
-       /* optimize single char class to an EXACT node but *only* when its not
-        * a UTF/high char.  Note that the information needed to decide to do
-        * this optimization is not currently available until the 2nd pass, and
-        * that the actually used EXACT node takes less space than the
-        * calculated ANYOF node, and hence the amount of space calculated in
-         * the first pass is larger than actually used.  Currently we don't
-         * keep track of enough information to do this for nodes which contain
-         * matches outside the bitmap */
+    /* Folding in the bitmap is taken care of above, but not for locale, for
+     * which we have to wait to see what folding is in effect at runtime, and
+     * for things not in the bitmap */
+    if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
+        ANYOF_FLAGS(ret) |= ANYOF_FOLD;
+    }
+
+    /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
+     * optimize locale.  Doing so perhaps could be done as long as there is
+     * nothing like \w in it; some thought also would have to be given to the
+     * interaction with above 0x100 chars */
+    if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+           ANYOF_BITMAP(ret)[value] ^= 0xFF;
+       stored = 256 - stored;
+
+       /* The inversion means that everything above 255 is matched */
+       ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+    }
+
+    /* A single character class can be "optimized" into an EXACTish node.
+     * Note that since we don't currently count how many characters there are
+     * outside the bitmap, we are XXX missing optimization possibilities for
+     * them.  This optimization can't happen unless this is a truly single
+     * character class, which means that it can't be an inversion into a
+     * many-character class, and there must be no possibility of there being
+     * things outside the bitmap.  For locales (only), 'stored' doesn't include
+     * \w, etc., so we have to make a special test that they aren't present.
+     *
+     * Similarly, a 2-character class of the very special form like [bB] can be
+     * optimized into an EXACTFish node, but only for non-locales, and for
+     * characters which only have the two folds; so things like 'fF' and 'Ii'
+     * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+     * FI'. */
+    if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+        && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                              || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+           || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                                && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+                                /* If the latest code point has a fold whose
+                                 * bit is set, it must be the only other one */
+                               && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+                                && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+    {
+        /* Note that the information needed to decide to do this optimization
+         * is not currently available until the 2nd pass, and that the actually
+        * used EXACTish node takes less space than the calculated ANYOF node,
+        * and hence the amount of space calculated in the first pass is larger
+         * than actually used, so this optimization doesn't gain us any space.
+        * But an EXACT node is faster than an ANYOF node, and can be combined
+        * with any adjacent EXACT nodes later by the optimizer for further
+        * gains.  The speed of executing an EXACTF is similar to an ANYOF
+        * node, so the optimization advantage comes from the ability to join
+        * it to adjacent EXACT nodes */
+
         const char * cur_parse= RExC_parse;
+       U8 op;
         RExC_emit = (regnode *)orig_emit;
         RExC_parse = (char *)orig_parse;
-        ret = reg_node(pRExC_state,
-                       (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
-        RExC_parse = (char *)cur_parse;
-        *STRING(ret)= (char)value;
-        STR_LEN(ret)= 1;
-        RExC_emit += STR_SZ(1);
-       SvREFCNT_dec(listsv);
-        return ret;
-    }
-    /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
-    if ( /* If the only flag is folding (plus possibly inversion). */
-       ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
-       ) {
-       for (value = 0; value < 256; ++value) {
-           if (ANYOF_BITMAP_TEST(ret, value)) {
-               UV fold = PL_fold[value];
 
-               if (fold != value)
-                   ANYOF_BITMAP_SET(ret, fold);
+       if (stored == 1) {
+
+           /* A locale node with one point can be folded; all the other cases
+            * with folding will have two points, since we calculate them above
+            */
+           if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
+                op = EXACTFL;
            }
+           else {
+               op = EXACT;
+           }
+       }   /* else 2 chars in the bit map: the folds of each other */
+       else if (UNI_SEMANTICS || !isASCII(value)) {
+
+           /* To join adjacent nodes, they must be the exact EXACTish type.
+            * Try to use the most likely type, by using EXACTFU if the regex
+            * calls for Unicode semantics, or if it is required because the
+            * character is non-ASCII */
+           op = EXACTFU;
+       }
+       else {    /* Otherwise, more likely to be EXACTF type */
+           op = EXACTF;
        }
-       ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
-    }
 
-    /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (optimize_invert &&
-       /* If the only flag is inversion. */
-       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
-       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
-           ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
-       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+       ret = reg_node(pRExC_state, op);
+        RExC_parse = (char *)cur_parse;
+       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+           STR_LEN(ret)= 2;
+           RExC_emit += STR_SZ(2);
+       }
+       else {
+           *STRING(ret)= (char)value;
+           STR_LEN(ret)= 1;
+           RExC_emit += STR_SZ(1);
+       }
+       SvREFCNT_dec(listsv);
+        return ret;
     }
+
     {
        AV * const av = newAV();
        SV *rv;
@@ -9570,7 +9672,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
        Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
        if ( RXp_PAREN_NAMES(prog) ) {
-            if ( k != REF || OP(o) < NREF) {       
+            if ( k != REF || (OP(o) < NREF)) {
                AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
                SV **name= av_fetch(list, ARG(o), 0 );
                if (name)
@@ -9668,8 +9770,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
        }
         
         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
-        /* output any special charclass tests (used mostly under use locale) */
-       if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+        /* output any special charclass tests (used entirely under use locale) */
+       if (ANYOF_CLASS_TEST_ANY_SET(o))
            for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
                if (ANYOF_CLASS_TEST(o,i)) {
                    sv_catpv(sv, anyofs[i]);