This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
re.t: Avoid encoding issues by using hex chars
[perl5.git] / regcomp.c
index 4b69bf7..fb9c606 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -763,8 +763,9 @@ S_cl_and(struct regnode_charclass_class *cl,
     PERL_ARGS_ASSERT_CL_AND;
 
     assert(and_with->type == ANYOF);
-    if (!(and_with->flags & ANYOF_CLASS)
-       && !(cl->flags & ANYOF_CLASS)
+
+    if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+       && !(ANYOF_CLASS_TEST_ANY_SET(cl))
        && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
        && !(and_with->flags & ANYOF_FOLD)
        && !(cl->flags & ANYOF_FOLD)) {
@@ -837,7 +838,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
            /* OR char bitmap and class bitmap separately */
            for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
                cl->bitmap[i] |= or_with->bitmap[i];
-           if (or_with->flags & ANYOF_CLASS) {
+           if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
                for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
                    cl->classflags[i] |= or_with->classflags[i];
                cl->flags |= ANYOF_CLASS;
@@ -1156,7 +1157,7 @@ the silent ignoring of duplicate alternations which are of the form:
 
  / (DUPE|DUPE) X? (?{ ... }) Y /x
 
-Thus EVAL blocks follwing a trie may be called a different number of times with
+Thus EVAL blocks following a trie may be called a different number of times with
 and without the optimisation. With the optimisations dupes will be silently
 ignored. This inconsistant behaviour of EVAL type nodes is well established as
 the following demonstrates:
@@ -1358,13 +1359,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     regnode *convert = NULL;
     U32 *prev_states; /* temp array mapping each state to previous one */
     /* we just use folder as a flag in utf8 */
-    const U8 * const folder = ( flags == EXACTF
-                       ? PL_fold
-                       : ( flags == EXACTFL
-                           ? PL_fold_locale
-                           : NULL
-                         )
-                     );
+    const U8 * folder = NULL;
 
 #ifdef DEBUGGING
     const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
@@ -1384,6 +1379,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     PERL_UNUSED_ARG(depth);
 #endif
 
+    switch (flags) {
+       case EXACTFU: folder = PL_fold_latin1; break;
+       case EXACTF:  folder = PL_fold; break;
+       case EXACTFL: folder = PL_fold_locale; break;
+    }
+
     trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
     trie->refcount = 1;
     trie->startstate = 1;
@@ -1484,12 +1485,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
-                          codepoints in the 127 < uvc < 256 range */
-                       if (127 < uvc && uvc < 192) {
-                           TRIE_BITMAP_SET(trie,194);
-                       } else if (191 < uvc ) {
-                           TRIE_BITMAP_SET(trie,195);
-                       /* && uvc < 256 -- we know uvc is < 256 already */
+                          variant codepoints */
+                       if (! UNI_IS_INVARIANT(uvc)) {
+                           TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                        }
                    }
                     set_bit = 0; /* We've done our bit :-) */
@@ -2451,6 +2449,10 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
        }
 #endif
     }
+#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
+#define IOTA_D_T       GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
+#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS    0x03B0
+#define UPSILON_D_T    GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
 
     if (UTF
        && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
@@ -3073,11 +3075,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
 
+
+               /* If compatibile, we or it in below.  It is compatible if is
+                * in the bitmp and either 1) its bit or its fold is set, or 2)
+                * it's for a locale.  Even if there isn't unicode semantics
+                * here, at runtime there may be because of matching against a
+                * utf8 string, so accept a possible false positive for
+                * latin1-range folds */
                if (uc >= 0x100 ||
                    (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                    && !ANYOF_BITMAP_TEST(data->start_class, uc)
                    && (!(data->start_class->flags & ANYOF_FOLD)
-                       || !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
+                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                     )
                    compat = 0;
                ANYOF_CLASS_ZERO(data->start_class);
@@ -3119,12 +3128,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            if (flags & SCF_DO_STCLASS_AND) {
                /* Check whether it is compatible with what we know already! */
                int compat = 1;
-
                if (uc >= 0x100 ||
-                   (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
-                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                    && !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
+                (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+                 && !ANYOF_BITMAP_TEST(data->start_class, uc)
+                 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
+               {
                    compat = 0;
+               }
                ANYOF_CLASS_ZERO(data->start_class);
                ANYOF_BITMAP_ZERO(data->start_class);
                if (compat) {
@@ -3136,13 +3146,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    }
                    else {
 
-                       /* Also set the other member of the fold pair.  Can't
-                        * do this for locale, because not known until runtime
-                        */
-                       ANYOF_BITMAP_SET(data->start_class,
-                                        (OP(scan) == EXACTFU)
-                                                   ? PL_fold_latin1[uc]
-                                                   : PL_fold[uc]);
+                       /* Also set the other member of the fold pair.  In case
+                        * that unicode semantics is called for at runtime, use
+                        * the full latin1 fold.  (Can't do this for locale,
+                        * because not known until runtime */
+                       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
                    }
                }
            }
@@ -3158,9 +3166,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                              * can't do that in locale because not known until
                              * run-time */
                             ANYOF_BITMAP_SET(data->start_class,
-                                            (OP(scan) == EXACTFU)
-                                                        ? PL_fold_latin1[uc]
-                                                        : PL_fold[uc]);
+                                            PL_fold_latin1[uc]);
                         }
                    }
                    data->start_class->flags &= ~ANYOF_EOS;
@@ -3567,7 +3573,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
        }
        else if (OP(scan) == FOLDCHAR) {
-           int d = ARG(scan)==0xDF ? 1 : 2;
+           int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
            flags &= ~SCF_DO_STCLASS;
             min += 1;
             delta += d;
@@ -3604,8 +3610,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        goto do_default;
                    if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
                        value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
-                                || ((data->start_class->flags & ANYOF_CLASS)
-                                     && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+                                || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
                        cl_anything(pRExC_state, data->start_class);
                    }
                    if (flags & SCF_DO_STCLASS_AND || !value)
@@ -4375,7 +4380,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
 #endif
 
 REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
 {
     dVAR;
     REGEXP *rx;
@@ -4387,12 +4392,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     regnode *scan;
     I32 flags;
     I32 minlen = 0;
+    U32 pm_flags;
 
     /* these are all flags - maybe they should be turned
      * into a single int with different bit masks */
     I32 sawlookahead = 0;
     I32 sawplus = 0;
     I32 sawopen = 0;
+    bool used_setjump = FALSE;
 
     U8 jump_ret = 0;
     dJMPENV;
@@ -4411,15 +4418,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
 
     RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
 
-
+    /****************** LONG JUMP TARGET HERE***********************/
     /* Longjmp back to here if have to switch in midstream to utf8 */
     if (! RExC_orig_utf8) {
        JMPENV_PUSH(jump_ret);
+       used_setjump = TRUE;
     }
 
     if (jump_ret == 0) {    /* First time through */
-        exp = SvPV(pattern, plen);
-        xend = exp + plen;
+       exp = SvPV(pattern, plen);
+       xend = exp + plen;
+       /* ignore the utf8ness if the pattern is 0 length */
+       if (plen == 0) {
+           RExC_utf8 = RExC_orig_utf8 = 0;
+       }
 
         DEBUG_COMPILE_r({
             SV *dsv= sv_newmortal();
@@ -4461,6 +4473,13 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
     restudied = 0;
 #endif
 
+    /* Set to use unicode semantics if the pattern is in utf8 and has the
+     * 'dual' charset specified, as it means unicode when utf8  */
+    pm_flags = orig_pm_flags;
+    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
+       pm_flags |= RXf_PMf_UNICODE;
+    }
+
     RExC_precomp = exp;
     RExC_flags = pm_flags;
     RExC_sawback = 0;
@@ -4500,12 +4519,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
        return(NULL);
     }
 
-    /* Here, finished first pass.  Get rid of our setjmp, which we added for
-     * efficiency only if the passed-in string wasn't in utf8, as shown by
-     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
-     * 1 here even though the original string wasn't utf8, but in this case
-     * there will have been a long jump */
-    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+    /* Here, finished first pass.  Get rid of any added setjmp */
+    if (used_setjump) {
        JMPENV_POP;
     }
     DEBUG_PARSE_r({
@@ -5839,9 +5854,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         SvREFCNT_inc_simple_void(sv_dat);
                     }
                     RExC_sawback = 1;
-                    ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                   ret = reganode(pRExC_state,
+                                  ((! FOLD)
+                                    ? NREF
+                                    : (UNI_SEMANTICS)
+                                      ? NREFFU
+                                      : (LOC)
+                                        ? NREFFL
+                                        : NREFF),
+                                   num);
                     *flagp |= HASWIDTH;
 
                     Set_Node_Offset(ret, parse_start+1);
@@ -6268,7 +6289,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                       that follow */
                 has_use_defaults = TRUE;
                 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+               if (RExC_utf8) {    /* But the default for a utf8 pattern is
+                                      unicode semantics */
+                   RExC_flags |= RXf_PMf_UNICODE;
+               }
                 goto parse_flags;
            default:
                --RExC_parse;
@@ -6307,7 +6331,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                         {
                             goto fail_modifiers;
                         }
-                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+                       /* The dual charset means unicode semantics if the
+                        * pattern (or target, not known until runtime) are
+                        * utf8 */
+                       if (RExC_utf8) {
+                           posflags |= RXf_PMf_UNICODE;
+                           negflags |= RXf_PMf_LOCALE;
+                       }
+                       else {
+                           negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                       }
                         has_charset_modifier = 1;
                         break;
                     case ONCE_PAT_MOD: /* 'o' */
@@ -7273,9 +7307,13 @@ tryagain:
        RExC_parse++;
        vFAIL("Quantifier follows nothing");
        break;
-    case 0xDF:
-    case 0xC3:
-    case 0xCE:
+    case LATIN_SMALL_LETTER_SHARP_S:
+    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
+#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
+#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
+    case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
+#endif
         do_foldchar:
         if (!LOC && FOLD) {
             U32 len,cp;
@@ -7304,9 +7342,9 @@ tryagain:
           literal text handling code.
        */
        switch ((U8)*++RExC_parse) {
-       case 0xDF:
-       case 0xC3:
-       case 0xCE:
+       case LATIN_SMALL_LETTER_SHARP_S:
+       case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+       case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                   goto do_foldchar;        
        /* Special Escapes */
        case 'A':
@@ -7407,11 +7445,19 @@ tryagain:
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'd':
-           ret = reg_node(pRExC_state, DIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(DIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(DIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'D':
-           ret = reg_node(pRExC_state, NDIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NDIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NDIGIT));
+            }
            *flagp |= HASWIDTH|SIMPLE;
            goto finish_meta_pat;
        case 'R':
@@ -7508,8 +7554,14 @@ tryagain:
 
                 RExC_sawback = 1;
                 ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                               ((! FOLD)
+                                 ? NREF
+                                 : (UNI_SEMANTICS)
+                                   ? NREFFU
+                                   : (LOC)
+                                     ? NREFFL
+                                     : NREFF),
+                                num);
                 *flagp |= HASWIDTH;
 
                 /* override incorrect value set in reganode MJD */
@@ -7570,8 +7622,14 @@ tryagain:
                    }
                    RExC_sawback = 1;
                    ret = reganode(pRExC_state,
-                                  (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
-                                  num);
+                                  ((! FOLD)
+                                    ? REF
+                                    : (UNI_SEMANTICS)
+                                      ? REFFU
+                                      : (LOC)
+                                        ? REFFL
+                                        : REFF),
+                                   num);
                    *flagp |= HASWIDTH;
 
                     /* override incorrect value set in reganode MJD */
@@ -7634,9 +7692,9 @@ tryagain:
                if (RExC_flags & RXf_PMf_EXTENDED)
                    p = regwhite( pRExC_state, p );
                switch ((U8)*p) {
-               case 0xDF:
-               case 0xC3:
-               case 0xCE:
+               case LATIN_SMALL_LETTER_SHARP_S:
+               case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+               case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                goto normal_default;
                case '^':
@@ -7663,9 +7721,9 @@ tryagain:
 
                    switch ((U8)*++p) {
                    /* These are all the special escapes. */
-                   case 0xDF:
-                   case 0xC3:
-                   case 0xCE:
+                    case LATIN_SMALL_LETTER_SHARP_S:
+                    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+                    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                           if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                goto normal_default;                
                    case 'A':             /* Start assertion */
@@ -8121,14 +8179,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
-               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -8143,13 +8201,13 @@ ANYOF_##NAME:                                           \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (TEST_8) stored +=                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
             if (TEST_7) stored +=                       \
-                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
         }                                               \
     }                                                   \
     yesno = '+';                                        \
@@ -8160,13 +8218,16 @@ case ANYOF_N##NAME:                                     \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
             if (! TEST_8) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
             if (! TEST_7) stored +=                     \
-                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
+        }                                               \
+        for (value = 128; value < 256; value++) {         \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
         }                                               \
     }                                                   \
     yesno = '!';                                        \
@@ -8239,7 +8300,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -8283,12 +8344,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
-    bool optimize_invert   = TRUE;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
 #endif
-    UV stored = 0;  /* 0, 1, or more than 1 chars stored in the class */
+    UV stored = 0;  /* how many chars stored in the bitmap */
 
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
@@ -8317,14 +8377,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
 
     if (SIZE_ONLY) {
        RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+       if (LOC) {
+           RExC_size += ANYOF_ADD_LOC_SKIP;
+       }
+#endif
        listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
     }
     else {
        RExC_emit += ANYOF_SKIP;
-       if (FOLD)
-           ANYOF_FLAGS(ret) |= ANYOF_FOLD;
-       if (LOC)
+       if (LOC) {
            ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+           RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+       }
        ANYOF_BITMAP_ZERO(ret);
        listsv = newSVpvs("# comment\n");
     }
@@ -8538,10 +8605,14 @@ parseit:
            if (LOC && namedclass < ANYOF_MAX && ! need_class) {
                need_class = 1;
                if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
                }
                else {
+#ifdef ANYOF_CLASS_ADD_SKIP
                    RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
                    ANYOF_CLASS_ZERO(ret);
                }
                ANYOF_FLAGS(ret) |= ANYOF_CLASS;
@@ -8559,14 +8630,14 @@ parseit:
 
                    if (prevvalue < 256) {
                        stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
                        stored +=
                          S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                        Perl_sv_catpvf(aTHX_ listsv,
-                                      "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+                          "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
                    }
                }
 
@@ -8579,8 +8650,6 @@ parseit:
                const char *what = NULL;
                char yesno = 0;
 
-               if (namedclass > OOB_NAMEDCLASS)
-                   optimize_invert = FALSE;
                /* Possible truncation here but in some 64-bit environments
                 * the compiler gets heartburn about switch on 64-bit values.
                 * A similar issue a little earlier when switching on value.
@@ -8613,34 +8682,21 @@ parseit:
                    if (LOC)
                        ANYOF_CLASS_SET(ret, ANYOF_ASCII);
                    else {
-#ifndef EBCDIC
                        for (value = 0; value < 128; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '+';
-                   what = "ASCII";
+                   what = NULL;        /* Doesn't match outside ascii, so
+                                          don't want to add +utf8:: */
                    break;
                case ANYOF_NASCII:
                    if (LOC)
                        ANYOF_CLASS_SET(ret, ANYOF_NASCII);
                    else {
-#ifndef EBCDIC
                        for (value = 128; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (!isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
                    }
                    yesno = '!';
                    what = "ASCII";
@@ -8652,7 +8708,7 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8664,10 +8720,10 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                        for (value = '9' + 1; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8682,14 +8738,9 @@ parseit:
                if (what) {
                    /* Strings such as "+utf8::isWord\n" */
                    Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
-               }
-               stored+=2; /* can't optimize this class */
-
-               /* All but ASCII can match Unicode characters, but all the ones
-                * that aren't in utf8 are in the bitmap */
-               if (namedclass != ANYOF_ASCII) {
                    ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                }
+
                continue;
            }
        } /* end of namedclass \blah */
@@ -8727,7 +8778,6 @@ parseit:
        }
 
        /* now is the next time */
-        /*stored += (value - prevvalue + 1);*/
        if (!SIZE_ONLY) {
            if (prevvalue < 256) {
                const IV ceilvalue = value < 256 ? value : 255;
@@ -8743,29 +8793,25 @@ parseit:
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                            }
                    }
                }
                else
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
-                       if (!ANYOF_BITMAP_TEST(ret,i)) {
-                           stored +=
-                                S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
-                       }
+                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
                      }
          }
          if (value > 255 || UTF) {
                const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
                const UV natvalue      = NATIVE_TO_UNI(value);
-                stored+=2; /* can't optimize this class */
 
                /* If the code point requires utf8 to represent, and we are not
                 * folding, it can't match unless the target is in utf8.  Only
@@ -8863,52 +8909,107 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
-    if( stored == 1 && (value < 128 || (value < 256 && !UTF))
-        && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
-    ) {
-       /* optimize single char class to an EXACT node but *only* when its not
-        * a UTF/high char.  Note that the information needed to decide to do
-        * this optimization is not currently available until the 2nd pass, and
-        * that the actually used EXACT node takes less space than the
-        * calculated ANYOF node, and hence the amount of space calculated in
-         * the first pass is larger than actually used.  Currently we don't
-         * keep track of enough information to do this for nodes which contain
-         * matches outside the bitmap */
+    /* Folding in the bitmap is taken care of above, but not for locale, for
+     * which we have to wait to see what folding is in effect at runtime, and
+     * for things not in the bitmap */
+    if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
+        ANYOF_FLAGS(ret) |= ANYOF_FOLD;
+    }
+
+    /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that this doesn't
+     * optimize locale.  Doing so perhaps could be done as long as there is
+     * nothing like \w in it; some thought also would have to be given to the
+     * interaction with above 0x100 chars */
+    if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
+           ANYOF_BITMAP(ret)[value] ^= 0xFF;
+       stored = 256 - stored;
+
+       /* The inversion means that everything above 255 is matched */
+       ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+    }
+
+    /* A single character class can be "optimized" into an EXACTish node.
+     * Note that since we don't currently count how many characters there are
+     * outside the bitmap, we are XXX missing optimization possibilities for
+     * them.  This optimization can't happen unless this is a truly single
+     * character class, which means that it can't be an inversion into a
+     * many-character class, and there must be no possibility of there being
+     * things outside the bitmap.  'stored' (only) for locales doesn't include
+     * \w, etc, so have to make a special test that they aren't present
+     *
+     * Similarly A 2-character class of the very special form like [bB] can be
+     * optimized into an EXACTFish node, but only for non-locales, and for
+     * characters which only have the two folds; so things like 'fF' and 'Ii'
+     * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+     * FI'. */
+    if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+        && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                              || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+           || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+                                && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+                                /* If the latest code point has a fold whose
+                                 * bit is set, it must be the only other one */
+                               && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+                                && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+    {
+        /* Note that the information needed to decide to do this optimization
+         * is not currently available until the 2nd pass, and that the actually
+        * used EXACTish node takes less space than the calculated ANYOF node,
+        * and hence the amount of space calculated in the first pass is larger
+         * than actually used, so this optimization doesn't gain us any space.
+        * But an EXACT node is faster than an ANYOF node, and can be combined
+        * with any adjacent EXACT nodes later by the optimizer for further
+        * gains.  The speed of executing an EXACTF is similar to an ANYOF
+        * node, so the optimization advantage comes from the ability to join
+        * it to adjacent EXACT nodes */
+
         const char * cur_parse= RExC_parse;
+       U8 op;
         RExC_emit = (regnode *)orig_emit;
         RExC_parse = (char *)orig_parse;
-        ret = reg_node(pRExC_state,
-                       (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
-        RExC_parse = (char *)cur_parse;
-        *STRING(ret)= (char)value;
-        STR_LEN(ret)= 1;
-        RExC_emit += STR_SZ(1);
-       SvREFCNT_dec(listsv);
-        return ret;
-    }
-    /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
-    if ( /* If the only flag is folding (plus possibly inversion). */
-       ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
-       ) {
-       for (value = 0; value < 256; ++value) {
-           if (ANYOF_BITMAP_TEST(ret, value)) {
-               UV fold = PL_fold[value];
 
-               if (fold != value)
-                   ANYOF_BITMAP_SET(ret, fold);
+       if (stored == 1) {
+
+           /* A locale node with one point can be folded; all the other cases
+            * with folding will have two points, since we calculate them above
+            */
+           if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
+                op = EXACTFL;
            }
+           else {
+               op = EXACT;
+           }
+       }   /* else 2 chars in the bit map: the folds of each other */
+       else if (UNI_SEMANTICS || !isASCII(value)) {
+
+           /* To join adjacent nodes, they must be the exact EXACTish type.
+            * Try to use the most likely type, by using EXACTFU if the regex
+            * calls for them, or is required because the character is
+            * non-ASCII */
+           op = EXACTFU;
+       }
+       else {    /* Otherwise, more likely to be EXACTF type */
+           op = EXACTF;
        }
-       ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
-    }
 
-    /* optimize inverted simple patterns (e.g. [^a-z]) */
-    if (optimize_invert &&
-       /* If the only flag is inversion. */
-       (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
-       for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
-           ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
-       ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+       ret = reg_node(pRExC_state, op);
+        RExC_parse = (char *)cur_parse;
+       if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+           *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+           *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+           STR_LEN(ret)= 2;
+           RExC_emit += STR_SZ(2);
+       }
+       else {
+           *STRING(ret)= (char)value;
+           STR_LEN(ret)= 1;
+           RExC_emit += STR_SZ(1);
+       }
+       SvREFCNT_dec(listsv);
+        return ret;
     }
+
     {
        AV * const av = newAV();
        SV *rv;
@@ -9571,7 +9672,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
     else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
        Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
        if ( RXp_PAREN_NAMES(prog) ) {
-            if ( k != REF || OP(o) < NREF) {       
+            if ( k != REF || (OP(o) < NREF)) {
                AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
                SV **name= av_fetch(list, ARG(o), 0 );
                if (name)
@@ -9669,8 +9770,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
        }
         
         EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
-        /* output any special charclass tests (used mostly under use locale) */
-       if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+        /* output any special charclass tests (used entirely under use locale) */
+       if (ANYOF_CLASS_TEST_ANY_SET(o))
            for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
                if (ANYOF_CLASS_TEST(o,i)) {
                    sv_catpv(sv, anyofs[i]);