perldelta for [perl #72090]

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 07834a0..4fb8c37 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1156,7 +1156,7 @@ the silent ignoring of duplicate alternations which are of the form:
  
   / (DUPE|DUPE) X? (?{ ... }) Y /x
  
-Thus EVAL blocks follwing a trie may be called a different number of times with
+Thus EVAL blocks following a trie may be called a different number of times with
  and without the optimisation. With the optimisations dupes will be silently
  ignored. This inconsistant behaviour of EVAL type nodes is well established as
  the following demonstrates:
@@ -1358,13 +1358,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
      regnode *convert = NULL;
      U32 *prev_states; /* temp array mapping each state to previous one */
      /* we just use folder as a flag in utf8 */
-    const U8 * const folder = ( flags == EXACTF
-                       ? PL_fold
-                       : ( flags == EXACTFL
-                           ? PL_fold_locale
-                           : NULL
-                         )
-                     );
+    const U8 * folder = NULL;
  
  #ifdef DEBUGGING
      const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
@@ -1384,6 +1378,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
      PERL_UNUSED_ARG(depth);
  #endif
  
+    switch (flags) {
+       case EXACTFU: folder = PL_fold_latin1; break;
+       case EXACTF:  folder = PL_fold; break;
+       case EXACTFL: folder = PL_fold_locale; break;
+    }
+
      trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
      trie->refcount = 1;
      trie->startstate = 1;
@@ -1475,8 +1475,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                      TRIE_STORE_REVCHAR;
                  }
                  if ( set_bit ) {
-                    /* store the codepoint in the bitmap, and if its ascii
-                       also store its folded equivelent. */
+                   /* store the codepoint in the bitmap, and its folded
+                    * equivalent. */
                      TRIE_BITMAP_SET(trie,uvc);
  
                     /* store the folded codepoint */
@@ -1484,12 +1484,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
  
                     if ( !UTF ) {
                         /* store first byte of utf8 representation of
-                          codepoints in the 127 < uvc < 256 range */
-                       if (127 < uvc && uvc < 192) {
-                           TRIE_BITMAP_SET(trie,194);
-                       } else if (191 < uvc ) {
-                           TRIE_BITMAP_SET(trie,195);
-                       /* && uvc < 256 -- we know uvc is < 256 already */
+                          variant codepoints */
+                       if (! UNI_IS_INVARIANT(uvc)) {
+                           TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                         }
                     }
                      set_bit = 0; /* We've done our bit :-) */
@@ -2451,8 +2448,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags
         }
  #endif
      }
-    
-    if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
+#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS   0x0390
+#define IOTA_D_T       GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
+#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS    0x03B0
+#define UPSILON_D_T    GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
+
+    if (UTF
+       && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+       && ( STR_LEN(scan) >= 6 ) )
+    {
      /*
      Two problematic code points in Unicode casefolding of EXACT nodes:
      
@@ -3070,11 +3074,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                 /* Check whether it is compatible with what we know already! */
                 int compat = 1;
  
+
+               /* If compatibile, we or it in below.  It is compatible if is
+                * in the bitmp and either 1) its bit or its fold is set, or 2)
+                * it's for a locale.  Even if there isn't unicode semantics
+                * here, at runtime there may be because of matching against a
+                * utf8 string, so accept a possible false positive for
+                * latin1-range folds */
                 if (uc >= 0x100 ||
                     (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
                     && !ANYOF_BITMAP_TEST(data->start_class, uc)
                     && (!(data->start_class->flags & ANYOF_FOLD)
-                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+                       || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
                      )
                     compat = 0;
                 ANYOF_CLASS_ZERO(data->start_class);
@@ -3116,28 +3127,47 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             if (flags & SCF_DO_STCLASS_AND) {
                 /* Check whether it is compatible with what we know already! */
                 int compat = 1;
-
                 if (uc >= 0x100 ||
-                   (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
-                   && !ANYOF_BITMAP_TEST(data->start_class, uc)
-                    && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+                (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+                 && !ANYOF_BITMAP_TEST(data->start_class, uc)
+                 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
+               {
                     compat = 0;
+               }
                 ANYOF_CLASS_ZERO(data->start_class);
                 ANYOF_BITMAP_ZERO(data->start_class);
                 if (compat) {
                     ANYOF_BITMAP_SET(data->start_class, uc);
                     data->start_class->flags &= ~ANYOF_EOS;
                     data->start_class->flags |= ANYOF_FOLD;
-                   if (OP(scan) == EXACTFL)
+                   if (OP(scan) == EXACTFL) {
                         data->start_class->flags |= ANYOF_LOCALE;
+                   }
+                   else {
+
+                       /* Also set the other member of the fold pair.  In case
+                        * that unicode semantics is called for at runtime, use
+                        * the full latin1 fold.  (Can't do this for locale,
+                        * because not known until runtime */
+                       ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
+                   }
                 }
             }
             else if (flags & SCF_DO_STCLASS_OR) {
                 if (data->start_class->flags & ANYOF_FOLD) {
                     /* false positive possible if the class is case-folded.
                        Assume that the locale settings are the same... */
-                   if (uc < 0x100)
+                   if (uc < 0x100) {
                         ANYOF_BITMAP_SET(data->start_class, uc);
+                        if (OP(scan) != EXACTFL) {
+
+                            /* And set the other member of the fold pair, but
+                             * can't do that in locale because not known until
+                             * run-time */
+                            ANYOF_BITMAP_SET(data->start_class,
+                                            PL_fold_latin1[uc]);
+                        }
+                   }
                     data->start_class->flags &= ~ANYOF_EOS;
                 }
                 cl_and(data->start_class, and_withp);
@@ -3221,13 +3251,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                     f |= SCF_DO_STCLASS_AND;
                     f &= ~SCF_DO_STCLASS_OR;
                 }
-               /* These are the cases when once a subexpression
-                  fails at a particular position, it cannot succeed
-                  even after backtracking at the enclosing scope.
-
-                  XXXX what if minimal match and we are at the
-                       initial run of {n,m}? */
-               if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
+               /* Exclude from super-linear cache processing any {n,m}
+                  regops for which the combination of input pos and regex
+                  pos is not enough information to determine if a match
+                  will be possible.
+
+                  For example, in the regex /foo(bar\s*){4,8}baz/ with the
+                  regex pos at the \s*, the prospects for a match depend not
+                  only on the input position but also on how many (bar\s*)
+                  repeats into the {4,8} we are. */
+               if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
                     f &= ~SCF_WHILEM_VISITED_POS;
  
                 /* This will finish on WHILEM, setting scan, or on NULL: */
@@ -3539,7 +3572,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
             }
         }
         else if (OP(scan) == FOLDCHAR) {
-           int d = ARG(scan)==0xDF ? 1 : 2;
+           int d = ARG(scan) == LATIN_SMALL_LETTER_SHARP_S ? 1 : 2;
             flags &= ~SCF_DO_STCLASS;
              min += 1;
              delta += d;
@@ -4347,7 +4380,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
  #endif
  
  REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
+Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
  {
      dVAR;
      REGEXP *rx;
@@ -4359,12 +4392,14 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
      regnode *scan;
      I32 flags;
      I32 minlen = 0;
+    U32 pm_flags;
  
      /* these are all flags - maybe they should be turned
       * into a single int with different bit masks */
      I32 sawlookahead = 0;
      I32 sawplus = 0;
      I32 sawopen = 0;
+    bool used_setjump = FALSE;
  
      U8 jump_ret = 0;
      dJMPENV;
@@ -4383,15 +4418,20 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
  
      RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
  
-
+    /****************** LONG JUMP TARGET HERE***********************/
      /* Longjmp back to here if have to switch in midstream to utf8 */
      if (! RExC_orig_utf8) {
         JMPENV_PUSH(jump_ret);
+       used_setjump = TRUE;
      }
  
      if (jump_ret == 0) {    /* First time through */
-        exp = SvPV(pattern, plen);
-        xend = exp + plen;
+       exp = SvPV(pattern, plen);
+       xend = exp + plen;
+       /* ignore the utf8ness if the pattern is 0 length */
+       if (plen == 0) {
+           RExC_utf8 = RExC_orig_utf8 = 0;
+       }
  
          DEBUG_COMPILE_r({
              SV *dsv= sv_newmortal();
@@ -4433,6 +4473,13 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
      restudied = 0;
  #endif
  
+    /* Set to use unicode semantics if the pattern is in utf8 and has the
+     * 'dual' charset specified, as it means unicode when utf8  */
+    pm_flags = orig_pm_flags;
+    if (RExC_utf8  && ! (pm_flags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE))) {
+       pm_flags |= RXf_PMf_UNICODE;
+    }
+
      RExC_precomp = exp;
      RExC_flags = pm_flags;
      RExC_sawback = 0;
@@ -4472,12 +4519,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags)
         return(NULL);
      }
  
-    /* Here, finished first pass.  Get rid of our setjmp, which we added for
-     * efficiency only if the passed-in string wasn't in utf8, as shown by
-     * RExC_orig_utf8.  But if the first pass was redone, that variable will be
-     * 1 here even though the original string wasn't utf8, but in this case
-     * there will have been a long jump */
-    if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) {
+    /* Here, finished first pass.  Get rid of any added setjmp */
+    if (used_setjump) {
         JMPENV_POP;
      }
      DEBUG_PARSE_r({
@@ -4724,7 +4767,7 @@ reStudy:
         if (PL_regkind[OP(first)] == EXACT) {
             if (OP(first) == EXACT)
                 NOOP;   /* Empty, get anchored substr later. */
-           else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
+           else
                 ri->regstclass = first;
         }
  #ifdef TRIE_STCLASS    
@@ -5811,9 +5854,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          SvREFCNT_inc_simple_void(sv_dat);
                      }
                      RExC_sawback = 1;
-                    ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                   ret = reganode(pRExC_state,
+                                  ((! FOLD)
+                                    ? NREF
+                                    : (UNI_SEMANTICS)
+                                      ? NREFFU
+                                      : (LOC)
+                                        ? NREFFL
+                                        : NREFF),
+                                   num);
                      *flagp |= HASWIDTH;
  
                      Set_Node_Offset(ret, parse_start+1);
@@ -6240,7 +6289,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                                        that follow */
                  has_use_defaults = TRUE;
                  STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
-                RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+               if (RExC_utf8) {    /* But the default for a utf8 pattern is
+                                      unicode semantics */
+                   RExC_flags |= RXf_PMf_UNICODE;
+               }
                  goto parse_flags;
             default:
                 --RExC_parse;
@@ -6279,7 +6331,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                          {
                              goto fail_modifiers;
                          }
-                        negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+
+                       /* The dual charset means unicode semantics if the
+                        * pattern (or target, not known until runtime) are
+                        * utf8 */
+                       if (RExC_utf8) {
+                           posflags |= RXf_PMf_UNICODE;
+                           negflags |= RXf_PMf_LOCALE;
+                       }
+                       else {
+                           negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE);
+                       }
                          has_charset_modifier = 1;
                          break;
                      case ONCE_PAT_MOD: /* 'o' */
@@ -6978,8 +7040,12 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
         char *endchar;      /* Points to '.' or '}' ending cur char in the input
                                stream */
  
-       ret = reg_node(pRExC_state,
-                       (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+       ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
+                                                  : (LOC)
+                                                     ? EXACTFL
+                                                     : UNI_SEMANTICS
+                                                       ? EXACTFU
+                                                       : EXACTF));
         s= STRING(ret);
  
         /* Exact nodes can hold only a U8 length's of text = 255.  Loop through
@@ -7241,9 +7307,13 @@ tryagain:
         RExC_parse++;
         vFAIL("Quantifier follows nothing");
         break;
-    case 0xDF:
-    case 0xC3:
-    case 0xCE:
+    case LATIN_SMALL_LETTER_SHARP_S:
+    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
+#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
+#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ.  Other instances in this code should have the case statement below.
+    case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
+#endif
          do_foldchar:
          if (!LOC && FOLD) {
              U32 len,cp;
@@ -7272,9 +7342,9 @@ tryagain:
            literal text handling code.
         */
         switch ((U8)*++RExC_parse) {
-       case 0xDF:
-       case 0xC3:
-       case 0xCE:
+       case LATIN_SMALL_LETTER_SHARP_S:
+       case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+       case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                    goto do_foldchar;        
         /* Special Escapes */
         case 'A':
@@ -7375,11 +7445,19 @@ tryagain:
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'd':
-           ret = reg_node(pRExC_state, DIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(DIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(DIGIT));
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'D':
-           ret = reg_node(pRExC_state, NDIGIT);
+            if (LOC) {
+                ret = reg_node(pRExC_state, (U8)(NDIGITL));
+            } else {
+                ret = reg_node(pRExC_state, (U8)(NDIGIT));
+            }
             *flagp |= HASWIDTH|SIMPLE;
             goto finish_meta_pat;
         case 'R':
@@ -7476,8 +7554,14 @@ tryagain:
  
                  RExC_sawback = 1;
                  ret = reganode(pRExC_state,
-                          (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
-                          num);
+                               ((! FOLD)
+                                 ? NREF
+                                 : (UNI_SEMANTICS)
+                                   ? NREFFU
+                                   : (LOC)
+                                     ? NREFFL
+                                     : NREFF),
+                                num);
                  *flagp |= HASWIDTH;
  
                  /* override incorrect value set in reganode MJD */
@@ -7538,8 +7622,14 @@ tryagain:
                     }
                     RExC_sawback = 1;
                     ret = reganode(pRExC_state,
-                                  (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
-                                  num);
+                                  ((! FOLD)
+                                    ? REF
+                                    : (UNI_SEMANTICS)
+                                      ? REFFU
+                                      : (LOC)
+                                        ? REFFL
+                                        : REFF),
+                                   num);
                     *flagp |= HASWIDTH;
  
                      /* override incorrect value set in reganode MJD */
@@ -7585,7 +7675,13 @@ tryagain:
         defchar:
             ender = 0;
             ret = reg_node(pRExC_state,
-                          (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+                          (U8) ((! FOLD) ? EXACT
+                                         : (LOC)
+                                            ? EXACTFL
+                                            : (UNI_SEMANTICS)
+                                              ? EXACTFU
+                                              : EXACTF)
+                   );
             s = STRING(ret);
             for (len = 0, p = RExC_parse - 1;
               len < 127 && p < RExC_end;
@@ -7596,9 +7692,9 @@ tryagain:
                 if (RExC_flags & RXf_PMf_EXTENDED)
                     p = regwhite( pRExC_state, p );
                 switch ((U8)*p) {
-               case 0xDF:
-               case 0xC3:
-               case 0xCE:
+               case LATIN_SMALL_LETTER_SHARP_S:
+               case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+               case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                 goto normal_default;
                 case '^':
@@ -7625,9 +7721,9 @@ tryagain:
  
                     switch ((U8)*++p) {
                     /* These are all the special escapes. */
-                   case 0xDF:
-                   case 0xC3:
-                   case 0xCE:
+                    case LATIN_SMALL_LETTER_SHARP_S:
+                    case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
+                    case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
                            if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
                                 goto normal_default;                
                     case 'A':             /* Start assertion */
@@ -8109,9 +8205,9 @@ ANYOF_##NAME:                                           \
          }                                               \
      }                                                   \
      else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
              if (TEST_7) stored +=                       \
-                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
          }                                               \
      }                                                   \
      yesno = '+';                                        \
@@ -8126,10 +8222,13 @@ case ANYOF_N##NAME:                                     \
          }                                               \
      }                                                   \
      else {                                              \
-        for (value = 0; value < 256; value++) {         \
+        for (value = 0; value < 128; value++) {         \
              if (! TEST_7) stored +=                     \
                          S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
          }                                               \
+        for (value = 128; value < 256; value++) {         \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+        }                                               \
      }                                                   \
      yesno = '!';                                        \
      what = WORD;                                        \
@@ -8184,7 +8283,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
      if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
         ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
      }
-    else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
+    else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
              || (! UNI_SEMANTICS
                   && ! isASCII(value)
                   && PL_fold_latin1[value] != value))
@@ -8575,16 +8674,9 @@ parseit:
                     if (LOC)
                         ANYOF_CLASS_SET(ret, ANYOF_ASCII);
                     else {
-#ifndef EBCDIC
                         for (value = 0; value < 128; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
                     }
                     yesno = '+';
                     what = "ASCII";
@@ -8593,16 +8685,9 @@ parseit:
                     if (LOC)
                         ANYOF_CLASS_SET(ret, ANYOF_NASCII);
                     else {
-#ifndef EBCDIC
                         for (value = 128; value < 256; value++)
                             stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-#else  /* EBCDIC */
-                       for (value = 0; value < 256; value++) {
-                           if (!isASCII(value))
-                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
-                       }
-#endif /* EBCDIC */
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
                     }
                     yesno = '!';
                     what = "ASCII";
@@ -8718,10 +8803,7 @@ parseit:
                 else
  #endif
                       for (i = prevvalue; i <= ceilvalue; i++) {
-                       if (!ANYOF_BITMAP_TEST(ret,i)) {
-                           stored +=
-                                S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
-                       }
+                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                       }
           }
           if (value > 255 || UTF) {
@@ -9264,6 +9346,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
              switch (OP(scan)) {
                  case EXACT:
                  case EXACTF:
+                case EXACTFU:
                  case EXACTFL:
                          if( exact == PSEUDO )
                              exact= OP(scan);
@@ -9532,7 +9615,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
      else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
         Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o));    /* Parenth number */
         if ( RXp_PAREN_NAMES(prog) ) {
-            if ( k != REF || OP(o) < NREF) {       
+            if ( k != REF || (OP(o) < NREF)) {
                 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
                 SV **name= av_fetch(list, ARG(o), 0 );
                 if (name)
@@ -9645,7 +9728,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
             sv_catpvs(sv, "{unicode_all}");
         else if (flags & ANYOF_UTF8)
             sv_catpvs(sv, "{unicode}");
-       else if (flags & ANYOF_NONBITMAP)
+       if (flags & ANYOF_NONBITMAP_NON_UTF8)
             sv_catpvs(sv, "{outside bitmap}");
  
         {
@@ -10351,8 +10434,14 @@ S_put_byte(pTHX_ SV *sv, int c)
         ones (binary 1111 1111, hexadecimal FF). It is similar, but not
         identical, to the ASCII delete (DEL) or rubout control character.
         ) So the old condition can be simplified to !isPRINT(c)  */
-    if (!isPRINT(c))
-       Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+    if (!isPRINT(c)) {
+       if (c < 256) {
+           Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
+       }
+       else {
+           Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+       }
+    }
      else {
         const char string = c;
         if (c == '-' || c == ']' || c == '\\' || c == '^')