This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c:current_re_engine: Avoid %^H lookup when possible
[perl5.git] / regcomp.c
index 440d26a..931f8fb 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -1041,8 +1041,8 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con
                     255, which means that the union with cl should just be
                     what cl has in it, so can ignore this flag
             ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
-                    is 127-255 to match them, but then invert that, so the
-                    union with cl should just be what cl has in it, so can
+                    is (ASCII) 127-255 to match them, but then invert that, so
+                    the union with cl should just be what cl has in it, so can
                     ignore this flag
          */
     } else {    /* 'or_with' is not inverted */
@@ -1443,7 +1443,7 @@ and would end up looking like:
    8: EXACT <baz>(10)
   10: END(0)
 
-    d = uvuni_to_utf8_flags(d, uv, 0);
+    d = uvchr_to_utf8_flags(d, uv, 0);
 
 is the recommended Unicode-aware way of saying
 
@@ -1455,7 +1455,7 @@ is the recommended Unicode-aware way of saying
        if (UTF) {                                                         \
             SV *zlopp = newSV(7); /* XXX: optimize me */                   \
            unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp);      \
-            unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
+            unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
            SvCUR_set(zlopp, kapow - flrbbbbb);                            \
            SvPOK_on(zlopp);                                               \
            SvUTF8_on(zlopp);                                              \
@@ -1466,31 +1466,28 @@ is the recommended Unicode-aware way of saying
        }                                                                  \
         } STMT_END
 
-#define TRIE_READ_CHAR STMT_START {                                                     \
-    wordlen++;                                                                          \
-    if ( UTF ) {                                                                        \
-        /* if it is UTF then it is either already folded, or does not need folding */   \
-        uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags);             \
-    }                                                                                   \
-    else if (folder == PL_fold_latin1) {                                                \
-        /* if we use this folder we have to obey unicode rules on latin-1 data */       \
-        if ( foldlen > 0 ) {                                                            \
-           uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags );       \
-           foldlen -= len;                                                              \
-           scan += len;                                                                 \
-           len = 0;                                                                     \
-        } else {                                                                        \
-            len = 1;                                                                    \
-            uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL);       \
-            skiplen = UNISKIP(uvc);                                                     \
-            foldlen -= skiplen;                                                         \
-            scan = foldbuf + skiplen;                                                   \
-        }                                                                               \
-    } else {                                                                            \
-        /* raw data, will be folded later if needed */                                  \
-        uvc = (U32)*uc;                                                                 \
-        len = 1;                                                                        \
-    }                                                                                   \
+/* This gets the next character from the input, folding it if not already
+ * folded. */
+#define TRIE_READ_CHAR STMT_START {                                           \
+    wordlen++;                                                                \
+    if ( UTF ) {                                                              \
+        /* if it is UTF then it is either already folded, or does not need    \
+         * folding */                                                         \
+        uvc = valid_utf8_to_uvchr( (const U8*) uc, &len);                     \
+    }                                                                         \
+    else if (folder == PL_fold_latin1) {                                      \
+        /* This folder implies Unicode rules, which in the range expressible  \
+         *  by not UTF is the lower case, with the two exceptions, one of     \
+         *  which should have been taken care of before calling this */       \
+        assert(*uc != LATIN_SMALL_LETTER_SHARP_S);                            \
+        uvc = toLOWER_L1(*uc);                                                \
+        if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU;         \
+        len = 1;                                                              \
+    } else {                                                                  \
+        /* raw data, will be folded later if needed */                        \
+        uvc = (U32)*uc;                                                       \
+        len = 1;                                                              \
+    }                                                                         \
 } STMT_END
 
 
@@ -1576,7 +1573,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
     HV *widecharmap = NULL;
     AV *revcharmap = newAV();
     regnode *cur;
-    const U32 uniflags = UTF8_ALLOW_DEFAULT;
     STRLEN len = 0;
     UV uvc = 0;
     U16 curword = 0;
@@ -1610,7 +1606,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         case EXACT: break;
        case EXACTFA:
         case EXACTFU_SS:
-        case EXACTFU_TRICKYFOLD:
        case EXACTFU: folder = PL_fold_latin1; break;
        case EXACTF:  folder = PL_fold; break;
        case EXACTFL: folder = PL_fold_locale; break;
@@ -1682,11 +1677,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         const U8 *uc = (U8*)STRING( noper );
         const U8 *e  = uc + STR_LEN( noper );
         STRLEN foldlen = 0;
-        U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-        STRLEN skiplen = 0;
-        const U8 *scan = (U8*)NULL;
         U32 wordlen      = 0;         /* required init */
-        STRLEN chars = 0;
+        STRLEN minbytes = 0;
+        STRLEN maxbytes = 0;
         bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
 
         if (OP(noper) == NOTHING) {
@@ -1707,13 +1700,61 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                           regardless of encoding */
             if (OP( noper ) == EXACTFU_SS) {
                 /* false positives are ok, so just set this */
-                TRIE_BITMAP_SET(trie,0xDF);
+                TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
             }
         }
         for ( ; uc < e ; uc += len ) {
             TRIE_CHARCOUNT(trie)++;
             TRIE_READ_CHAR;
-            chars++;
+
+            /* Acummulate to the current values, the range in the number of
+             * bytes that this character could match.  The max is presumed to
+             * be the same as the folded input (which TRIE_READ_CHAR returns),
+             * except that when this is not in UTF-8, it could be matched
+             * against a string which is UTF-8, and the variant characters
+             * could be 2 bytes instead of the 1 here.  Likewise, for the
+             * minimum number of bytes when not folded.  When folding, the min
+             * is assumed to be 1 byte could fold to match the single character
+             * here, or in the case of a multi-char fold, 1 byte can fold to
+             * the whole sequence.  'foldlen' is used to denote whether we are
+             * in such a sequence, skipping the min setting if so.  XXX TODO
+             * Use the exact list of what folds to each character, from
+             * PL_utf8_foldclosures */
+            if (UTF) {
+                maxbytes += UTF8SKIP(uc);
+                if (! folder) {
+                    /* A non-UTF-8 string could be 1 byte to match our 2 */
+                    minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
+                                ? 1
+                                : UTF8SKIP(uc);
+                }
+                else {
+                    if (foldlen) {
+                        foldlen -= UTF8SKIP(uc);
+                    }
+                    else {
+                        foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
+                        minbytes++;
+                    }
+                }
+            }
+            else {
+                maxbytes += (UNI_IS_INVARIANT(*uc))
+                             ? 1
+                             : 2;
+                if (! folder) {
+                    minbytes++;
+                }
+                else {
+                    if (foldlen) {
+                        foldlen--;
+                    }
+                    else {
+                        foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
+                        minbytes++;
+                    }
+                }
+            }
             if ( uvc < 256 ) {
                 if ( folder ) {
                     U8 folded= folder[ (U8) uvc ];
@@ -1737,7 +1778,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                    if ( !UTF ) {
                        /* store first byte of utf8 representation of
                           variant codepoints */
-                       if (! UNI_IS_INVARIANT(uvc)) {
+                       if (! NATIVE_IS_INVARIANT(uvc)) {
                            TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
                        }
                    }
@@ -1760,25 +1801,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             }
         }
         if( cur == first ) {
-            trie->minlen = chars;
-            trie->maxlen = chars;
-        } else if (chars < trie->minlen) {
-            trie->minlen = chars;
-        } else if (chars > trie->maxlen) {
-            trie->maxlen = chars;
-        }
-        if (OP( noper ) == EXACTFU_SS) {
-            /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
-           if (trie->minlen > 1)
-                trie->minlen= 1;
+            trie->minlen = minbytes;
+            trie->maxlen = maxbytes;
+        } else if (minbytes < trie->minlen) {
+            trie->minlen = minbytes;
+        } else if (maxbytes > trie->maxlen) {
+            trie->maxlen = maxbytes;
         }
-       if (OP( noper ) == EXACTFU_TRICKYFOLD) {
-           /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}" 
-            *                - We assume that any such sequence might match a 2 byte string */
-            if (trie->minlen > 2 )
-                trie->minlen= 2;
-        }
-
     } /* end first pass */
     DEBUG_TRIE_COMPILE_r(
         PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
@@ -1845,11 +1874,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
             const U8 *e      = uc + STR_LEN( noper );
            U32 state        = 1;         /* required init */
            U16 charid       = 0;         /* sanity init */
-           U8 *scan         = (U8*)NULL; /* sanity init */
-           STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
-           U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
-            STRLEN skiplen   = 0;
 
             if (OP(noper) == NOTHING) {
                 regnode *noper_next= regnext(noper);
@@ -2055,12 +2080,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
 
             U16 charid       = 0;         /* sanity init */
             U32 accept_state = 0;         /* sanity init */
-            U8 *scan         = (U8*)NULL; /* sanity init */
 
-            STRLEN foldlen   = 0;         /* required init */
             U32 wordlen      = 0;         /* required init */
-            STRLEN skiplen   = 0;
-            U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
 
             if (OP(noper) == NOTHING) {
                 regnode *noper_next= regnext(noper);
@@ -2680,8 +2701,8 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  * that is "sss".
  *
  * It turns out that there are problems with all multi-character folds, and not
- * just these three.  Now the code is general, for all such cases, but the
- * three still have some special handling.  The approach taken is:
+ * just these three.  Now the code is general, for all such cases.  The
+ * approach taken is:
  * 1)   This routine examines each EXACTFish node that could contain multi-
  *      character fold sequences.  It returns in *min_subtract how much to
  *      subtract from the the actual length of the string to get a real minimum
@@ -2689,10 +2710,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      used by the caller to adjust the min length of the match, and the delta
  *      between min and max, so that the optimizer doesn't reject these
  *      possibilities based on size constraints.
- * 2)   Certain of these sequences require special handling by the trie code,
- *      so, if found, this code changes the joined node type to special ops:
- *      EXACTFU_TRICKYFOLD and EXACTFU_SS.
- * 3)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
  *      is used for an EXACTFU node that contains at least one "ss" sequence in
  *      it.  For non-UTF-8 patterns and strings, this is the only case where
  *      there is a possible fold length change.  That means that a regular
@@ -2711,7 +2729,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      this file makes sure that in EXACTFU nodes, the sharp s gets folded to
  *      'ss', even if the pattern isn't UTF-8.  This avoids the issues
  *      described in the next item.
- * 4)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
+ * 3)   A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
  *      pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
  *      UTF-8 pattern.)  An assumption that the optimizer part of regexec.c
  *      (probably unwittingly, in Perl_regexec_flags()) makes is that a
@@ -2742,7 +2760,14 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source,  regnode
  *      but in a non-UTF8 pattern, folding it to that above-Latin1 string would
  *      require the pattern to be forced into UTF-8, the overhead of which we
  *      want to avoid.)
- */
+ *
+ *      Similarly, the code that generates tries doesn't currently handle
+ *      not-already-folded multi-char folds, and it looks like a pain to change
+ *      that.  Therefore, trie generation of EXACTFA nodes with the sharp s
+ *      doesn't work.  Instead, such an EXACTFA is turned into a new regnode,
+ *      EXACTFA_NO_TRIE, which the trie code knows not to handle.  Most people
+ *      using /iaa matching will be doing so almost entirely with ASCII
+ *      strings, so this should rarely be encountered in practice */
 
 #define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
     if (PL_regkind[OP(scan)] == EXACT) \
@@ -2863,39 +2888,17 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                 }
 
                 /* Nodes with 'ss' require special handling, except for EXACTFL
-                 * and EXACTFA for which there is no multi-char fold to this */
+                 * and EXACTFA-ish for which there is no multi-char fold to
+                 * this */
                 if (len == 2 && *s == 's' && *(s+1) == 's'
-                    && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+                    && OP(scan) != EXACTFL
+                    && OP(scan) != EXACTFA
+                    && OP(scan) != EXACTFA_NO_TRIE)
                 {
                     count = 2;
                     OP(scan) = EXACTFU_SS;
                     s += 2;
                 }
-                else if (len == 6   /* len is the same in both ASCII and EBCDIC
-                                       for these */
-                         && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
-                                      COMBINING_DIAERESIS_UTF8
-                                      COMBINING_ACUTE_ACCENT_UTF8,
-                                   6)
-                             || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
-                                         COMBINING_DIAERESIS_UTF8
-                                         COMBINING_ACUTE_ACCENT_UTF8,
-                                     6)))
-                {
-                    count = 3;
-
-                    /* These two folds require special handling by trie's, so
-                     * change the node type to indicate this.  If EXACTFA and
-                     * EXACTFL were ever to be handled by trie's, this would
-                     * have to be changed.  If this node has already been
-                     * changed to EXACTFU_SS in this loop, leave it as is.  (I
-                     * (khw) think it doesn't matter in regexec.c for UTF
-                     * patterns, but no need to change it */
-                    if (OP(scan) == EXACTFU) {
-                        OP(scan) = EXACTFU_TRICKYFOLD;
-                    }
-                    s += 6;
-                }
                 else { /* Here is a generic multi-char fold. */
                     const U8* multi_end  = s + len;
 
@@ -2908,7 +2911,10 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                      * test for them.  The code that generates the
                      * is_MULTI_foo() macros croaks should one actually get put
                      * into Unicode .) */
-                    if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+                    if (OP(scan) != EXACTFL
+                        && OP(scan) != EXACTFA
+                        && OP(scan) != EXACTFA_NO_TRIE)
+                    {
                         count = utf8_length(s, multi_end);
                         s = multi_end;
                     }
@@ -2937,9 +2943,12 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
             /* Non-UTF-8 pattern, EXACTFA node.  There can't be a multi-char
              * fold to the ASCII range (and there are no existing ones in the
              * upper latin1 range).  But, as outlined in the comments preceding
-             * this function, we need to flag any occurrences of the sharp s */
+             * this function, we need to flag any occurrences of the sharp s.
+             * This character forbids trie formation (because of added
+             * complexity) */
            while (s < s_end) {
                 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
+                    OP(scan) = EXACTFA_NO_TRIE;
                     *has_exactf_sharp_s = TRUE;
                     break;
                 }
@@ -3359,14 +3368,14 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                 EXACT           | EXACT
                                 EXACTFU         | EXACTFU
                                 EXACTFU_SS      | EXACTFU
-                                EXACTFU_TRICKYFOLD | EXACTFU
-                                EXACTFA         | 0
+                                EXACTFA         | EXACTFA
 
 
                         */
 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING :   \
                        ( EXACT == (X) )   ? EXACT :        \
-                       ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU :        \
+                       ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU :        \
+                       ( EXACTFA == (X) ) ? EXACTFA :        \
                        0 )
 
                         /* dont use tail as the end marker for this traverse */
@@ -3746,7 +3755,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         /* All other (EXACTFL handled above) folds except under
                          * /iaa that include s, S, and sharp_s also may include
                          * the others */
-                       if (OP(scan) != EXACTFA) {
+                       if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE)
+                        {
                            if (uc == 's' || uc == 'S') {
                                ANYOF_BITMAP_SET(data->start_class,
                                                 LATIN_SMALL_LETTER_SHARP_S);
@@ -3783,7 +3793,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                            /* All folds except under /iaa that include s, S,
                             * and sharp_s also may include the others */
-                           if (OP(scan) != EXACTFA) {
+                           if (OP(scan) != EXACTFA
+                                && OP(scan) != EXACTFA_NO_TRIE)
+                            {
                                if (uc == 's' || uc == 'S') {
                                    ANYOF_BITMAP_SET(data->start_class,
                                                   LATIN_SMALL_LETTER_SHARP_S);
@@ -4869,7 +4881,7 @@ Perl_current_re_engine(pTHX)
        HV * const table = GvHV(PL_hintgv);
        SV **ptr;
 
-       if (!table)
+       if (!table || !(PL_hints & HINT_LOCALIZE_HH))
            return &PL_core_reg_engine;
        ptr = hv_fetchs(table, "regcomp", FALSE);
        if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
@@ -4950,12 +4962,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
     Newx(dst, *plen_p * 2 + 1, U8);
 
     while (s < *plen_p) {
-        const UV uv = NATIVE_TO_ASCII(src[s]);
-        if (UNI_IS_INVARIANT(uv))
-            dst[d]   = (U8)UTF_TO_NATIVE(uv);
+        if (NATIVE_IS_INVARIANT(src[s]))
+            dst[d]   = src[s];
         else {
-            dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
-            dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
+            dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+            dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
         }
         if (n < num_code_blocks) {
             if (!do_end && pRExC_state->code_blocks[n].start == s) {
@@ -5019,6 +5030,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
         STRLEN orig_patlen = 0;
         bool code = 0;
         SV *msv = use_delim ? delim : *svp;
+        if (!msv) msv = &PL_sv_undef;
 
         /* if we've got a delimiter, we go round the loop twice for each
          * svp slot (except the last), using the delimiter the second
@@ -5037,7 +5049,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
              * The code in this block is based on S_pushav() */
 
             AV *const av = (AV*)msv;
-            const I32 maxarg = AvFILL(av) + 1;
+            const SSize_t maxarg = AvFILL(av) + 1;
             SV **array;
 
             if (oplist) {
@@ -5047,11 +5059,11 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state,
             }
 
             if (SvRMAGICAL(av)) {
-                U32 i;
+                SSize_t i;
 
                 Newx(array, maxarg, SV*);
                 SAVEFREEPV(array);
-                for (i=0; i < (U32)maxarg; i++) {
+                for (i=0; i < maxarg; i++) {
                     SV ** const svp = av_fetch(av, i, FALSE);
                     array[i] = svp ? *svp : &PL_sv_undef;
                 }
@@ -10184,7 +10196,7 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
     if (! len_passed_in) {
         if (UTF) {
             if (FOLD && (! LOC || code_point > 255)) {
-                _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+                _to_uni_fold_flags(code_point,
                                    character,
                                    &len,
                                    FOLD_FLAGS_FULL | ((LOC)
@@ -10918,7 +10930,7 @@ tryagain:
                        p++;
                        break;
                    case 'a':
-                         ender = ASCII_TO_NATIVE('\007');
+                         ender = '\a';
                        p++;
                        break;
                    case 'o':
@@ -10988,16 +11000,18 @@ tryagain:
                         goto loopdone;
                     case '1': case '2': case '3':case '4':
                    case '5': case '6': case '7':
-                        /* When we parse backslash escapes there is ambiguity between
-                         * backreferences and octal escapes. Any escape from \1 - \9 is
-                         * a backreference, any multi-digit escape which does not start with
-                         * 0 and which when evaluated as decimal could refer to an already
-                         * parsed capture buffer is a backslash. Anything else is octal.
+                        /* When we parse backslash escapes there is ambiguity
+                         * between backreferences and octal escapes. Any escape
+                         * from \1 - \9 is a backreference, any multi-digit
+                         * escape which does not start with 0 and which when
+                         * evaluated as decimal could refer to an already
+                         * parsed capture buffer is a backslash. Anything else
+                         * is octal.
                          *
-                         * Note this implies that \118 could be interpreted as 118 OR as
-                         * "\11" . "8" depending on whether there were 118 capture buffers
-                         * defined already in the pattern.
-                         */
+                         * Note this implies that \118 could be interpreted as
+                         * 118 OR as "\11" . "8" depending on whether there
+                         * were 118 capture buffers defined already in the
+                         * pattern.  */
                         if ( !isDIGIT(p[1]) || atoi(p) <= RExC_npar )
                         {  /* Not to be treated as an octal constant, go
                                    find backref */
@@ -11112,8 +11126,7 @@ tryagain:
                         REGC((char)ender, s++);
                     }
                 }
-                else /* FOLD */
-                     if (! ( UTF
+                else /* FOLD */ if (! ( UTF
                         /* See comments for join_exact() as to why we fold this
                          * non-UTF at compile time */
                         || (node_type == EXACTFU
@@ -11145,7 +11158,7 @@ tryagain:
                      * utf8.  If we start to fold non-UTF patterns, be sure to
                      * update join_exact() */
                     if (LOC && ender < 256) {
-                        if (UNI_IS_INVARIANT(ender)) {
+                        if (NATIVE_IS_INVARIANT(ender)) {
                             *s = (U8) ender;
                             foldlen = 1;
                         } else {
@@ -11286,8 +11299,8 @@ tryagain:
                             /* No Latin1 characters participate in multi-char
                              * folds under /l */
                             if (LOC
-                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
-                                                                *s, *(s+1))))
+                                || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_NATIVE(
+                                                                  *s, *(s+1))))
                             {
                                 break;
                             }
@@ -12579,7 +12592,7 @@ parseit:
            case 'f':   value = '\f';                   break;
            case 'b':   value = '\b';                   break;
            case 'e':   value = ASCII_TO_NATIVE('\033');break;
-           case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'a':   value = '\a';                   break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
                {
@@ -13190,9 +13203,11 @@ parseit:
              * included.  literal_endpoint==2 means both ends of the range used
              * a literal character, not \x{foo} */
            if (literal_endpoint == 2
-                && (prevvalue >= 'a' && value <= 'z')
-                    || (prevvalue >= 'A' && value <= 'Z'))
+                && ((prevvalue >= 'a' && value <= 'z')
+                    || (prevvalue >= 'A' && value <= 'Z')))
             {
+                _invlist_intersection(this_range, PL_ASCII,
+                                      &this_range);
                 _invlist_intersection(this_range, PL_Posix_ptrs[_CC_ALPHA],
                                       &this_range);
             }
@@ -13489,7 +13504,7 @@ parseit:
                     /* If the folds haven't been read in, call a fold function
                      * to force that */
                     if (! PL_utf8_tofold) {
-                        U8 dummy[UTF8_MAXBYTES+1];
+                        U8 dummy[UTF8_MAXBYTES_CASE+1];
 
                         /* This string is just a short named one above \xff */
                         to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
@@ -14440,10 +14455,10 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,
             switch (OP(scan)) {
                 case EXACT:
                 case EXACTF:
+                case EXACTFA_NO_TRIE:
                 case EXACTFA:
                 case EXACTFU:
                 case EXACTFU_SS:
-                case EXACTFU_TRICKYFOLD:
                 case EXACTFL:
                         if( exact == PSEUDO )
                             exact= OP(scan);