This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix some EBCDIC problems
[perl5.git] / regcomp.c
index 72dd943..b830385 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -123,7 +123,7 @@ typedef struct RExC_state_t {
     char       *start;                 /* Start of input for compile */
     char       *end;                   /* End of input for compile */
     char       *parse;                 /* Input-scan pointer. */
-    I32                whilem_seen;            /* number of WHILEM in this expr */
+    SSize_t    whilem_seen;            /* number of WHILEM in this expr */
     regnode    *emit_start;            /* Start of emitted-code area */
     regnode    *emit_bound;            /* First regnode outside of the allocated space */
     regnode    *emit;                  /* Code-emit pointer; if = &emit_dummy,
@@ -132,7 +132,7 @@ typedef struct RExC_state_t {
     I32                naughty;                /* How bad is this pattern? */
     I32                sawback;                /* Did we see \1, ...? */
     U32                seen;
-    I32                size;                   /* Code size. */
+    SSize_t    size;                   /* Code size. */
     I32                npar;                   /* Capture buffer count, (OPEN). */
     I32                cpar;                   /* Capture buffer count, (CLOSE). */
     I32                nestroot;               /* root parens we are in - used by accept */
@@ -301,7 +301,7 @@ typedef struct RExC_state_t {
   
   - max_offset
     Only used for floating strings. This is the rightmost point that
-    the string can appear at. If set to I32 max it indicates that the
+    the string can appear at. If set to SSize_t_MAX it indicates that the
     string can occur infinitely far to the right.
   
   - minlenp
@@ -345,28 +345,46 @@ typedef struct RExC_state_t {
 typedef struct scan_data_t {
     /*I32 len_min;      unused */
     /*I32 len_delta;    unused */
-    I32 pos_min;
-    I32 pos_delta;
+    SSize_t pos_min;
+    SSize_t pos_delta;
     SV *last_found;
-    I32 last_end;          /* min value, <0 unless valid. */
-    I32 last_start_min;
-    I32 last_start_max;
+    SSize_t last_end;      /* min value, <0 unless valid. */
+    SSize_t last_start_min;
+    SSize_t last_start_max;
     SV **longest;          /* Either &l_fixed, or &l_float. */
     SV *longest_fixed;      /* longest fixed string found in pattern */
-    I32 offset_fixed;       /* offset where it starts */
-    I32 *minlen_fixed;      /* pointer to the minlen relevant to the string */
+    SSize_t offset_fixed;   /* offset where it starts */
+    SSize_t *minlen_fixed;  /* pointer to the minlen relevant to the string */
     I32 lookbehind_fixed;   /* is the position of the string modfied by LB */
     SV *longest_float;      /* longest floating string found in pattern */
-    I32 offset_float_min;   /* earliest point in string it can appear */
-    I32 offset_float_max;   /* latest point in string it can appear */
-    I32 *minlen_float;      /* pointer to the minlen relevant to the string */
-    I32 lookbehind_float;   /* is the position of the string modified by LB */
+    SSize_t offset_float_min; /* earliest point in string it can appear */
+    SSize_t offset_float_max; /* latest point in string it can appear */
+    SSize_t *minlen_float;  /* pointer to the minlen relevant to the string */
+    SSize_t lookbehind_float; /* is the pos of the string modified by LB */
     I32 flags;
     I32 whilem_c;
-    I32 *last_closep;
+    SSize_t *last_closep;
     struct regnode_charclass_class *start_class;
 } scan_data_t;
 
+/* The below is perhaps overboard, but this allows us to save a test at the
+ * expense of a mask.  This is because on both EBCDIC and ASCII machines, 'A'
+ * and 'a' differ by a single bit; the same with the upper and lower case of
+ * all other ASCII-range alphabetics.  On ASCII platforms, they are 32 apart;
+ * on EBCDIC, they are 64.  This uses an exclusive 'or' to find that bit and
+ * then inverts it to form a mask, with just a single 0, in the bit position
+ * where the upper- and lowercase differ.  XXX There are about 40 other
+ * instances in the Perl core where this micro-optimization could be used.
+ * Should decide if maintenance cost is worse, before changing those
+ *
+ * Returns a boolean as to whether or not 'v' is either a lowercase or
+ * uppercase instance of 'c', where 'c' is in [A-Za-z].  If 'c' is a
+ * compile-time constant, the generated code is better than some optimizing
+ * compilers figure out, amounting to a mask and test.  The results are
+ * meaningless if 'c' is not one of [A-Za-z] */
+#define isARG2_lower_or_UPPER_ARG1(c, v) \
+                              (((v) & ~('A' ^ 'a')) ==  ((c) & ~('A' ^ 'a')))
+
 /*
  * Forward declarations for pregcomp()'s friends.
  */
@@ -730,7 +748,8 @@ DEBUG_OPTIMISE_MORE_r(if(data){                                      \
    floating substrings if needed. */
 
 STATIC void
-S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
+S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
+                    SSize_t *minlenp, int is_inf)
 {
     const STRLEN l = CHR_SVLEN(data->last_found);
     const STRLEN old_l = CHR_SVLEN(*data->longest);
@@ -754,9 +773,12 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
            data->offset_float_min = l ? data->last_start_min : data->pos_min;
            data->offset_float_max = (l
                                      ? data->last_start_max
-                                     : (data->pos_delta == I32_MAX ? I32_MAX : data->pos_min + data->pos_delta));
-           if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
-               data->offset_float_max = I32_MAX;
+                                     : (data->pos_delta == SSize_t_MAX
+                                        ? SSize_t_MAX
+                                        : data->pos_min + data->pos_delta));
+           if (is_inf
+                || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
+               data->offset_float_max = SSize_t_MAX;
            if (data->flags & SF_BEFORE_EOL)
                data->flags
                    |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
@@ -1685,7 +1707,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
                                           regardless of encoding */
             if (OP( noper ) == EXACTFU_SS) {
                 /* false positives are ok, so just set this */
-                TRIE_BITMAP_SET(trie,0xDF);
+                TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
             }
         }
         for ( ; uc < e ; uc += len ) {
@@ -2938,16 +2960,6 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              * have to find at least two characters for a multi-fold */
            const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
 
-            /* The below is perhaps overboard, but this allows us to save a
-             * test each time through the loop at the expense of a mask.  This
-             * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
-             * by a single bit.  On ASCII they are 32 apart; on EBCDIC, they
-             * are 64.  This uses an exclusive 'or' to find that bit and then
-             * inverts it to form a mask, with just a single 0, in the bit
-             * position where 'S' and 's' differ. */
-            const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
-            const U8 s_masked = 's' & S_or_s_mask;
-
            while (s < upper) {
                 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
                 if (! len) {    /* Not a multi-char fold. */
@@ -2960,8 +2972,8 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                 }
 
                 if (len == 2
-                    && ((*s & S_or_s_mask) == s_masked)
-                    && ((*(s+1) & S_or_s_mask) == s_masked))
+                    && isARG2_lower_or_UPPER_ARG1('s', *s)
+                    && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
                 {
 
                     /* EXACTF nodes need to know that the minimum length
@@ -3020,9 +3032,9 @@ typedef struct scan_frame {
 
 #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
 
-STATIC I32
+STATIC SSize_t
 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
-                        I32 *minlenp, I32 *deltap,
+                        SSize_t *minlenp, SSize_t *deltap,
                        regnode *last,
                        scan_data_t *data,
                        I32 stopparen,
@@ -3038,17 +3050,18 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
 {
     dVAR;
-    I32 min = 0;    /* There must be at least this number of characters to match */
+    /* There must be at least this number of characters to match */
+    SSize_t min = 0;
     I32 pars = 0, code;
     regnode *scan = *scanp, *next;
-    I32 delta = 0;
+    SSize_t delta = 0;
     int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
     int is_inf_internal = 0;           /* The studied chunk is infinite */
     I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
     scan_data_t data_fake;
     SV *re_trie_maxbuff = NULL;
     regnode *first_non_open = scan;
-    I32 stopmin = I32_MAX;
+    SSize_t stopmin = SSize_t_MAX;
     scan_frame *frame = NULL;
     GET_RE_DEBUG_FLAGS_DECL;
 
@@ -3116,7 +3129,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                /* NOTE - There is similar code to this block below for handling
                   TRIE nodes on a re-study.  If you change stuff here check there
                   too. */
-               I32 max1 = 0, min1 = I32_MAX, num = 0;
+               SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
                struct regnode_charclass_class accum;
                regnode * const startbranch=scan;
 
@@ -3126,7 +3139,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    cl_init_zero(pRExC_state, &accum);
 
                while (OP(scan) == code) {
-                   I32 deltanext, minnext, f = 0, fake;
+                   SSize_t deltanext, minnext, fake;
+                   I32 f = 0;
                    struct regnode_charclass_class this_class;
 
                    num++;
@@ -3157,9 +3171,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                                          stopparen, recursed, NULL, f,depth+1);
                    if (min1 > minnext)
                        min1 = minnext;
-                   if (deltanext == I32_MAX) {
+                   if (deltanext == SSize_t_MAX) {
                        is_inf = is_inf_internal = 1;
-                       max1 = I32_MAX;
+                       max1 = SSize_t_MAX;
                    } else if (max1 < minnext + deltanext)
                        max1 = minnext + deltanext;
                    scan = next;
@@ -3184,16 +3198,17 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    min1 = 0;
                if (flags & SCF_DO_SUBSTR) {
                    data->pos_min += min1;
-                   if (data->pos_delta >= I32_MAX - (max1 - min1))
-                       data->pos_delta = I32_MAX;
+                   if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
+                       data->pos_delta = SSize_t_MAX;
                    else
                        data->pos_delta += max1 - min1;
                    if (max1 != min1 || is_inf)
                        data->longest = &(data->longest_float);
                }
                min += min1;
-               if (delta == I32_MAX || I32_MAX - delta - (max1 - min1) < 0)
-                   delta = I32_MAX;
+               if (delta == SSize_t_MAX
+                || SSize_t_MAX - delta - (max1 - min1) < 0)
+                   delta = SSize_t_MAX;
                else
                    delta += max1 - min1;
                if (flags & SCF_DO_STCLASS_OR) {
@@ -3581,7 +3596,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            }
        }
        else if (OP(scan) == EXACT) {
-           I32 l = STR_LEN(scan);
+           SSize_t l = STR_LEN(scan);
            UV uc;
            if (UTF) {
                const U8 * const s = (U8*)STRING(scan);
@@ -3597,7 +3612,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                if (data->last_end == -1) { /* Update the start info. */
                    data->last_start_min = data->pos_min;
                    data->last_start_max = is_inf
-                       ? I32_MAX : data->pos_min + data->pos_delta;
+                       ? SSize_t_MAX : data->pos_min + data->pos_delta;
                }
                sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
                if (UTF)
@@ -3670,7 +3685,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            flags &= ~SCF_DO_STCLASS;
        }
        else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
-           I32 l = STR_LEN(scan);
+           SSize_t l = STR_LEN(scan);
            UV uc = *((U8*)STRING(scan));
 
            /* Search for fixed substrings supports EXACT only. */
@@ -3787,8 +3802,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            flags &= ~SCF_DO_STCLASS;
        }
        else if (REGNODE_VARIES(OP(scan))) {
-           I32 mincount, maxcount, minnext, deltanext, fl = 0;
-           I32 f = flags, pos_before = 0;
+           SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
+           I32 fl = 0, f = flags;
            regnode * const oscan = scan;
            struct regnode_charclass_class this_class;
            struct regnode_charclass_class *oclass = NULL;
@@ -3925,11 +3940,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                }
 
                min += minnext * mincount;
-               is_inf_internal |= deltanext == I32_MAX
+               is_inf_internal |= deltanext == SSize_t_MAX
                                     || (maxcount == REG_INFTY && minnext + deltanext > 0);
                is_inf |= is_inf_internal;
                if (is_inf)
-                   delta = I32_MAX;
+                   delta = SSize_t_MAX;
                else
                    delta += (minnext + deltanext) * maxcount - minnext * mincount;
 
@@ -4059,10 +4074,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
 
                    if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
 #if defined(SPARC64_GCC_WORKAROUND)
-                       I32 b = 0;
+                       SSize_t b = 0;
                        STRLEN l = 0;
                        const char *s = NULL;
-                       I32 old = 0;
+                       SSize_t old = 0;
 
                        if (pos_before >= data->last_start_min)
                            b = pos_before;
@@ -4074,11 +4089,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        old = b - data->last_start_min;
 
 #else
-                       I32 b = pos_before >= data->last_start_min
+                       SSize_t b = pos_before >= data->last_start_min
                            ? pos_before : data->last_start_min;
                        STRLEN l;
                        const char * const s = SvPV_const(data->last_found, l);
-                       I32 old = b - data->last_start_min;
+                       SSize_t old = b - data->last_start_min;
 #endif
 
                        if (UTF)
@@ -4110,20 +4125,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                        } else {
                            /* start offset must point into the last copy */
                            data->last_start_min += minnext * (mincount - 1);
-                           data->last_start_max += is_inf ? I32_MAX
+                           data->last_start_max += is_inf ? SSize_t_MAX
                                : (maxcount - 1) * (minnext + data->pos_delta);
                        }
                    }
                    /* It is counted once already... */
                    data->pos_min += minnext * (mincount - counted);
 #if 0
-PerlIO_printf(Perl_debug_log, "counted=%d deltanext=%d I32_MAX=%d minnext=%d maxcount=%d mincount=%d\n",
-    counted, deltanext, I32_MAX, minnext, maxcount, mincount);
-if (deltanext != I32_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount, I32_MAX - data->pos_delta);
+PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
+                              " SSize_t_MAX=%"UVdf" minnext=%"UVdf
+                              " maxcount=%"UVdf" mincount=%"UVdf"\n",
+    (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
+    (UV)mincount);
+if (deltanext != SSize_t_MAX)
+PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+    (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
+          - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
 #endif
-                   if (deltanext == I32_MAX || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= I32_MAX - data->pos_delta)
-                       data->pos_delta = I32_MAX;
+                   if (deltanext == SSize_t_MAX ||
+                       -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= SSize_t_MAX - data->pos_delta)
+                       data->pos_delta = SSize_t_MAX;
                    else
                        data->pos_delta += - counted * deltanext +
                        (minnext + deltanext) * maxcount - minnext * mincount;
@@ -4143,7 +4164,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                            data->last_start_min =
                                data->pos_min - CHR_SVLEN(last_str);
                            data->last_start_max = is_inf
-                               ? I32_MAX
+                               ? SSize_t_MAX
                                : data->pos_min + data->pos_delta
                                - CHR_SVLEN(last_str);
                        }
@@ -4255,8 +4276,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum) + 1);
                             for (value = 0; value < loop_max; value++) {
-                                if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                 }
                             }
                        }
@@ -4271,8 +4292,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                         * in case it isn't a true locale-node.  This will
                         * create false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                             }
                         }
                         }
@@ -4289,8 +4310,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                        if (!(data->start_class->flags & ANYOF_LOCALE)) {
                            ANYOF_CLASS_CLEAR(data->start_class, classnum_to_namedclass(classnum));
                             for (value = 0; value < loop_max; value++) {
-                                if (_generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                    ANYOF_BITMAP_CLEAR(data->start_class, UNI_TO_NATIVE(value));
+                                if (_generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                    ANYOF_BITMAP_CLEAR(data->start_class, LATIN1_TO_NATIVE(value));
                                 }
                             }
                        }
@@ -4305,8 +4326,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                         * case it isn't a true locale-node.  This will create
                         * false positives if it truly is locale */
                         for (value = 0; value < loop_max; value++) {
-                            if (! _generic_isCC(UNI_TO_NATIVE(value), classnum)) {
-                                ANYOF_BITMAP_SET(data->start_class, UNI_TO_NATIVE(value));
+                            if (! _generic_isCC(LATIN1_TO_NATIVE(value), classnum)) {
+                                ANYOF_BITMAP_SET(data->start_class, LATIN1_TO_NATIVE(value));
                             }
                         }
                         if (PL_regkind[OP(scan)] == NPOSIXD) {
@@ -4365,7 +4386,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                    In this case we can't do fixed string optimisation.
                 */
 
-                I32 deltanext, minnext, fake = 0;
+                SSize_t deltanext, minnext, fake = 0;
                 regnode *nscan;
                 struct regnode_charclass_class intrnl;
                 int f = 0;
@@ -4434,7 +4455,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                    length of the pattern, something we won't know about
                    until after the recurse.
                 */
-                I32 deltanext, fake = 0;
+                SSize_t deltanext;
+                I32 fake = 0;
                 regnode *nscan;
                 struct regnode_charclass_class intrnl;
                 int f = 0;
@@ -4444,8 +4466,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                     have to worry about freeing them when we know
                     they wont be used, which would be a pain.
                  */
-                I32 *minnextp;
-                Newx( minnextp, 1, I32 );
+                SSize_t *minnextp;
+                Newx( minnextp, 1, SSize_t );
                 SAVEFREEPV(minnextp);
 
                 if (data) {
@@ -4574,7 +4596,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
            {
                if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
                    RExC_rx->extflags |= RXf_ANCH_GPOS;
-               if (RExC_rx->gofs < (U32)min)
+               if (RExC_rx->gofs < (STRLEN)min)
                    RExC_rx->gofs = min;
             } else {
                 RExC_rx->extflags |= RXf_GPOS_FLOAT;
@@ -4590,7 +4612,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
             regnode *trie_node= scan;
             regnode *tail= regnext(scan);
             reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
-            I32 max1 = 0, min1 = I32_MAX;
+            SSize_t max1 = 0, min1 = SSize_t_MAX;
             struct regnode_charclass_class accum;
 
             if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
@@ -4607,7 +4629,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                 
                 for ( word=1 ; word <= trie->wordcount ; word++) 
                 {
-                    I32 deltanext=0, minnext=0, f = 0, fake;
+                    SSize_t deltanext=0, minnext=0, f = 0, fake;
                     struct regnode_charclass_class this_class;
                     
                     data_fake.flags = 0;
@@ -4641,12 +4663,12 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
                     if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
                         nextbranch= regnext((regnode*)nextbranch);
                     
-                    if (min1 > (I32)(minnext + trie->minlen))
+                    if (min1 > (SSize_t)(minnext + trie->minlen))
                         min1 = minnext + trie->minlen;
-                    if (deltanext == I32_MAX) {
+                    if (deltanext == SSize_t_MAX) {
                         is_inf = is_inf_internal = 1;
-                        max1 = I32_MAX;
-                    } else if (max1 < (I32)(minnext + deltanext + trie->maxlen))
+                        max1 = SSize_t_MAX;
+                    } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
                         max1 = minnext + deltanext + trie->maxlen;
                     
                     if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
@@ -4740,9 +4762,9 @@ PerlIO_printf(Perl_debug_log, "LHS=%d RHS=%d\n", -counted * deltanext + (minnext
     DEBUG_STUDYDATA("pre-fin:",data,depth);
 
     *scanp = scan;
-    *deltap = is_inf_internal ? I32_MAX : delta;
+    *deltap = is_inf_internal ? SSize_t_MAX : delta;
     if (flags & SCF_DO_SUBSTR && is_inf)
-       data->pos_delta = I32_MAX - data->pos_min;
+       data->pos_delta = SSize_t_MAX - data->pos_min;
     if (is_par > (I32)U8_MAX)
        is_par = 0;
     if (is_par && pars==1 && data) {
@@ -4928,12 +4950,11 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state,
     Newx(dst, *plen_p * 2 + 1, U8);
 
     while (s < *plen_p) {
-        const UV uv = NATIVE_TO_ASCII(src[s]);
-        if (UNI_IS_INVARIANT(uv))
-            dst[d]   = (U8)UTF_TO_NATIVE(uv);
+        if (NATIVE_IS_INVARIANT(src[s]))
+            dst[d]   = src[s];
         else {
-            dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
-            dst[d]   = (U8)UTF8_EIGHT_BIT_LO(uv);
+            dst[d++] = UTF8_EIGHT_BIT_HI(src[s]);
+            dst[d]   = UTF8_EIGHT_BIT_LO(src[s]);
         }
         if (n < num_code_blocks) {
             if (!do_end && pRExC_state->code_blocks[n].start == s) {
@@ -5410,13 +5431,15 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
 
 
 STATIC bool
-S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, SSize_t* rx_end_shift,
+                     SSize_t lookbehind, SSize_t offset, SSize_t *minlen, STRLEN longest_length, bool eol, bool meol)
 {
     /* This is the common code for setting up the floating and fixed length
      * string data extracted from Perl_re_op_compile() below.  Returns a boolean
      * as to whether succeeded or not */
 
-    I32 t,ml;
+    I32 t;
+    SSize_t ml;
 
     if (! (longest_length
            || (eol /* Can't have SEOL and MULTI */
@@ -5441,7 +5464,7 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, S
         follow this item. We calculate it ahead of time as once the
         lookbehind offset is added in we lose the ability to correctly
         calculate it.*/
-    ml = minlen ? *(minlen) : (I32)longest_length;
+    ml = minlen ? *(minlen) : (SSize_t)longest_length;
     *rx_end_shift = ml - offset
         - longest_length + (SvTAIL(sv_longest) != 0)
         + lookbehind;
@@ -5510,7 +5533,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
     char *exp;
     regnode *scan;
     I32 flags;
-    I32 minlen = 0;
+    SSize_t minlen = 0;
     U32 rx_flags;
     SV *pat;
     SV *code_blocksv = NULL;
@@ -6067,11 +6090,11 @@ reStudy:
     /* testing for BRANCH here tells us whether there is "must appear"
        data in the pattern. If there is then we can use it for optimisations */
     if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /*  Only one top-level choice. */
-       I32 fake;
+       SSize_t fake;
        STRLEN longest_float_length, longest_fixed_length;
        struct regnode_charclass_class ch_class; /* pointed to by data */
        int stclass_flag;
-       I32 last_close = 0; /* pointed to by data */
+       SSize_t last_close = 0; /* pointed to by data */
         regnode *first= scan;
         regnode *first_next= regnext(first);
        /*
@@ -6265,7 +6288,7 @@ reStudy:
         {
            r->float_min_offset = data.offset_float_min - data.lookbehind_float;
            r->float_max_offset = data.offset_float_max;
-           if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
+           if (data.offset_float_max < SSize_t_MAX) /* Don't offset infinity */
                r->float_max_offset -= data.lookbehind_float;
            SvREFCNT_inc_simple_void_NN(data.longest_float);
        }
@@ -6353,9 +6376,9 @@ reStudy:
     }
     else {
        /* Several toplevels. Best we can is to set minlen. */
-       I32 fake;
+       SSize_t fake;
        struct regnode_charclass_class ch_class;
-       I32 last_close = 0;
+       SSize_t last_close = 0;
 
        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
 
@@ -6468,8 +6491,8 @@ reStudy:
     });
 #ifdef RE_TRACK_PATTERN_OFFSETS
     DEBUG_OFFSETS_r(if (ri->u.offsets) {
-        const U32 len = ri->u.offsets[0];
-        U32 i;
+        const STRLEN len = ri->u.offsets[0];
+        STRLEN i;
         GET_RE_DEBUG_FLAGS_DECL;
         PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
         for (i = 1; i <= len; i++) {
@@ -6657,7 +6680,7 @@ Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
 {
     SV *ret;
     AV *av;
-    I32 length;
+    SSize_t length;
     struct regexp *const rx = ReANY(r);
 
     PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
@@ -6720,19 +6743,29 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
 {
     struct regexp *const rx = ReANY(r);
     char *s = NULL;
-    I32 i = 0;
-    I32 s1, t1;
+    SSize_t i = 0;
+    SSize_t s1, t1;
     I32 n = paren;
 
     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
         
-    if ( (    n == RX_BUFF_IDX_CARET_PREMATCH
+    if (      n == RX_BUFF_IDX_CARET_PREMATCH
            || n == RX_BUFF_IDX_CARET_FULLMATCH
            || n == RX_BUFF_IDX_CARET_POSTMATCH
-         )
-         && !(rx->extflags & RXf_PMf_KEEPCOPY)
-    )
-        goto ret_undef;
+       )
+    {
+        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+        if (!keepcopy) {
+            /* on something like
+             *    $r = qr/.../;
+             *    /$qr/p;
+             * the KEEPCOPY is set on the PMOP rather than the regex */
+            if (PL_curpm && r == PM_GETRE(PL_curpm))
+                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+        }
+        if (!keepcopy)
+            goto ret_undef;
+    }
 
     if (!rx->subbeg)
         goto ret_undef;
@@ -6769,7 +6802,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
     }          
 
     assert(s >= rx->subbeg);
-    assert(rx->sublen >= (s - rx->subbeg) + i );
+    assert((STRLEN)rx->sublen >= (STRLEN)((s - rx->subbeg) + i) );
     if (i >= 0) {
 #if NO_TAINT_SUPPORT
         sv_setpvn(sv, s, i);
@@ -6838,13 +6871,27 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
 
     PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
 
+    if (   paren == RX_BUFF_IDX_CARET_PREMATCH
+        || paren == RX_BUFF_IDX_CARET_FULLMATCH
+        || paren == RX_BUFF_IDX_CARET_POSTMATCH
+    )
+    {
+        bool keepcopy = cBOOL(rx->extflags & RXf_PMf_KEEPCOPY);
+        if (!keepcopy) {
+            /* on something like
+             *    $r = qr/.../;
+             *    /$qr/p;
+             * the KEEPCOPY is set on the PMOP rather than the regex */
+            if (PL_curpm && r == PM_GETRE(PL_curpm))
+                 keepcopy = cBOOL(PL_curpm->op_pmflags & PMf_KEEPCOPY);
+        }
+        if (!keepcopy)
+            goto warn_undef;
+    }
+
     /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
     switch (paren) {
       case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
-        /*FALLTHROUGH*/
-
       case RX_BUFF_IDX_PREMATCH:       /* $` */
         if (rx->offs[0].start != -1) {
                        i = rx->offs[0].start;
@@ -6857,8 +6904,6 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
         return 0;
 
       case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
       case RX_BUFF_IDX_POSTMATCH:       /* $' */
            if (rx->offs[0].end != -1) {
                        i = rx->sublen - rx->offs[0].end;
@@ -6870,13 +6915,7 @@ Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
            }
         return 0;
 
-      case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
-         if (!(rx->extflags & RXf_PMf_KEEPCOPY))
-            goto warn_undef;
-        /*FALLTHROUGH*/
-
-      /* $& / ${^MATCH}, $1, $2, ... */
-      default:
+      default: /* $& / ${^MATCH}, $1, $2, ... */
            if (paren <= (I32)rx->nparens &&
             (s1 = rx->offs[paren].start) != -1 &&
             (t1 = rx->offs[paren].end) != -1)
@@ -8277,37 +8316,53 @@ Perl__invlist_contents(pTHX_ SV* const invlist)
 }
 #endif
 
-#ifdef PERL_ARGS_ASSERT__INVLIST_DUMP
+#ifndef PERL_IN_XSUB_RE
 void
-Perl__invlist_dump(pTHX_ SV* const invlist, const char * const header)
+Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, const char * const indent, SV* const invlist)
 {
-    /* Dumps out the ranges in an inversion list.  The string 'header'
-     * if present is output on a line before the first range */
+    /* Designed to be called only by do_sv_dump().  Dumps out the ranges of the
+     * inversion list 'invlist' to 'file' at 'level'  Each line is prefixed by
+     * the string 'indent'.  The output looks like this:
+         [0] 0x000A .. 0x000D
+         [2] 0x0085
+         [4] 0x2028 .. 0x2029
+         [6] 0x3104 .. INFINITY
+     * This means that the first range of code points matched by the list are
+     * 0xA through 0xD; the second range contains only the single code point
+     * 0x85, etc.  An inversion list is an array of UVs.  Two array elements
+     * are used to define each range (except if the final range extends to
+     * infinity, only a single element is needed).  The array index of the
+     * first element for the corresponding range is given in brackets. */
 
     UV start, end;
+    STRLEN count = 0;
 
     PERL_ARGS_ASSERT__INVLIST_DUMP;
 
-    if (header && strlen(header)) {
-       PerlIO_printf(Perl_debug_log, "%s\n", header);
-    }
     if (invlist_is_iterating(invlist)) {
-        PerlIO_printf(Perl_debug_log, "Can't dump because is in middle of iterating\n");
+        Perl_dump_indent(aTHX_ level, file,
+             "%sCan't dump inversion list because is in middle of iterating\n",
+             indent);
         return;
     }
 
     invlist_iterinit(invlist);
     while (invlist_iternext(invlist, &start, &end)) {
        if (end == UV_MAX) {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
+           Perl_dump_indent(aTHX_ level, file,
+                                       "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n",
+                                   indent, (UV)count, start);
        }
        else if (end != start) {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n",
-                                                start,         end);
+           Perl_dump_indent(aTHX_ level, file,
+                                    "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n",
+                               indent, (UV)count, start,         end);
        }
        else {
-           PerlIO_printf(Perl_debug_log, "0x%04"UVXf"\n", start);
+           Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n",
+                                            indent, (UV)count, start);
        }
+        count += 2;
     }
 }
 #endif
@@ -10712,7 +10767,7 @@ tryagain:
 
        defchar: {
            STRLEN len = 0;
-           UV ender;
+           UV ender = 0;
            char *p;
            char *s;
 #define MAX_NODE_STRING_SIZE 127
@@ -10720,17 +10775,22 @@ tryagain:
            char *s0;
            U8 upper_parse = MAX_NODE_STRING_SIZE;
            STRLEN foldlen;
-            U8 node_type;
+            U8 node_type = compute_EXACTish(pRExC_state);
             bool next_is_quantifier;
             char * oldp = NULL;
 
+            /* We can convert EXACTF nodes to EXACTFU if they contain only
+             * characters that match identically regardless of the target
+             * string's UTF8ness.  The reason to do this is that EXACTF is not
+             * trie-able, EXACTFU is.  (We don't need to figure this out until
+             * pass 2) */
+            bool maybe_exactfu = node_type == EXACTF && PASS2;
+
             /* If a folding node contains only code points that don't
              * participate in folds, it can be changed into an EXACT node,
              * which allows the optimizer more things to look for */
             bool maybe_exact;
 
-           ender = 0;
-            node_type = compute_EXACTish(pRExC_state);
            ret = reg_node(pRExC_state, node_type);
 
             /* In pass1, folded, we use a temporary buffer instead of the
@@ -10743,8 +10803,8 @@ tryagain:
 
             /* We do the EXACTFish to EXACT node only if folding, and not if in
              * locale, as whether a character folds or not isn't known until
-             * runtime */
-            maybe_exact = FOLD && ! LOC;
+             * runtime.  (And we don't need to figure this out until pass 2) */
+            maybe_exact = FOLD && ! LOC && PASS2;
 
            /* XXX The node can hold up to 255 bytes, yet this only goes to
              * 127.  I (khw) do not know why.  Keeping it somewhat less than
@@ -10857,7 +10917,7 @@ tryagain:
                        p++;
                        break;
                    case 'a':
-                         ender = ASCII_TO_NATIVE('\007');
+                         ender = '\a';
                        p++;
                        break;
                    case 'o':
@@ -11051,15 +11111,29 @@ tryagain:
                         REGC((char)ender, s++);
                     }
                 }
-                else /* FOLD */
-                     if (! ( UTF
+                else /* FOLD */ if (! ( UTF
                         /* See comments for join_exact() as to why we fold this
                          * non-UTF at compile time */
                         || (node_type == EXACTFU
                             && ender == LATIN_SMALL_LETTER_SHARP_S)))
                 {
+                    if (IS_IN_SOME_FOLD_L1(ender)) {
+                        maybe_exact = FALSE;
+
+                        /* See if the character's fold differs between /d and
+                         * /u.  This includes the multi-char fold SHARP S to
+                         * 'ss' */
+                        if (maybe_exactfu
+                            && (PL_fold[ender] != PL_fold_latin1[ender]
+                                || ender == LATIN_SMALL_LETTER_SHARP_S
+                                || (len > 0
+                                   && isARG2_lower_or_UPPER_ARG1('s', ender)
+                                   && isARG2_lower_or_UPPER_ARG1('s', *(s-1)))))
+                        {
+                            maybe_exactfu = FALSE;
+                        }
+                    }
                     *(s++) = (char) ender;
-                    maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
                 }
                 else {  /* UTF */
 
@@ -11069,7 +11143,7 @@ tryagain:
                      * utf8.  If we start to fold non-UTF patterns, be sure to
                      * update join_exact() */
                     if (LOC && ender < 256) {
-                        if (UNI_IS_INVARIANT(ender)) {
+                        if (NATIVE_IS_INVARIANT(ender)) {
                             *s = (U8) ender;
                             foldlen = 1;
                         } else {
@@ -11248,6 +11322,15 @@ tryagain:
                  * do any better */
                 if (len == 0) {
                     len = full_len;
+
+                    /* If the node ends in an 's' we make sure it stays EXACTF,
+                     * as if it turns into an EXACTFU, it could later get
+                     * joined with another 's' that would then wrongly match
+                     * the sharp s */
+                    if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+                    {
+                        maybe_exactfu = FALSE;
+                    }
                 } else {
 
                     /* Here, the node does contain some characters that aren't
@@ -11306,12 +11389,19 @@ tryagain:
             if (len == 0) {
                 OP(ret) = NOTHING;
             }
-            else{
-
-                /* If 'maybe_exact' is still set here, means there are no
-                 * code points in the node that participate in folds */
-                if (FOLD && maybe_exact) {
-                    OP(ret) = EXACT;
+            else {
+                if (FOLD) {
+                    /* If 'maybe_exact' is still set here, means there are no
+                     * code points in the node that participate in folds;
+                     * similarly for 'maybe_exactfu' and code points that match
+                     * differently depending on UTF8ness of the target string
+                     * */
+                    if (maybe_exact) {
+                        OP(ret) = EXACT;
+                    }
+                    else if (maybe_exactfu) {
+                        OP(ret) = EXACTFU;
+                    }
                 }
                 alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
             }
@@ -12189,7 +12279,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth,
     regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
         case we need to change the emitted regop to an EXACT. */
     const char * orig_parse = RExC_parse;
-    const I32 orig_size = RExC_size;
+    const SSize_t orig_size = RExC_size;
     GET_RE_DEBUG_FLAGS_DECL;
 
     PERL_ARGS_ASSERT_REGCLASS;
@@ -12487,7 +12577,7 @@ parseit:
            case 'f':   value = '\f';                   break;
            case 'b':   value = '\b';                   break;
            case 'e':   value = ASCII_TO_NATIVE('\033');break;
-           case 'a':   value = ASCII_TO_NATIVE('\007');break;
+           case 'a':   value = '\a';                   break;
            case 'o':
                RExC_parse--;   /* function expects to be pointed at the 'o' */
                {
@@ -14768,93 +14858,61 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
         /* output information about the unicode matching */
        if (flags & ANYOF_UNICODE_ALL)
            sv_catpvs(sv, "{unicode_all}");
-       else if (ANYOF_NONBITMAP(o))
-           sv_catpvs(sv, "{unicode}");
-       if (flags & ANYOF_NONBITMAP_NON_UTF8)
-           sv_catpvs(sv, "{outside bitmap}");
-
-       if (ANYOF_NONBITMAP(o)) {
-           SV *lv; /* Set if there is something outside the bit map */
-           SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
+       else if (ANYOF_NONBITMAP(o)) {
+            SV *lv; /* Set if there is something outside the bit map. */
             bool byte_output = FALSE;   /* If something in the bitmap has been
                                            output */
 
-           if (lv && lv != &PL_sv_undef) {
-               if (sw) {
-                    int rangestart = -1;
-                    int i;
-                   U8 s[UTF8_MAXBYTES_CASE+1];
-
-                   for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
-                       uvchr_to_utf8(s, i);
-
-                       if (i < 256
-                            && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
-                                                               things already
-                                                               output as part
-                                                               of the bitmap */
-                            && swash_fetch(sw, s, TRUE))
-                        {
-                           if (rangestart == -1)
-                               rangestart = i;
-                       } else if (rangestart != -1) {
-                            byte_output = TRUE;
-                           if (i <= rangestart + 3)
-                               for (; rangestart < i; rangestart++) {
-                                   put_byte(sv, rangestart);
-                               }
-                           else {
-                               put_byte(sv, rangestart);
-                               sv_catpvs(sv, "-");
-                               put_byte(sv, i-1);
-                           }
-                           rangestart = -1;
-                       }
-                   }
-               }
+            if (flags & ANYOF_NONBITMAP_NON_UTF8) {
+                sv_catpvs(sv, "{outside bitmap}");
+            }
+            else {
+                sv_catpvs(sv, "{utf8}");
+            }
 
-               {
-                   char *s = savesvpv(lv);
-                   char * const origs = s;
+            /* Get the stuff that wasn't in the bitmap */
+           (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+           if (lv && lv != &PL_sv_undef) {
+                char *s = savesvpv(lv);
+                char * const origs = s;
 
-                   while (*s && *s != '\n')
-                       s++;
+                while (*s && *s != '\n')
+                    s++;
 
-                   if (*s == '\n') {
-                       const char * const t = ++s;
+                if (*s == '\n') {
+                    const char * const t = ++s;
 
-                        if (byte_output) {
-                            sv_catpvs(sv, " ");
-                        }
+                    if (byte_output) {
+                        sv_catpvs(sv, " ");
+                    }
 
-                       while (*s) {
-                           if (*s == '\n') {
+                    while (*s) {
+                        if (*s == '\n') {
 
-                                /* Truncate very long output */
-                               if (s - origs > 256) {
-                                   Perl_sv_catpvf(aTHX_ sv,
-                                                  "%.*s...",
-                                                  (int) (s - origs - 1),
-                                                  t);
-                                   goto out_dump;
-                               }
-                               *s = ' ';
-                           }
-                           else if (*s == '\t') {
-                               *s = '-';
-                           }
-                           s++;
-                       }
-                       if (s[-1] == ' ')
-                           s[-1] = 0;
+                            /* Truncate very long output */
+                            if (s - origs > 256) {
+                                Perl_sv_catpvf(aTHX_ sv,
+                                               "%.*s...",
+                                               (int) (s - origs - 1),
+                                               t);
+                                goto out_dump;
+                            }
+                            *s = ' ';
+                        }
+                        else if (*s == '\t') {
+                            *s = '-';
+                        }
+                        s++;
+                    }
+                    if (s[-1] == ' ')
+                        s[-1] = 0;
 
-                       sv_catpv(sv, t);
-                   }
+                    sv_catpv(sv, t);
+                }
 
-               out_dump:
+            out_dump:
 
-                   Safefree(origs);
-               }
+                Safefree(origs);
                SvREFCNT_dec_NN(lv);
            }
        }
@@ -15491,12 +15549,7 @@ S_put_byte(pTHX_ SV *sv, int c)
             case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
 
             default:
-                if (c < 256) {
-                    Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
-                }
-                else {
-                    Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
-                }
+                Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
                 break;
         }
     }
@@ -15526,13 +15579,37 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
             if (rangestart == -1)
                 rangestart = i;
         } else if (rangestart != -1) {
-            if (i <= rangestart + 3)
+            int j = i - 1;
+            if (i <= rangestart + 3) {  /* Individual chars in short ranges */
                 for (; rangestart < i; rangestart++)
                     put_byte(sv, rangestart);
-            else {
+            }
+            else if (   j > 255
+                     || ! isALPHANUMERIC(rangestart)
+                     || ! isALPHANUMERIC(j)
+                     || isDIGIT(rangestart) != isDIGIT(j)
+                     || isUPPER(rangestart) != isUPPER(j)
+                     || isLOWER(rangestart) != isLOWER(j)
+
+                        /* This final test should get optimized out except
+                         * on EBCDIC platforms, where it causes ranges that
+                         * cross discontinuities like i/j to be shown as hex
+                         * instead of the misleading, e.g. H-K (since that
+                         * range includes more than H, I, J, K). */
+                     || (j - rangestart)
+                         != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+            {
+                Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+                               rangestart,
+                               (j < 256) ? j : 255);
+            }
+            else { /* Here, the ends of the range are both digits, or both
+                      uppercase, or both lowercase; and there's no
+                      discontinuity in the range (which could happen on EBCDIC
+                      platforms) */
                 put_byte(sv, rangestart);
                 sv_catpvs(sv, "-");
-                put_byte(sv, i - 1);
+                put_byte(sv, j);
             }
             rangestart = -1;
             has_output_anything = TRUE;