This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: in debug output, don't duplicate code points
[perl5.git] / regcomp.c
index 752baad..0c30210 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -689,7 +689,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min
                    |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
            else
                data->flags &= ~SF_FIX_BEFORE_EOL;
-           data->minlen_fixed=minlenp; 
+           data->minlen_fixed=minlenp;
            data->lookbehind_fixed=0;
        }
        else { /* *data->longest == data->longest_float */
@@ -1705,7 +1705,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs
         DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, 
             "%*sCompiling trie using list compiler\n",
             (int)depth * 2 + 2, ""));
-       
+
        trie->states = (reg_trie_state *)
            PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
                                  sizeof(reg_trie_state) );
@@ -2774,7 +2774,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
            int noff;
            regnode *n = scan;
-       
+
            /* Skip NOTHING and LONGJMP. */
            while ((n = regnext(n))
                   && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
@@ -2796,7 +2796,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
            next = regnext(scan);
            code = OP(scan);
            /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
-       
+
            if (OP(next) == code || code == IFTHEN) {
                /* NOTE - There is similar code to this block below for handling
                   TRIE nodes on a re-study.  If you change stuff here check there
@@ -2804,7 +2804,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                I32 max1 = 0, min1 = I32_MAX, num = 0;
                struct regnode_charclass_class accum;
                regnode * const startbranch=scan;
-               
+
                if (flags & SCF_DO_SUBSTR)
                    SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
                if (flags & SCF_DO_STCLASS)
@@ -2941,7 +2941,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                  a nested if into a case structure of sorts.
 
                */
-               
+
                    int made=0;
                    if (!re_trie_maxbuff) {
                        re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
@@ -3091,7 +3091,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                         
                         if ( last && TRIE_TYPE_IS_SAFE ) {
                             made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
-#ifdef TRIE_STUDY_OPT  
+#ifdef TRIE_STUDY_OPT
                             if ( ((made == MADE_EXACT_TRIE && 
                                  startbranch == first) 
                                  || ( first_non_open == first )) && 
@@ -3982,7 +3982,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                    break;
                CASE_SYNST_FNC(VERTWS);
                CASE_SYNST_FNC(HORIZWS);
-               
+
                }
                if (flags & SCF_DO_STCLASS_OR)
                    cl_and(data->start_class, and_withp);
@@ -4363,7 +4363,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
                flags &= ~SCF_DO_SUBSTR; 
        }
 #endif /* old or new */
-#endif /* TRIE_STUDY_OPT */    
+#endif /* TRIE_STUDY_OPT */
 
        /* Else: zero-length, ignore. */
        scan = regnext(scan);
@@ -4919,7 +4919,7 @@ reStudy:
                    sawplus = 1;
                else
                    first += regarglen[OP(first)];
-               
+
                first = NEXTOPER(first);
                first_next= regnext(first);
        }
@@ -4934,7 +4934,7 @@ reStudy:
            else
                ri->regstclass = first;
        }
-#ifdef TRIE_STCLASS    
+#ifdef TRIE_STCLASS
        else if (PL_regkind[OP(first)] == TRIE &&
                ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) 
        {
@@ -4955,7 +4955,7 @@ reStudy:
             make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
            ri->regstclass = trie_op;
        }
-#endif 
+#endif
        else if (REGNODE_SIMPLE(OP(first)))
            ri->regstclass = first;
        else if (PL_regkind[OP(first)] == BOUND ||
@@ -5021,7 +5021,7 @@ reStudy:
        * it happens that c_offset_min has been invalidated, since the
        * earlier string may buy us something the later one won't.]
        */
-       
+
        data.longest_fixed = newSVpvs("");
        data.longest_float = newSVpvs("");
        data.last_found = newSVpvs("");
@@ -5039,7 +5039,7 @@ reStudy:
             &data, -1, NULL, NULL,
             SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
 
-       
+
         CHECK_RESTUDY_GOTO;
 
 
@@ -5207,7 +5207,7 @@ reStudy:
        I32 fake;
        struct regnode_charclass_class ch_class;
        I32 last_close = 0;
-       
+
        DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
 
        scan = ri->program + 1;
@@ -6270,9 +6270,9 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV
 void
 Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
 {
-    /* Take the union of two inversion lists and point 'result' to it.  If
-     * 'output' on input points to one of the two lists, the reference count to
-     * that list will be decremented.
+    /* Take the union of two inversion lists and point <output> to it.  *output
+     * should be defined upon input, and if it points to one of the two lists,
+     * the reference count to that list will be decremented.
      * The basis for this comes from "Unicode Demystified" Chapter 13 by
      * Richard Gillam, published by Addison-Wesley, and explained at some
      * length there.  The preface says to incorporate its examples into your
@@ -6313,19 +6313,19 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
     /* If either one is empty, the union is the other one */
     len_a = invlist_len(a);
     if (len_a == 0) {
-       if (output == &a) {
+       if (*output == a) {
            SvREFCNT_dec(a);
        }
-       else if (output != &b) {
+       if (*output != b) {
            *output = invlist_clone(b);
        } /* else *output already = b; */
        return;
     }
     else if ((len_b = invlist_len(b)) == 0) {
-       if (output == &b) {
+       if (*output == b) {
            SvREFCNT_dec(b);
        }
-       else if (output != &a) {
+       if (*output != a) {
            *output = invlist_clone(a);
        }
        /* else *output already = a; */
@@ -6450,7 +6450,7 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
     }
 
     /*  We may be removing a reference to one of the inputs */
-    if (&a == output || &b == output) {
+    if (a == *output || b == *output) {
        SvREFCNT_dec(*output);
     }
 
@@ -6461,9 +6461,9 @@ Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
 void
 Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
 {
-    /* Take the intersection of two inversion lists and point 'i' to it.  If
-     * 'i' on input points to one of the two lists, the reference count to that
-     * list will be decremented.
+    /* Take the intersection of two inversion lists and point <i> to it.  *i
+     * should be defined upon input, and if it points to one of the two lists,
+     * the reference count to that list will be decremented.
      * The basis for this comes from "Unicode Demystified" Chapter 13 by
      * Richard Gillam, published by Addison-Wesley, and explained at some
      * length there.  The preface says to incorporate its examples into your
@@ -6503,10 +6503,10 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
 
        /* If the result is the same as one of the inputs, the input is being
         * overwritten */
-       if (i == &a) {
+       if (*i == a) {
            SvREFCNT_dec(a);
        }
-       else if (i == &b) {
+       else if (*i == b) {
            SvREFCNT_dec(b);
        }
 
@@ -6622,7 +6622,7 @@ Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
     }
 
     /*  We may be removing a reference to one of the inputs */
-    if (&a == i || &b == i) {
+    if (a == *i || b == *i) {
        SvREFCNT_dec(*i);
     }
 
@@ -6772,7 +6772,8 @@ void
 Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
 {
     /* Point <result> to an inversion list which consists of all elements in
-     * <a> that aren't also in <b> */
+     * <a> that aren't also in <b>.  *result should be defined upon input, and
+     * if it points to C<b> its reference count will be decremented. */
 
     PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
     assert(a != b);
@@ -6786,7 +6787,7 @@ Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
 
        /* If the result is not to be the same variable as the original, create
         * a copy */
-       if (result != &a) {
+       if (*result != a) {
            *result = invlist_clone(a);
        }
     } else {
@@ -6860,6 +6861,37 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
     return TRUE;
 }
 
+#ifndef PERL_IN_XSUB_RE
+SV *
+Perl__invlist_contents(pTHX_ SV* const invlist)
+{
+    /* Get the contents of an inversion list into a string SV so that they can
+     * be printed out.  It uses the format traditionally done for debug tracing
+     */
+
+    UV start, end;
+    SV* output = newSVpvs("\n");
+
+    PERL_ARGS_ASSERT__INVLIST_CONTENTS;
+
+    invlist_iterinit(invlist);
+    while (invlist_iternext(invlist, &start, &end)) {
+       if (end == UV_MAX) {
+           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
+       }
+       else if (end != start) {
+           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
+                   start,       end);
+       }
+       else {
+           Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
+       }
+    }
+
+    return output;
+}
+#endif
+
 #if 0
 void
 S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
@@ -7273,7 +7305,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        RExC_parse++;
                if (*RExC_parse!=')') 
                    vFAIL("Expecting close bracket");
-                       
+
               gen_recurse_regop:
                 if ( paren == '-' ) {
                     /*
@@ -7350,7 +7382,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                    RExC_parse++;
                }
                if (*RExC_parse != ')') {
-                   RExC_parse = s;             
+                   RExC_parse = s;
                    vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
                }
                if (!SIZE_ONLY) {
@@ -7408,7 +7440,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
                        || RExC_parse[1] == '<'
                        || RExC_parse[1] == '{') { /* Lookahead or eval. */
                        I32 flag;
-                       
+
                        ret = reg_node(pRExC_state, LOGICAL);
                        if (!SIZE_ONLY)
                            ret->flags = 1;
@@ -7910,7 +7942,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
             Set_Node_Length(ret, 1);
         }
     }
-       
+
     if (!first && SIZE_ONLY)
        RExC_extralen += 1;                     /* BRANCHJ */
 
@@ -8780,7 +8812,7 @@ tryagain:
            break;          
        case 'p':
        case 'P':
-           {   
+           {
                char* const oldregxend = RExC_end;
 #ifdef DEBUGGING
                char* parse_start = RExC_parse - 2;
@@ -9107,7 +9139,7 @@ tryagain:
                    case 'x':
                        if (*++p == '{') {
                            char* const e = strchr(p, '}');
-       
+
                            if (!e) {
                                RExC_parse = p + 1;
                                vFAIL("Missing right brace on \\x{}");
@@ -9553,7 +9585,7 @@ tryagain:
                *flagp |= HASWIDTH;
            if (len == 1 && UNI_IS_INVARIANT(ender))
                *flagp |= SIMPLE;
-               
+
            if (SIZE_ONLY)
                RExC_size += STR_SZ(len);
            else {
@@ -9622,7 +9654,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
        POSIXCC(UCHARAT(RExC_parse))) {
        const char c = UCHARAT(RExC_parse);
        char* const s = RExC_parse++;
-       
+
        while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
            RExC_parse++;
        if (RExC_parse == RExC_end)
@@ -10050,8 +10082,23 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     SV *listsv = NULL;
     STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
                                      than just initialized.  */
+    SV* properties = NULL;    /* Code points that match \p{} \P{} */
+    UV element_count = 0;   /* Number of distinct elements in the class.
+                              Optimizations may be possible if this is tiny */
     UV n;
 
+    /* Unicode properties are stored in a swash; this holds the current one
+     * being parsed.  If this swash is the only above-latin1 component of the
+     * character class, an optimization is to pass it directly on to the
+     * execution engine.  Otherwise, it is set to NULL to indicate that there
+     * are other things in the class that have to be dealt with at execution
+     * time */
+    SV* swash = NULL;          /* Code points that match \p{} \P{} */
+
+    /* Set if a component of this character class is user-defined; just passed
+     * on to the engine */
+    UV has_user_defined_property = 0;
+
     /* code points this node matches that can't be stored in the bitmap */
     SV* nonbitmap = NULL;
 
@@ -10145,8 +10192,10 @@ parseit:
 
        namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
 
-       if (!range)
+       if (!range) {
            rangebegin = RExC_parse;
+           element_count++;
+       }
        if (UTF) {
            value = utf8n_to_uvchr((U8*)RExC_parse,
                                   RExC_end - RExC_parse,
@@ -10222,6 +10271,9 @@ parseit:
                    n = 1;
                }
                if (!SIZE_ONLY) {
+                    SV** invlistsvp;
+                    SV* invlist;
+                    char* name;
                    if (UCHARAT(RExC_parse) == '^') {
                         RExC_parse++;
                         n--;
@@ -10231,18 +10283,106 @@ parseit:
                              n--;
                         }
                    }
+                    /* Try to get the definition of the property into
+                     * <invlist>.  If /i is in effect, the effective property
+                     * will have its name be <__NAME_i>.  The design is
+                     * discussed in commit
+                     * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
+                    Newx(name, n + sizeof("_i__\n"), char);
+
+                    sprintf(name, "%s%.*s%s\n",
+                                    (FOLD) ? "__" : "",
+                                    (int)n,
+                                    RExC_parse,
+                                    (FOLD) ? "_i" : ""
+                    );
+
+                    /* Look up the property name, and get its swash and
+                     * inversion list, if the property is found  */
+                    if (! (ANYOF_FLAGS(ret) & ANYOF_INVERT)) {
+                    if (swash) {
+                        SvREFCNT_dec(swash);
+                    }
+                    swash = _core_swash_init("utf8", name, &PL_sv_undef,
+                                             1, /* binary */
+                                             0, /* not tr/// */
+                                             TRUE, /* this routine will handle
+                                                      undefined properties */
+                                             NULL, FALSE /* No inversion list */
+                                            );
+                    }
+
+                    if (   ANYOF_FLAGS(ret) & ANYOF_INVERT
+                        || ! swash
+                        || ! SvROK(swash)
+                        || ! SvTYPE(SvRV(swash)) == SVt_PVHV
+                        || ! (invlistsvp =
+                               hv_fetchs(MUTABLE_HV(SvRV(swash)),
+                                "INVLIST", FALSE))
+                        || ! (invlist = *invlistsvp))
+                   {
+                        if (swash) {
+                            SvREFCNT_dec(swash);
+                            swash = NULL;
+                        }
+
+                        /* Here didn't find it.  It could be a user-defined
+                         * property that will be available at run-time.  Add it
+                         * to the list to look up then */
+                        Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
+                                        (value == 'p' ? '+' : '!'),
+                                        name);
+                        has_user_defined_property = 1;
+
+                        /* We don't know yet, so have to assume that the
+                         * property could match something in the Latin1 range,
+                         * hence something that isn't utf8 */
+                        ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
+                    }
+                    else {
+
+                        /* Here, did get the swash and its inversion list.  If
+                         * the swash is from a user-defined property, then this
+                         * whole character class should be regarded as such */
+                        SV** user_defined_svp =
+                                            hv_fetchs(MUTABLE_HV(SvRV(swash)),
+                                                        "USER_DEFINED", FALSE);
+                        if (user_defined_svp) {
+                            has_user_defined_property
+                                                    |= SvUV(*user_defined_svp);
+                        }
 
-                   /* Add the property name to the list.  If /i matching, give
-                    * a different name which consists of the normal name
-                    * sandwiched between two underscores and '_i'.  The design
-                    * is discussed in the commit message for this. */
-                   Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%.*s%s\n",
-                                       (value=='p' ? '+' : '!'),
-                                       (FOLD) ? "__" : "",
-                                       (int)n,
-                                       RExC_parse,
-                                       (FOLD) ? "_i" : ""
-                                   );
+                        /* Invert if asking for the complement */
+                        if (value == 'P') {
+
+                           /* Add to any existing list */
+                           if (! properties) {
+                               properties = invlist_clone(invlist);
+                               _invlist_invert(properties);
+                           }
+                           else {
+                               invlist = invlist_clone(invlist);
+                               _invlist_invert(invlist);
+                               _invlist_union(properties, invlist, &properties);
+                               SvREFCNT_dec(invlist);
+                           }
+
+                            /* The swash can't be used as-is, because we've
+                            * inverted things; delay removing it to here after
+                            * have copied its invlist above */
+                            SvREFCNT_dec(swash);
+                            swash = NULL;
+                        }
+                        else {
+                           if (! properties) {
+                               properties = invlist_clone(invlist);
+                           }
+                           else {
+                               _invlist_union(properties, invlist, &properties);
+                           }
+                       }
+                   }
+                   Safefree(name);
                }
                RExC_parse = e + 1;
 
@@ -10386,8 +10526,6 @@ parseit:
                range = 0; /* this was not a true range */
            }
 
-
-    
            if (!SIZE_ONLY) {
                const char *what = NULL;
                char yesno = 0;
@@ -10580,7 +10718,7 @@ parseit:
     if (FOLD && nonbitmap) {
        UV start, end;  /* End points of code point ranges */
 
-       SV* fold_intersection;
+       SV* fold_intersection = NULL;
 
        /* This is a list of all the characters that participate in folds
            * (except marks, etc in multi-char folds */
@@ -10762,6 +10900,20 @@ parseit:
        }
     }
 
+    /* And combine the result (if any) with any inversion list from properties.
+     * The lists are kept separate up to now because we don't want to fold the
+     * properties */
+    if (properties) {
+       if (nonbitmap) {
+           _invlist_union(nonbitmap, properties, &nonbitmap);
+           SvREFCNT_dec(properties);
+       }
+       else {
+           nonbitmap = properties;
+       }
+    }
+
+
     /* Here, we have calculated what code points should be in the character
      * class.
      *
@@ -10918,24 +11070,16 @@ parseit:
         return ret;
     }
 
-    if (nonbitmap) {
-       UV start, end;
-       invlist_iterinit(nonbitmap);
-       while (invlist_iternext(nonbitmap, &start, &end)) {
-           if (start == end) {
-               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
-           }
-           else {
-               /* The \t sets the whole range */
-               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
-                       /* XXX EBCDIC */
-                                  start, end);
-           }
-       }
-       SvREFCNT_dec(nonbitmap);
+    /* If there is a swash and more than one element, we can't use the swash in
+     * the optimization below. */
+    if (swash && element_count > 1) {
+       SvREFCNT_dec(swash);
+       swash = NULL;
     }
-
-    if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
+    if (! nonbitmap
+       && SvCUR(listsv) == initial_listsv_len
+       && ! unicode_alternate)
+    {
        ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
        SvREFCNT_dec(listsv);
        SvREFCNT_dec(unicode_alternate);
@@ -10949,9 +11093,25 @@ parseit:
         * to initialize the appropriate swash (which gets stored in
         * element [1]), and also useful for dumping the regnode.
         * Element [2] stores the multicharacter foldings,
-        * used later (regexec.c:S_reginclass()). */
-       av_store(av, 0, listsv);
-       av_store(av, 1, NULL);  /* Placeholder for generated swash */
+        * used later (regexec.c:S_reginclass()).
+        * Element [3] stores the nonbitmap inversion list for use in addition
+        * or instead of element [0].
+        * Element [4] is set if any component of the class is from a
+        * user-defined property */
+       av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
+                       ? &PL_sv_undef
+                       : listsv);
+       if (swash) {
+           av_store(av, 1, swash);
+           SvREFCNT_dec(nonbitmap);
+       }
+       else {
+           av_store(av, 1, NULL);
+           if (nonbitmap) {
+               av_store(av, 3, nonbitmap);
+               av_store(av, 4, newSVuv(has_user_defined_property));
+           }
+       }
 
         /* Store any computed multi-char folds only if we are allowing
          * them */
@@ -11028,8 +11188,11 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state)
     PERL_ARGS_ASSERT_NEXTCHAR;
 
     for (;;) {
-       if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
-               RExC_parse[2] == '#') {
+       if (RExC_end - RExC_parse >= 3
+           && *RExC_parse == '('
+           && RExC_parse[1] == '?'
+           && RExC_parse[2] == '#')
+       {
            while (*RExC_parse != ')') {
                if (RExC_parse == RExC_end)
                    FAIL("Sequence (?#... not terminated");
@@ -11113,7 +11276,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
           We can't do this:
           
           assert(2==regarglen[op]+1); 
-       
+
           Anything larger than this has to allocate the extra amount.
           If we changed this to be:
           
@@ -11719,7 +11882,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
        Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
        if (flags & ANYOF_INVERT)
            sv_catpvs(sv, "^");
-       
+
        /* output what the standard cp 0-255 bitmap matches */
        for (i = 0; i <= 256; i++) {
            if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
@@ -11765,65 +11928,73 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
        if (ANYOF_NONBITMAP(o)) {
            SV *lv; /* Set if there is something outside the bit map */
            SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
-       
-           if (lv) {
+            bool byte_output = FALSE;   /* If something in the bitmap has been
+                                           output */
+
+           if (lv && lv != &PL_sv_undef) {
                if (sw) {
                    U8 s[UTF8_MAXBYTES_CASE+1];
 
                    for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
                        uvchr_to_utf8(s, i);
-                       
-                       if (i < 256 && swash_fetch(sw, s, TRUE)) {
+
+                       if (i < 256
+                            && ! ANYOF_BITMAP_TEST(o, i)    /* Don't duplicate
+                                                               things already
+                                                               output as part
+                                                               of the bitmap */
+                            && swash_fetch(sw, s, TRUE))
+                        {
                            if (rangestart == -1)
                                rangestart = i;
                        } else if (rangestart != -1) {
+                            byte_output = TRUE;
                            if (i <= rangestart + 3)
                                for (; rangestart < i; rangestart++) {
-                                   const U8 * const e = uvchr_to_utf8(s,rangestart);
-                                   U8 *p;
-                                   for(p = s; p < e; p++)
-                                       put_byte(sv, *p);
+                                   put_byte(sv, rangestart);
                                }
                            else {
-                               const U8 *e = uvchr_to_utf8(s,rangestart);
-                               U8 *p;
-                               for (p = s; p < e; p++)
-                                   put_byte(sv, *p);
+                               put_byte(sv, rangestart);
                                sv_catpvs(sv, "-");
-                               e = uvchr_to_utf8(s, i-1);
-                               for (p = s; p < e; p++)
-                                   put_byte(sv, *p);
-                               }
-                               rangestart = -1;
+                               put_byte(sv, i-1);
                            }
+                           rangestart = -1;
                        }
-                       
-                   sv_catpvs(sv, "..."); /* et cetera */
+                   }
                }
 
                {
                    char *s = savesvpv(lv);
                    char * const origs = s;
-               
+
                    while (*s && *s != '\n')
                        s++;
-               
+
                    if (*s == '\n') {
                        const char * const t = ++s;
-                       
+
+                        if (byte_output) {
+                            sv_catpvs(sv, " ");
+                        }
+
                        while (*s) {
                            if (*s == '\n')
                                *s = ' ';
+                           }
+                           else if (*s == '\t') {
+                               *s = '-';
+                           }
                            s++;
                        }
                        if (s[-1] == ' ')
                            s[-1] = 0;
-                       
+
                        sv_catpv(sv, t);
                    }
-               
+
                    Safefree(origs);
                }
+               SvREFCNT_dec(lv);
            }
        }
 
@@ -12352,7 +12523,7 @@ Perl_regnext(pTHX_ register regnode *p)
 }
 #endif
 
-STATIC void    
+STATIC void
 S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
 {
     va_list args;
@@ -12529,7 +12700,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
                goto after_print;
        } else
            CLEAR_OPTSTART;
-       
+
        regprop(r, sv, node);
        PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
                      (int)(2*indent + 1), "", SvPVX_const(sv));
@@ -12577,7 +12748,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
             sv_setpvs(sv, "");
            for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
                SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
-               
+
                 PerlIO_printf(Perl_debug_log, "%*s%s ",
                    (int)(2*(indent+3)), "",
                     elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,