From 9e55ce066d52428ee12b0c4df544c9a64f88c082 Mon Sep 17 00:00:00 2001 From: Jarkko Hietaniemi Date: Sat, 5 Jan 2002 22:09:20 +0000 Subject: [PATCH] Finish up (ha!) the Unicode case folding; enhance regex dumping code. p4raw-id: //depot/perl@14096 --- MANIFEST | 1 + embed.fnc | 3 +- embed.h | 4 +- lib/unifold.t | 45 ++++++++++++++++++++ pp_hot.c | 5 ++- proto.h | 3 +- regcomp.c | 70 ++++++++++++++++++++++++------- regcomp.h | 4 +- regexec.c | 132 ++++++++++++++++++++++++++++++++++++++++++++-------------- t/op/pat.t | 3 +- utf8.c | 8 +++- utf8.h | 2 + 12 files changed, 224 insertions(+), 56 deletions(-) create mode 100644 lib/unifold.t diff --git a/MANIFEST b/MANIFEST index 401b91a..deaa26a 100644 --- a/MANIFEST +++ b/MANIFEST @@ -1698,6 +1698,7 @@ lib/unicore/UCD.html Unicode character database lib/unicore/Unicode.html Unicode character database lib/unicore/Unicode.txt Unicode character database lib/unicore/version The version of the Unicode +lib/unifold.t See if Unicode folding works lib/UNIVERSAL.pm Base class for ALL classes lib/User/grent.pm By-name interface to Perl's builtin getgr* lib/User/grent.t See if User::grwent works diff --git a/embed.fnc b/embed.fnc index da7e2ce..e534f52 100644 --- a/embed.fnc +++ b/embed.fnc @@ -584,7 +584,7 @@ Ap |void |push_scope p |OP* |ref |OP* o|I32 type p |OP* |refkids |OP* o|I32 type Ap |void |regdump |regexp* r -Ap |SV* |regclass_swash |struct regnode *n|bool doinit|SV **initsvp +Ap |SV* |regclass_swash |struct regnode *n|bool doinit|SV **listsvp|SV **altsvp Ap |I32 |pregexec |regexp* prog|char* stringarg \ |char* strend|char* strbeg|I32 minend \ |SV* screamer|U32 nosave @@ -1134,6 +1134,7 @@ s |I32 |regrepeat |regnode *p|I32 max s |I32 |regrepeat_hard |regnode *p|I32 max|I32 *lp s |I32 |regtry |regexp *prog|char *startpos s |bool |reginclass |regnode *n|U8 *p|bool do_utf8sv_is_utf8 +s |bool |reginclasslen |regnode *n|U8 *p|STRLEN *lenp|bool do_utf8sv_is_utf8 s |CHECKPOINT|regcppush |I32 parenfloor s |char*|regcppop s |char*|regcp_set_to |I32 ss diff --git a/embed.h b/embed.h index 8a5cc4e..6203634 100644 --- a/embed.h +++ b/embed.h @@ -1049,6 +1049,7 @@ #define regrepeat_hard S_regrepeat_hard #define regtry S_regtry #define reginclass S_reginclass +#define reginclasslen S_reginclasslen #define regcppush S_regcppush #define regcppop S_regcppop #define regcp_set_to S_regcp_set_to @@ -2104,7 +2105,7 @@ #define ref(a,b) Perl_ref(aTHX_ a,b) #define refkids(a,b) Perl_refkids(aTHX_ a,b) #define regdump(a) Perl_regdump(aTHX_ a) -#define regclass_swash(a,b,c) Perl_regclass_swash(aTHX_ a,b,c) +#define regclass_swash(a,b,c,d) Perl_regclass_swash(aTHX_ a,b,c,d) #define pregexec(a,b,c,d,e,f,g) Perl_pregexec(aTHX_ a,b,c,d,e,f,g) #define pregfree(a) Perl_pregfree(aTHX_ a) #define pregcomp(a,b,c) Perl_pregcomp(aTHX_ a,b,c) @@ -2588,6 +2589,7 @@ #define regrepeat_hard(a,b,c) S_regrepeat_hard(aTHX_ a,b,c) #define regtry(a,b) S_regtry(aTHX_ a,b) #define reginclass(a,b,c) S_reginclass(aTHX_ a,b,c) +#define reginclasslen(a,b,c,d) S_reginclasslen(aTHX_ a,b,c,d) #define regcppush(a) S_regcppush(aTHX_ a) #define regcppop() S_regcppop(aTHX) #define regcp_set_to(a) S_regcp_set_to(aTHX_ a) diff --git a/lib/unifold.t b/lib/unifold.t new file mode 100644 index 0000000..d4e819e --- /dev/null +++ b/lib/unifold.t @@ -0,0 +1,45 @@ +BEGIN { + chdir 't' if -d 't'; + @INC = '../lib'; +} + +use File::Spec; + +my $CF = File::Spec->catfile(File::Spec->catdir(File::Spec->updir, + "lib", "unicore"), + "CaseFold.txt"); + +if (open(CF, $CF)) { + my @CF; + + while () { + if (/^([0-9A-F]+); ([CFSI]); ((?:[0-9A-F]+)(?: [0-9A-F]+)*); \# (.+)/) { + next if $2 eq 'S'; # we are going for 'F'ull case folding + push @CF, [$1, $2, $3, $4]; + } + } + + die qq[$0: failed to find casefoldings from "$CF"\n] unless @CF; + + print "1..", scalar @CF, "\n"; + + my $i = 0; + for my $cf (@CF) { + my ($code, $status, $mapping, $name) = @$cf; + $i++; + my $a = pack("U0U*", hex $code); + my $b = pack("U0U*", map { hex } split " ", $mapping); + my $t0 = ":$a:" =~ /:$a:/ ? 1 : 0; + my $t1 = ":$a:" =~ /:$a:/i ? 1 : 0; + my $t2 = ":$a:" =~ /:[$a]:/i ? 1 : 0; + my $t3 = ":$a:" =~ /:$b:/i ? 1 : 0; + my $t4 = ":$a:" =~ /:[$b]:/i ? 1 : 0; + my $t5 = ":$b:" =~ /:$a:/i ? 1 : 0; + my $t6 = ":$b:" =~ /:[$a]:/i ? 1 : 0; + print $t0 && $t1 && $t2 && $t3 && $t4 && $t5 && $t6 ? + "ok $i \# - $code - $name - $mapping - - $status\n" : + "not ok $i \# - $code - $name - $mapping - $t0 $t1 $t2 $t3 $t4 $t5 $t6 - $status\n"; + } +} else { + die qq[$0: failed to open "$CF": $!\n]; +} diff --git a/pp_hot.c b/pp_hot.c index 29ec96b..df52bb1 100644 --- a/pp_hot.c +++ b/pp_hot.c @@ -1235,7 +1235,10 @@ PP(pp_match) pm = PL_curpm; rx = PM_GETRE(pm); } - if (rx->minlen > len) goto failure; + if (rx->minlen > len && + !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */ + ) + goto failure; truebase = t = s; diff --git a/proto.h b/proto.h index 52d634e..ea837ec 100644 --- a/proto.h +++ b/proto.h @@ -619,7 +619,7 @@ PERL_CALLCONV void Perl_push_scope(pTHX); PERL_CALLCONV OP* Perl_ref(pTHX_ OP* o, I32 type); PERL_CALLCONV OP* Perl_refkids(pTHX_ OP* o, I32 type); PERL_CALLCONV void Perl_regdump(pTHX_ regexp* r); -PERL_CALLCONV SV* Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **initsvp); +PERL_CALLCONV SV* Perl_regclass_swash(pTHX_ struct regnode *n, bool doinit, SV **listsvp, SV **altsvp); PERL_CALLCONV I32 Perl_pregexec(pTHX_ regexp* prog, char* stringarg, char* strend, char* strbeg, I32 minend, SV* screamer, U32 nosave); PERL_CALLCONV void Perl_pregfree(pTHX_ struct regexp* r); PERL_CALLCONV regexp* Perl_pregcomp(pTHX_ char* exp, char* xend, PMOP* pm); @@ -1164,6 +1164,7 @@ STATIC I32 S_regrepeat(pTHX_ regnode *p, I32 max); STATIC I32 S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp); STATIC I32 S_regtry(pTHX_ regexp *prog, char *startpos); STATIC bool S_reginclass(pTHX_ regnode *n, U8 *p, bool do_utf8sv_is_utf8); +STATIC bool S_reginclasslen(pTHX_ regnode *n, U8 *p, STRLEN *lenp, bool do_utf8sv_is_utf8); STATIC CHECKPOINT S_regcppush(pTHX_ I32 parenfloor); STATIC char* S_regcppop(pTHX); STATIC char* S_regcp_set_to(pTHX_ I32 ss); diff --git a/regcomp.c b/regcomp.c index aacae22..d7ae068 100644 --- a/regcomp.c +++ b/regcomp.c @@ -3427,7 +3427,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) SV *listsv = Nullsv; register char *e; UV n; - bool optimize_invert = TRUE; + bool optimize_invert = TRUE; + AV* unicode_alternate = 0; ret = reganode(pRExC_state, ANYOF, 0); @@ -4028,18 +4029,38 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) /* If folding and foldable and a single * character, insert also the folded version * to the charclass. */ - if (f != value && foldlen == UNISKIP(f)) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", f); + if (f != value) { + if (foldlen == UNISKIP(f)) + Perl_sv_catpvf(aTHX_ listsv, + "%04"UVxf"\n", f); + else { + /* Any multicharacter foldings + * require the following transform: + * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) + * where E folds into "pq" and F folds + * into "rst", all other characters + * fold to single characters. We save + * away these multicharacter foldings, + * to be later saved as part of the + * additional "s" data. */ + SV *sv; + + if (!unicode_alternate) + unicode_alternate = newAV(); + sv = newSVpvn((char*)foldbuf, foldlen); + SvUTF8_on(sv); + av_push(unicode_alternate, sv); + } + } /* If folding and the value is one of the Greek * sigmas insert a few more sigmas to make the * folding rules of the sigmas to work right. * Note that not all the possible combinations * are handled here: some of them are handled - * handled by the standard folding rules, and - * some of them (literal or EXACTF cases) are - * handled during runtime in - * regexec.c:S_find_byclass(). */ + * by the standard folding rules, and some of + * them (literal or EXACTF cases) are handled + * during runtime in regexec.c:S_find_byclass(). */ if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) { Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA); @@ -4094,8 +4115,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state) AV *av = newAV(); SV *rv; + /* The 0th element stores the character class description + * in its textual form: used later (regexec.c:Perl_regclass_swatch()) + * to initialize the appropriate swash (which gets stored in + * the 1st element), and also useful for dumping the regnode. + * The 2nd element stores the multicharacter foldings, + * used later (regexec.c:s_reginclasslen()). */ av_store(av, 0, listsv); av_store(av, 1, NULL); + av_store(av, 2, (SV*)unicode_alternate); rv = newRV_noinc((SV*)av); n = add_data(pRExC_state, 1, "s"); RExC_rx->data->data[n] = (void*)rv; @@ -4625,7 +4653,7 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) { SV *lv; - SV *sw = regclass_swash(o, FALSE, &lv); + SV *sw = regclass_swash(o, FALSE, &lv, 0); if (lv) { if (sw) { @@ -4714,16 +4742,26 @@ Perl_re_intuit_string(pTHX_ regexp *prog) void Perl_pregfree(pTHX_ struct regexp *r) { - DEBUG_r(if (!PL_colorset) reginitcolors()); +#ifdef DEBUGGING + SV *dsv = PERL_DEBUG_PAD_ZERO(0); +#endif if (!r || (--r->refcnt > 0)) return; - DEBUG_r(PerlIO_printf(Perl_debug_log, - "%sFreeing REx:%s `%s%.60s%s%s'\n", - PL_colors[4],PL_colors[5],PL_colors[0], - r->precomp, - PL_colors[1], - (strlen(r->precomp) > 60 ? "..." : ""))); + DEBUG_r({ + bool utf8 = r->reganch & ROPT_UTF8; + char *s = pv_uni_display(dsv, (U8*)r->precomp, r->prelen, 60, + UNI_DISPLAY_ISPRINT); + int len = SvCUR(dsv); + if (!PL_colorset) + reginitcolors(); + PerlIO_printf(Perl_debug_log, + "%sFreeing REx:%s `%s%*.*s%s%s'\n", + PL_colors[4],PL_colors[5],PL_colors[0], + len, len, s, + PL_colors[1], + len > 60 ? "..." : ""); + }); if (r->precomp) Safefree(r->precomp); @@ -4779,7 +4817,7 @@ Perl_pregfree(pTHX_ struct regexp *r) new_comppad = NULL; break; case 'n': - break; + break; default: Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); } diff --git a/regcomp.h b/regcomp.h index 16cf957..9053242 100644 --- a/regcomp.h +++ b/regcomp.h @@ -365,7 +365,9 @@ typedef struct re_scream_pos_data_s * n - Root of op tree for (?{EVAL}) item * o - Start op for (?{EVAL}) item * p - Pad for (?{EVAL} item - * s - swash for unicode-style character class + * s - swash for unicode-style character class, and the multicharacter + * strings resulting from casefolding the single-character entries + * in the character class * 20010712 mjd@plover.com * (Remember to update re_dup() and pregfree() if you add any items.) */ diff --git a/regexec.c b/regexec.c index fe9ad4b..ee8f602 100644 --- a/regexec.c +++ b/regexec.c @@ -1535,7 +1535,8 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * SV* oreplsv = GvSV(PL_replgv); bool do_utf8 = DO_UTF8(sv); #ifdef DEBUGGING - SV *dsv = PERL_DEBUG_PAD_ZERO(0); + SV *dsv0 = PERL_DEBUG_PAD_ZERO(0); + SV *dsv1 = PERL_DEBUG_PAD_ZERO(1); #endif PL_regcc = 0; @@ -1552,7 +1553,9 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } minlen = prog->minlen; - if (strend - startpos < minlen) { + if (strend - startpos < minlen && + !PL_reg_match_utf8 /* ANYOFs can balloon to EXACTFs */ + ) { DEBUG_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -1621,20 +1624,26 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * } DEBUG_r({ - char *s = do_utf8 ? sv_uni_display(dsv, sv, 60, 0) : startpos; - int len = do_utf8 ? strlen(s) : strend - startpos; + char *s0 = UTF ? + pv_uni_display(dsv0, (U8*)prog->precomp, prog->prelen, 60, + UNI_DISPLAY_ISPRINT) : + prog->precomp; + int len0 = UTF ? SvCUR(dsv0) : prog->prelen; + char *s1 = do_utf8 ? sv_uni_display(dsv1, sv, 60, + UNI_DISPLAY_ISPRINT) : startpos; + int len1 = do_utf8 ? SvCUR(dsv1) : strend - startpos; if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, - "%sMatching REx%s `%s%.60s%s%s' against `%s%.*s%s%s'\n", + "%sMatching REx%s `%s%*.*s%s%s' against `%s%.*s%s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], - prog->precomp, + len0, len0, s0, PL_colors[1], - (strlen(prog->precomp) > 60 ? "..." : ""), + len0 > 60 ? "..." : "", PL_colors[0], - (int)(len > 60 ? 60 : len), - s, PL_colors[1], - (len > 60 ? "..." : "") + (int)(len1 > 60 ? 60 : len1), + s1, PL_colors[1], + (len1 > 60 ? "..." : "") ); }); @@ -1805,8 +1814,24 @@ Perl_regexec_flags(pTHX_ register regexp *prog, char *stringarg, register char * strend = HOPc(strend, -(minlen - 1)); DEBUG_r({ SV *prop = sv_newmortal(); + char *s0; + char *s1; + int len0; + int len1; + regprop(prop, c); - PerlIO_printf(Perl_debug_log, "Matching stclass `%s' against `%s'\n", SvPVX(prop), UTF ? sv_uni_display(dsv, sv, 60, 0) : s); + s0 = UTF ? + pv_uni_display(dsv0, (U8*)SvPVX(prop), SvCUR(prop), 60, + UNI_DISPLAY_ISPRINT) : + SvPVX(prop); + len0 = UTF ? SvCUR(dsv0) : SvCUR(prop); + s1 = UTF ? + sv_uni_display(dsv1, sv, 60, UNI_DISPLAY_ISPRINT) : s; + len1 = UTF ? SvCUR(dsv1) : strend - s; + PerlIO_printf(Perl_debug_log, + "Matching stclass `%*.*s' against `%*.*s'\n", + len0, len0, s0, + len1, len1, s1); }); if (find_byclass(prog, c, s, strend, startpos, 0)) goto got_it; @@ -2369,11 +2394,13 @@ S_regmatch(pTHX_ regnode *prog) break; case ANYOF: if (do_utf8) { - if (!reginclass(scan, (U8*)locinput, do_utf8)) + STRLEN inclasslen = PL_regeol - locinput; + + if (!reginclasslen(scan, (U8*)locinput, &inclasslen, do_utf8)) sayNO; if (locinput >= PL_regeol) sayNO; - locinput += PL_utf8skip[nextchr]; + locinput += inclasslen; nextchr = UCHARAT(locinput); } else { @@ -4107,10 +4134,11 @@ S_regrepeat_hard(pTHX_ regnode *p, I32 max, I32 *lp) */ SV * -Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) +Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** listsvp, SV **altsvp) { - SV *sw = NULL; - SV *si = NULL; + SV *sw = NULL; + SV *si = NULL; + SV *alt = NULL; if (PL_regdata && PL_regdata->count) { U32 n = ARG(node); @@ -4118,10 +4146,14 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) if (PL_regdata->what[n] == 's') { SV *rv = (SV*)PL_regdata->data[n]; AV *av = (AV*)SvRV((SV*)rv); - SV **a; + SV **a, **b; - si = *av_fetch(av, 0, FALSE); - a = av_fetch(av, 1, FALSE); + /* See the end of regcomp.c:S_reglass() for + * documentation of these array elements. */ + + si = *av_fetch(av, 0, FALSE); + a = av_fetch(av, 1, FALSE); + b = av_fetch(av, 2, FALSE); if (a) sw = *a; @@ -4129,11 +4161,15 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) sw = swash_init("utf8", "", si, 1, 0); (void)av_store(av, 1, sw); } + if (b) + alt = *b; } } - if (initsvp) - *initsvp = si; + if (listsvp) + *listsvp = si; + if (altsvp) + *altsvp = alt; return sw; } @@ -4143,16 +4179,20 @@ Perl_regclass_swash(pTHX_ register regnode* node, bool doinit, SV** initsvp) */ STATIC bool -S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) +S_reginclasslen(pTHX_ register regnode *n, register U8* p, STRLEN* lenp, register bool do_utf8) { char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c; STRLEN len = 0; + STRLEN plen; c = do_utf8 ? utf8_to_uvchr(p, &len) : *p; + plen = lenp ? *lenp : UNISKIP(c); if (do_utf8 || (flags & ANYOF_UNICODE)) { + if (lenp) + *lenp = 0; if (do_utf8 && !ANYOF_RUNTIME(n)) { if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) match = TRUE; @@ -4160,24 +4200,46 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) match = TRUE; if (!match) { - SV *sw = regclass_swash(n, TRUE, 0); + AV *av; + SV *sw = regclass_swash(n, TRUE, 0, (SV**)&av); if (sw) { if (swash_fetch(sw, p, do_utf8)) match = TRUE; else if (flags & ANYOF_FOLD) { - U8 foldbuf[UTF8_MAXLEN_FOLD+1]; - STRLEN foldlen; - - to_utf8_fold(p, foldbuf, &foldlen); - if (swash_fetch(sw, foldbuf, do_utf8)) - match = TRUE; - to_utf8_upper(p, foldbuf, &foldlen); - if (swash_fetch(sw, foldbuf, do_utf8)) - match = TRUE; + U8 tmpbuf[UTF8_MAXLEN_FOLD+1]; + STRLEN tmplen; + + if (!match && lenp && av) { + I32 i; + + for (i = 0; i <= av_len(av); i++) { + SV* sv = *av_fetch(av, i, FALSE); + STRLEN len; + char *s = SvPV(sv, len); + + if (len <= plen && memEQ(s, p, len)) { + *lenp = len; + match = TRUE; + break; + } + } + } + if (!match) { + to_utf8_fold(p, tmpbuf, &tmplen); + if (swash_fetch(sw, tmpbuf, do_utf8)) + match = TRUE; + } + if (!match) { + to_utf8_upper(p, tmpbuf, &tmplen); + if (swash_fetch(sw, tmpbuf, do_utf8)) + match = TRUE; + } } } } + if (match && lenp && *lenp == 0) + *lenp = UNISKIP(c); } if (!match && c < 256) { if (ANYOF_BITMAP_TEST(n, c)) @@ -4238,6 +4300,12 @@ S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) return (flags & ANYOF_INVERT) ? !match : match; } +STATIC bool +S_reginclass(pTHX_ register regnode *n, register U8* p, register bool do_utf8) +{ + return S_reginclasslen(aTHX_ n, p, 0, do_utf8); +} + STATIC U8 * S_reghop(pTHX_ U8 *s, I32 off) { diff --git a/t/op/pat.t b/t/op/pat.t index bc2ed37..a504186 100755 --- a/t/op/pat.t +++ b/t/op/pat.t @@ -2602,7 +2602,8 @@ print "# some Unicode properties\n"; print "SS" =~ /\N{LATIN SMALL LETTER SHARP S}/i ? "ok 840\n" : "not ok 840\n"; -# Fix coming up. +# These are a bit tricky. Since the LATIN SMALL LETTER SHARP S is U+00DF, +# the ANYOF reduces to a byte. The Unicodeness needs to be caught earlier. # print "ss" =~ # /[\N{LATIN SMALL LETTER SHARP S}]/i ? "ok 841\n" : "not ok 841\n"; # diff --git a/utf8.c b/utf8.c index 93c1128..0a25c03 100644 --- a/utf8.c +++ b/utf8.c @@ -1663,7 +1663,8 @@ Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) Build to the scalar dsv a displayable version of the string spv, length len, the displayable version being at most pvlim bytes long (if longer, the rest is truncated and "..." will be appended). -The flags argument is currently unused but available for future extensions. +The flags argument can have UNI_DISPLAY_ISPRINT set to display +isprint() characters as themselves. The pointer to the PV of the dsv is returned. =cut */ @@ -1681,7 +1682,10 @@ Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags) break; } u = utf8_to_uvchr((U8*)s, 0); - Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); + if ((flags & UNI_DISPLAY_ISPRINT) && u < 256 && isprint(u)) + Perl_sv_catpvf(aTHX_ dsv, "%c", u); + else + Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); } if (truncated) sv_catpvn(dsv, "...", 3); diff --git a/utf8.h b/utf8.h index d907d26..96f1b74 100644 --- a/utf8.h +++ b/utf8.h @@ -193,3 +193,5 @@ END_EXTERN_C #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2 #define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3 +#define UNI_DISPLAY_ISPRINT 0x0001 + -- 1.8.3.1