This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate changes #8267,8272[perlio],8274,8298,8300,8303,
authorJarkko Hietaniemi <jhi@iki.fi>
Sun, 28 Jan 2001 02:04:49 +0000 (02:04 +0000)
committerJarkko Hietaniemi <jhi@iki.fi>
Sun, 28 Jan 2001 02:04:49 +0000 (02:04 +0000)
8305,8323,8324 from mainline.  The 8267,8272, and 8298 were
not really integrated but instead salvaged by hand
(they had too many dependencies on the development release
to be cleanly integratable).

Subject: more UTF8 test suites and an UTF8 patch

Tweak for MULTIPLICITY/USE_PERLIO

Signedness nit.

Turn SvUTF8 off if not required in pp_chr and pp_stringify.

Use the UTF8_XXX macros in is_utf8_char().

Rewrite pp_concat() in terms of sv_catsv().  The . operator
should now be UTF-8-proof.

Subject: [PATCH perl@8269] scanning two hex-constants
fails on EBCDIC environment (script length.t)

Add some Unicode chop() tests.

p4raw-link: @8272 on //depot/perlio: 7948272db9c63907ea8e92fdd3436cdaab2f9cce
p4raw-link: @8267 on //depot/perl: 1aa99e6b6d14c469ac825dde483d9c9f913a3ee2

p4raw-id: //depot/maint-5.6/perl@8572
p4raw-integrated: from //depot/perl@8571 'copy in' t/op/chop.t
(@3319..) 'merge in' utf8.h (@8289..)
p4raw-edited: from //depot/perl@8323 'edit in' doop.c (@8289..) pp.c
(@8299..) utf8.c (@8300..) toke.c (@8306..)
p4raw-integrated: from //depot/perl@8323 'merge in' sv.c (@8301..)
p4raw-edited: from //depot/perl@8303 'edit in' pp_hot.c (@8299..)

doop.c
mg.c
pp.c
pp_hot.c
sv.c
t/op/chop.t
toke.c
utf8.c
utf8.h

diff --git a/doop.c b/doop.c
index 61a40da..90cd171 100644 (file)
--- a/doop.c
+++ b/doop.c
 #include <signal.h>
 #endif
 
-#define HALF_UTF8_UPGRADE(start,end) \
-    STMT_START {                               \
-      if ((start)<(end)) {                     \
-       U8* NeWsTr;                             \
-       STRLEN LeN = (end) - (start);           \
-       NeWsTr = bytes_to_utf8(start, &LeN);    \
-       Safefree(start);                        \
-       (start) = NeWsTr;                       \
-       (end) = (start) + LeN;                  \
-      }                                                \
-    } STMT_END
-
 STATIC I32
 S_do_trans_simple(pTHX_ SV *sv)
 {
@@ -40,7 +28,6 @@ S_do_trans_simple(pTHX_ SV *sv)
     U8 *send;
     U8 *dstart;
     I32 matches = 0;
-    I32 sutf = SvUTF8(sv);
     STRLEN len;
     short *tbl;
     I32 ch;
@@ -53,7 +40,7 @@ S_do_trans_simple(pTHX_ SV *sv)
     send = s + len;
 
     /* First, take care of non-UTF8 input strings, because they're easy */
-    if (!sutf) {
+    if (!SvUTF8(sv)) {
        while (s < send) {
            if ((ch = tbl[*s]) >= 0) {
                matches++;
@@ -78,7 +65,10 @@ S_do_trans_simple(pTHX_ SV *sv)
        c = utf8_to_uv(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
             matches++;
-           d = uv_to_utf8(d, ch);
+            if (ch < 0x80)
+                *d++ = ch;
+            else
+                d = uv_to_utf8(d,ch);
             s += ulen;
         }
        else { /* No match -> copy */
@@ -87,8 +77,7 @@ S_do_trans_simple(pTHX_ SV *sv)
         }
     }
     *d = '\0';
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
-    Safefree(dstart);
+    sv_setpvn(sv, (char*)dstart, d - dstart);
     SvUTF8_on(sv);
     SvSETMAGIC(sv);
     return matches;
@@ -101,7 +90,6 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     U8 *s;
     U8 *send;
     I32 matches = 0;
-    I32 hasutf = SvUTF8(sv);
     STRLEN len;
     short *tbl;
 
@@ -112,22 +100,20 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     s = (U8*)SvPV(sv, len);
     send = s + len;
 
-    while (s < send) {
-        if (hasutf && *s & 0x80)
-            s += UTF8SKIP(s);
-        else {
-            UV c;
-            STRLEN ulen;
-            ulen = 1;
-            if (hasutf)
-                c = utf8_to_uv(s, send - s, &ulen, 0);
-            else
-                c = *s;
-            if (c < 0x100 && tbl[c] >= 0)
+    if (!SvUTF8(sv))
+       while (s < send) {
+            if (tbl[*s++] >= 0)
                 matches++;
-            s += ulen;
-        }
-    }
+       }
+    else
+       while (s < send) {
+           UV c;
+           STRLEN ulen;
+           c = utf8_to_uv(s, send - s, &ulen, 0);
+           if (c < 0x100 && tbl[c] >= 0)
+               matches++;
+           s += ulen;
+       }
 
     return matches;
 }
@@ -140,7 +126,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
     U8 *send;
     U8 *d;
     U8 *dstart;
-    I32 hasutf = SvUTF8(sv);
+    I32 isutf8;
     I32 matches = 0;
     STRLEN len;
     short *tbl;
@@ -151,64 +137,96 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        Perl_croak(aTHX_ "panic: do_trans_complex");
 
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
     send = s + len;
 
-    Newz(0, d, len*2+1, U8);
-    dstart = d;
-
-    if (PL_op->op_private & OPpTRANS_SQUASH) {
-       U8* p = send;
-
-       while (s < send) {
-            if (hasutf && *s & 0x80)
-                s += UTF8SKIP(s);
-            else {
-               if ((ch = tbl[*s]) >= 0) {
+    if (!isutf8) {
+       dstart = d = s;
+       if (PL_op->op_private & OPpTRANS_SQUASH) {
+           U8* p = send;
+           while (s < send) {
+               if ((ch = tbl[*s]) >= 0) {
                    *d = ch;
                    matches++;
-           if (p != d - 1 || *p != *d)
-                       p = d++;
-               }
-               else if (ch == -1)      /* -1 is unmapped character */
-                   *d++ = *s;          /* -2 is delete character */
-               s++;
-            }
+                   if (p != d - 1 || *p != *d)
+                       p = d++;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   *d++ = *s;  
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s++;
+           }
        }
+       else {
+           while (s < send) {
+               if ((ch = tbl[*s]) >= 0) {
+                   matches++;
+                   *d++ = ch;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   *d++ = *s;
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s++;
+           }
+       }
+       SvCUR_set(sv, d - dstart);
     }
-    else {
-       while (s < send) {
-           UV comp;
-            if (hasutf && *s & 0x80)
-                comp = utf8_to_uv_simple(s, NULL);
-           else
-                comp = *s;
-           
-           ch = tbl[comp];
-           
-           if (ch == -1) { /* -1 is unmapped character */
-                ch = comp;
-               matches--;
-           }
-
-           if (ch >= 0)
-               d = uv_to_utf8(d, ch);
-           
-           matches++;
-
-           s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1;
-            
+    else { /* isutf8 */
+       Newz(0, d, len*2+1, U8);
+       dstart = d;
+
+       if (PL_op->op_private & OPpTRANS_SQUASH) {
+           U8* p = send;
+           UV pch = 0xfeedface;
+           while (s < send) {
+               STRLEN len;
+               UV comp = utf8_to_uv_simple(s, &len);
+
+               if (comp > 0xff)
+                   d = uv_to_utf8(d, comp);    /* always unmapped */
+               else if ((ch = tbl[comp]) >= 0) {
+                   matches++;
+                   if (ch != pch) {
+                       d = uv_to_utf8(d, ch);
+                       pch = ch;
+                   }
+                   s += len;
+                   continue;
+               }
+               else if (ch == -1)      /* -1 is unmapped character */
+                   d = uv_to_utf8(d, comp);
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s += len;
+               pch = 0xfeedface;
+           }
+       }
+       else {
+           while (s < send) {
+               STRLEN len;
+               UV comp = utf8_to_uv_simple(s, &len);
+               if (comp > 0xff)
+                   d = uv_to_utf8(d, comp);    /* always unmapped */
+               else if ((ch = tbl[comp]) >= 0) {
+                   d = uv_to_utf8(d, ch);
+                   matches++;
+               }
+               else if (ch == -1) {    /* -1 is unmapped character */
+                   d = uv_to_utf8(d, comp);
+               }
+               else if (ch == -2)      /* -2 is delete character */
+                   matches++;
+               s += len;
+           }
        }
+       *d = '\0';
+       sv_setpvn(sv, (char*)dstart, d - dstart);
+       SvUTF8_on(sv);
     }
-
-    *d = '\0';
-
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
-    Safefree(dstart);
-    if (hasutf)
-        SvUTF8_on(sv);
     SvSETMAGIC(sv);
     return matches;
-
 }
 
 STATIC I32
@@ -219,7 +237,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     U8 *send;
     U8 *d;
     U8 *start;
-    U8 *dstart;
+    U8 *dstart, *dend;
     I32 matches = 0;
     STRLEN len;
 
@@ -230,11 +248,19 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     UV extra = none + 1;
     UV final;
     UV uv;
-    I32 isutf;
-    I32 howmany;
+    I32 isutf8;
+    U8 hibit = 0;
 
-    isutf = SvUTF8(sv);
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
+    if (!isutf8) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
     start = s;
 
@@ -243,41 +269,46 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
        final = SvUV(*svp);
 
     /* d needs to be bigger than s, in case e.g. upgrading is required */
-    Newz(0, d, len*2+1, U8);
+    New(0, d, len*3+UTF8_MAXLEN, U8);
+    dend = d + len * 3;
     dstart = d;
+
     while (s < send) {
        if ((uv = swash_fetch(rv, s)) < none) {
            s += UTF8SKIP(s);
            matches++;
-            if ((uv & 0x80) && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
            d = uv_to_utf8(d, uv);
        }
        else if (uv == none) {
-           int i;
-           i = UTF8SKIP(s);
-            if (i > 1 && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
+           int i = UTF8SKIP(s);
            while(i--)
                *d++ = *s++;
        }
        else if (uv == extra) {
-           int i;
-           i = UTF8SKIP(s);
+           int i = UTF8SKIP(s);
            s += i;
            matches++;
-            if (i > 1 && !isutf++)
-                HALF_UTF8_UPGRADE(dstart,d);
            d = uv_to_utf8(d, final);
        }
        else
            s += UTF8SKIP(s);
+
+       if (d >= dend) {
+           STRLEN clen = d - dstart;
+           STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+           Renew(dstart, nlen+UTF8_MAXLEN, U8);
+           d = dstart + clen;
+           dend = dstart + nlen;
+       }
     }
     *d = '\0';
-    sv_setpvn(sv, (const char*)dstart, d - dstart);
+    sv_setpvn(sv, (char*)dstart, d - dstart);
     SvSETMAGIC(sv);
-    if (isutf)
-        SvUTF8_on(sv);
+    SvUTF8_on(sv);
+    if (hibit)
+       Safefree(start);
+    if (!isutf8 && !(PL_hints & HINT_UTF8))
+       sv_utf8_downgrade(sv, TRUE);
 
     return matches;
 }
@@ -287,7 +318,7 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
 {
     dTHR;
     U8 *s;
-    U8 *send;
+    U8 *start, *send;
     I32 matches = 0;
     STRLEN len;
 
@@ -296,10 +327,17 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
     SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
     UV uv;
+    U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
-    if (!SvUTF8(sv))
-        s = bytes_to_utf8(s, &len);
+    if (!SvUTF8(sv)) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           start = s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
 
     while (s < send) {
@@ -307,6 +345,8 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
            matches++;
        s += UTF8SKIP(s);
     }
+    if (hibit)
+        Safefree(start);
 
     return matches;
 }
@@ -316,7 +356,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
 {
     dTHR;
     U8 *s;
-    U8 *send;
+    U8 *start, *send;
     U8 *d;
     I32 matches = 0;
     I32 squash   = PL_op->op_private & OPpTRANS_SQUASH;
@@ -329,41 +369,45 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     UV final;
     UV uv;
     STRLEN len;
-    U8 *dst;
-    I32 isutf = SvUTF8(sv);
+    U8 *dstart, *dend;
+    I32 isutf8;
+    U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
+    isutf8 = SvUTF8(sv);
+    if (!isutf8) {
+       U8 *t = s, *e = s + len;
+       while (t < e)
+           if ((hibit = *t++ & 0x80))
+               break;
+       if (hibit)
+           s = bytes_to_utf8(s, &len);
+    }
     send = s + len;
+    start = s;
 
     svp = hv_fetch(hv, "FINAL", 5, FALSE);
     if (svp)
        final = SvUV(*svp);
 
-    Newz(0, d, len*2+1, U8);
-       dst = d;
+    New(0, d, len*3+UTF8_MAXLEN, U8);
+    dend = d + len * 3;
+    dstart = d;
 
     if (squash) {
        UV puv = 0xfeedface;
        while (s < send) {
-            if (SvUTF8(sv))
-               uv = swash_fetch(rv, s);
-           else {
-               U8 tmpbuf[2];
-               uv = *s++;
-               if (uv < 0x80)
-                   tmpbuf[0] = uv;
-               else {
-                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
-                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
-               }
-               uv = swash_fetch(rv, tmpbuf);
+           uv = swash_fetch(rv, s);
+           
+           if (d >= dend) {
+               STRLEN clen = d - dstart, nlen = dend - dstart + len;
+               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               d = dstart + clen;
+               dend = dstart + nlen;
            }
-
            if (uv < none) {
                matches++;
                if (uv != puv) {
-                    if ((uv & 0x80) && !isutf++)
-                        HALF_UTF8_UPGRADE(dst,d);
                    d = uv_to_utf8(d, uv);
                    puv = uv;
                }
@@ -371,9 +415,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               STRLEN ulen;
-               *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
-               s += ulen;
+               int i = UTF8SKIP(s);
+               while(i--)
+                   *d++ = *s++;
                puv = 0xfeedface;
                continue;
            }
@@ -392,18 +436,12 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     }
     else {
        while (s < send) {
-            if (SvUTF8(sv))
-               uv = swash_fetch(rv, s);
-           else {
-               U8 tmpbuf[2];
-               uv = *s++;
-               if (uv < 0x80)
-                   tmpbuf[0] = uv;
-               else {
-                   tmpbuf[0] = (( uv >>  6)         | 0xc0);
-                   tmpbuf[1] = (( uv        & 0x3f) | 0x80);
-               }
-               uv = swash_fetch(rv, tmpbuf);
+           uv = swash_fetch(rv, s);
+           if (d >= dend) {
+               STRLEN clen = d - dstart, nlen = dend - dstart + len;
+               Renew(dstart, nlen+UTF8_MAXLEN, U8);
+               d = dstart + clen;
+               dend = dstart + nlen;
            }
            if (uv < none) {
                matches++;
@@ -412,9 +450,9 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
-               STRLEN ulen;
-               *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
-               s += ulen;
+               int i = UTF8SKIP(s);
+               while(i--)
+                   *d++ = *s++;
                continue;
            }
            else if (uv == extra && !del) {
@@ -427,12 +465,13 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            s += UTF8SKIP(s);
        }
     }
-    if (dst)
-       sv_usepvn(sv, (char*)dst, d - dst);
-    else {
-       *d = '\0';
-       SvCUR_set(sv, d - (U8*)SvPVX(sv));
-    }
+    *d = '\0';
+    sv_setpvn(sv, (char*)dstart, d - dstart);
+    SvUTF8_on(sv);
+    if (hibit)
+       Safefree(start);
+    if (!isutf8 && !(PL_hints & HINT_UTF8))
+       sv_utf8_downgrade(sv, TRUE);
     SvSETMAGIC(sv);
 
     return matches;
@@ -803,15 +842,15 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
            char *send = s + len;
            char *start = s;
            s = send - 1;
-           while ((*s & 0xc0) == 0x80)
-               --s;
-           if (UTF8SKIP(s) != send - s && ckWARN_d(WARN_UTF8))
-               Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-           sv_setpvn(astr, s, send - s);
-           *s = '\0';
-           SvCUR_set(sv, s - start);
-           SvNIOK_off(sv);
-           SvUTF8_on(astr);
+           while (s > start && UTF8_IS_CONTINUATION(*s))
+               s--;
+           if (utf8_to_uv_simple((U8*)s, 0)) {
+               sv_setpvn(astr, s, send - s);
+               *s = '\0';
+               SvCUR_set(sv, s - start);
+               SvNIOK_off(sv);
+               SvUTF8_on(astr);
+           }
        }
        else
            sv_setpvn(astr, "", 0);
diff --git a/mg.c b/mg.c
index bf89742..fd811ad 100644 (file)
--- a/mg.c
+++ b/mg.c
@@ -1422,7 +1422,14 @@ Perl_magic_setsubstr(pTHX_ SV *sv, MAGIC *mg)
 {
     STRLEN len;
     char *tmps = SvPV(sv,len);
-    sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+    if (DO_UTF8(sv)) {
+       sv_utf8_upgrade(LvTARG(sv));
+       sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+       SvUTF8_on(LvTARG(sv));
+    }
+    else
+        sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+
     return 0;
 }
 
diff --git a/pp.c b/pp.c
index 2f38542..0fa680d 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -2268,6 +2268,9 @@ PP(pp_chr)
        XPUSHs(TARG);
        RETURN;
     }
+    else {
+       SvUTF8_off(TARG);
+    }
 
     SvGROW(TARG,2);
     SvCUR_set(TARG, 1);
@@ -3266,20 +3269,17 @@ PP(pp_reverse)
                U8* s = (U8*)SvPVX(TARG);
                U8* send = (U8*)(s + len);
                while (s < send) {
-                   if (*s < 0x80) {
+                   if (UTF8_IS_ASCII(*s)) {
                        s++;
                        continue;
                    }
                    else {
+                       if (!utf8_to_uv_simple(s, 0))
+                           break;
                        up = (char*)s;
                        s += UTF8SKIP(s);
                        down = (char*)(s - 1);
-                       if (s > send || !((*down & 0xc0) == 0x80)) {
-                           if (ckWARN_d(WARN_UTF8))
-                               Perl_warner(aTHX_ WARN_UTF8,
-                                           "Malformed UTF-8 character");
-                           break;
-                       }
+                       /* reverse this character */
                        while (down > up) {
                            tmp = *up;
                            *up++ = *down;
@@ -5026,9 +5026,9 @@ PP(pp_split)
     AV *ary;
     register IV limit = POPi;                  /* note, negative is forever */
     SV *sv = POPs;
-    bool do_utf8 = DO_UTF8(sv);
     STRLEN len;
     register char *s = SvPV(sv, len);
+    bool do_utf8 = DO_UTF8(sv);
     char *strend = s + len;
     register PMOP *pm;
     register REGEXP *rx;
@@ -5165,7 +5165,7 @@ PP(pp_split)
        SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx);
 
        len = rx->minlen;
-       if (len == 1 && !tail) {
+       if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
            STRLEN n_a;
            char c = *SvPV(csv, n_a);
            while (--limit) {
@@ -5182,7 +5182,10 @@ PP(pp_split)
                XPUSHs(dstr);
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
-               s = m + (do_utf8 ? SvCUR(csv) : len);
+               if (do_utf8)
+                   s = (char*)utf8_hop((U8*)m, len);
+               else
+                   s = m + len; /* Fake \n at the end */
            }
        }
        else {
@@ -5201,7 +5204,10 @@ PP(pp_split)
                XPUSHs(dstr);
                /* The rx->minlen is in characters but we want to step
                 * s ahead by bytes. */
-               s = m + (do_utf8 ? SvCUR(csv) : len); /* Fake \n at the end */
+               if (do_utf8)
+                   s = (char*)utf8_hop((U8*)m, len);
+               else
+                   s = m + len; /* Fake \n at the end */
            }
        }
     }
index 91e2c12..0616433 100644 (file)
--- a/pp_hot.c
+++ b/pp_hot.c
@@ -78,6 +78,8 @@ PP(pp_stringify)
     sv_setpvn(TARG,s,len);
     if (SvUTF8(TOPs) && !IN_BYTE)
        SvUTF8_on(TARG);
+    else
+       SvUTF8_off(TARG);
     SETTARG;
     RETURN;
 }
@@ -140,103 +142,52 @@ PP(pp_concat)
   djSP; dATARGET; tryAMAGICbin(concat,opASSIGN);
   {
     dPOPTOPssrl;
-    STRLEN len;
-    U8 *s;
-    bool left_utf8;
-    bool right_utf8;
+    SV* rcopy = Nullsv;
 
-    if (TARG == right && SvGMAGICAL(right))
-        mg_get(right);
     if (SvGMAGICAL(left))
         mg_get(left);
+    if (TARG == right && SvGMAGICAL(right))
+        mg_get(right);
 
-    left_utf8  = DO_UTF8(left);
-    right_utf8 = DO_UTF8(right);
-    if (left_utf8 != right_utf8) {
-        if (TARG == right && !right_utf8) {
-            sv_utf8_upgrade(TARG); /* Now straight binary copy */
-            SvUTF8_on(TARG);
-        }
-        else {
-            /* Set TARG to PV(left), then add right */
-            U8 *l, *c, *olds = NULL;
-            STRLEN targlen;
-           s = (U8*)SvPV(right,len);
-           right_utf8 |= DO_UTF8(right);
-            if (TARG == right) {
-               /* Take a copy since we're about to overwrite TARG */
-               olds = s = (U8*)savepvn((char*)s, len);
-           }
-           if (!SvOK(left) && SvTYPE(left) <= SVt_PVMG) {
-               if (SvREADONLY(left))
-                   left = sv_2mortal(newSVsv(left));
-               else
-                   sv_setpv(left, ""); /* Suppress warning. */
-           }
-            l = (U8*)SvPV(left, targlen);
-           left_utf8 |= DO_UTF8(left);
-            if (TARG != left)
-                sv_setpvn(TARG, (char*)l, targlen);
-            if (!left_utf8)
-                sv_utf8_upgrade(TARG);
-            /* Extend TARG to length of right (s) */
-            targlen = SvCUR(TARG) + len;
-            if (!right_utf8) {
-                /* plus one for each hi-byte char if we have to upgrade */
-                for (c = s; c < s + len; c++)  {
-                    if (UTF8_IS_CONTINUED(*c))
-                        targlen++;
-                }
-            }
-            SvGROW(TARG, targlen+1);
-            /* And now copy, maybe upgrading right to UTF8 on the fly */
-           if (right_utf8)
-               Copy(s, SvEND(TARG), len, U8);
-           else {
-               for (c = (U8*)SvEND(TARG); len--; s++)
-                   c = uv_to_utf8(c, *s);
-           }
-            SvCUR_set(TARG, targlen);
-            *SvEND(TARG) = '\0';
-            SvUTF8_on(TARG);
-            SETs(TARG);
-           Safefree(olds);
-            RETURN;
-        }
-    }
-
-    if (TARG != left) {
-       s = (U8*)SvPV(left,len);
-       if (TARG == right) {
-           sv_insert(TARG, 0, 0, (char*)s, len);
-           SETs(TARG);
-           RETURN;
+    if (TARG == right && left != right)
+       /* Clone since otherwise we cannot prepend. */
+       rcopy = sv_2mortal(newSVsv(right));
+
+    if (TARG != left)
+       sv_setsv(TARG, left);
+
+    if (TARG == right) {
+       if (left == right) {
+           /*  $right = $right . $right; */
+           STRLEN rlen;
+           char *rpv = SvPV(right, rlen);
+
+           sv_catpvn(TARG, rpv, rlen);
        }
-       sv_setpvn(TARG, (char *)s, len);
+       else /* $right = $left  . $right; */
+           sv_catsv(TARG, rcopy);
     }
-    else if (!SvOK(TARG) && SvTYPE(TARG) <= SVt_PVMG)
-       sv_setpv(TARG, "");     /* Suppress warning. */
-    s = (U8*)SvPV(right,len);
-    if (SvOK(TARG)) {
+    else {
+       if (!SvOK(TARG)) /* Avoid warning when concatenating to undef. */
+           sv_setpv(TARG, "");
+       /* $other = $left . $right; */
+       /* $left  = $left . $right; */
+       sv_catsv(TARG, right);
+    }
+
 #if defined(PERL_Y2KWARN)
-       if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
-           STRLEN n;
-           char *s = SvPV(TARG,n);
-           if (n >= 2 && s[n-2] == '1' && s[n-1] == '9'
-               && (n == 2 || !isDIGIT(s[n-3])))
-           {
-               Perl_warner(aTHX_ WARN_Y2K, "Possible Y2K bug: %s",
-                           "about to append an integer to '19'");
-           }
+    if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
+       STRLEN n;
+       char *s = SvPV(TARG,n);
+       if (n >= 2 && s[n-2] == '1' && s[n-1] == '9'
+           && (n == 2 || !isDIGIT(s[n-3])))
+       {
+           Perl_warner(aTHX_ WARN_Y2K, "Possible Y2K bug: %s",
+                       "about to append an integer to '19'");
        }
-#endif
-       sv_catpvn(TARG, (char *)s, len);
     }
-    else
-       sv_setpvn(TARG, (char *)s, len);        /* suppress warning */
-    if (left_utf8)
-       SvUTF8_on(TARG);
+#endif
+
     SETTARG;
     RETURN;
   }
@@ -1076,7 +1027,8 @@ play_it_again:
        if (update_minmatch++)
            minmatch = had_zerolen;
     }
-    if (rx->reganch & RE_USE_INTUIT) {
+    if (rx->reganch & RE_USE_INTUIT &&
+       DO_UTF8(TARG) == ((rx->reganch & ROPT_UTF8) != 0)) {
        s = CALLREG_INTUIT_START(aTHX_ rx, TARG, s, strend, r_flags, NULL);
 
        if (!s)
@@ -1804,8 +1756,7 @@ PP(pp_subst)
        TARG = DEFSV;
        EXTEND(SP,1);
     }
-    PL_reg_sv = TARG;
-    do_utf8 = DO_UTF8(PL_reg_sv);
+    do_utf8 = DO_UTF8(TARG);
     if (SvFAKE(TARG) && SvREADONLY(TARG))
        sv_force_normal(TARG);
     if (SvREADONLY(TARG)
@@ -1972,6 +1923,8 @@ PP(pp_subst)
     if (CALLREGEXEC(aTHX_ rx, s, strend, orig, 0, TARG, NULL,
                    r_flags | REXEC_CHECKED))
     {
+       bool isutf8;
+
        if (force_on_match) {
            force_on_match = 0;
            s = SvPV_force(TARG, len);
@@ -2014,6 +1967,7 @@ PP(pp_subst)
        SvPVX(TARG) = SvPVX(dstr);
        SvCUR_set(TARG, SvCUR(dstr));
        SvLEN_set(TARG, SvLEN(dstr));
+       isutf8 = DO_UTF8(dstr);
        SvPVX(dstr) = 0;
        sv_free(dstr);
 
@@ -2022,6 +1976,8 @@ PP(pp_subst)
        PUSHs(sv_2mortal(newSViv((I32)iters)));
 
        (void)SvPOK_only(TARG);
+       if (isutf8)
+           SvUTF8_on(TARG);
        TAINT_IF(rxtainted);
        SvSETMAGIC(TARG);
        SvTAINT(TARG);
diff --git a/sv.c b/sv.c
index 6810d28..9b54302 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -4054,18 +4054,18 @@ Perl_sv_pos_b2u(pTHX_ register SV *sv, I32* offsetp)
 
     s = (U8*)SvPV(sv, len);
     if (len < *offsetp)
-       Perl_croak(aTHX_ "panic: bad byte offset");
+       Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
     send = s + *offsetp;
     len = 0;
     while (s < send) {
-       s += UTF8SKIP(s);
-       ++len;
-    }
-    if (s != send) {
-        dTHR;
-       if (ckWARN_d(WARN_UTF8))    
-           Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
-       --len;
+       STRLEN n;
+
+       if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) {
+           s += n;
+           len++;
+       }
+       else
+           break;
     }
     *offsetp = len;
     return;
index 6723ca3..65d0669 100755 (executable)
@@ -1,6 +1,6 @@
 #!./perl
 
-print "1..30\n";
+print "1..33\n";
 
 # optimized
 
@@ -89,3 +89,17 @@ $_ = "ab\n";
 $/ = \3;
 print chomp() == 0 ? "ok 29\n" : "not ok 29\n";
 print $_ eq "ab\n" ? "ok 30\n" : "not ok 30\n";
+
+# Go Unicode.
+
+$_ = "abc\x{1234}";
+chop;
+print $_ eq "abc" ? "ok 31\n" : "not ok 31\n";
+
+$_ = "abc\x{1234}d";
+chop;
+print $_ eq "abc\x{1234}" ? "ok 32\n" : "not ok 32\n";
+
+$_ = "\x{1234}\x{2345}";
+chop;
+print $_ eq "\x{1234}" ? "ok 33\n" : "not ok 33\n";
diff --git a/toke.c b/toke.c
index 398f0ad..43e8e28 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -1303,31 +1303,10 @@ S_scan_const(pTHX_ char *start)
                break;          /* in regexp, $ might be tail anchor */
        }
 
-       /* (now in tr/// code again) */
-
-       if (*s & 0x80 && (this_utf8 || has_utf8)) {
-           STRLEN len = (STRLEN) -1;
-           UV uv;
-           if (this_utf8) {
-               uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
-           }
-           if (len == (STRLEN)-1) {
-               /* Illegal UTF8 (a high-bit byte), make it valid. */
-               char *old_pvx = SvPVX(sv);
-               /* need space for one extra char (NOTE: SvCUR() not set here) */
-               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
-               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
-           }
-           else {
-               while (len--)
-                   *d++ = *s++;
-           }
-           has_utf8 = TRUE;
-           continue;
-       }
-
        /* backslashes */
        if (*s == '\\' && s+1 < send) {
+           bool to_be_utf8 = FALSE;
+
            s++;
 
            /* some backslashes we leave behind */
@@ -1398,7 +1377,7 @@ S_scan_const(pTHX_ char *start)
                    else {
                        STRLEN len = 1;         /* allow underscores */
                        uv = (UV)scan_hex(s + 1, e - s - 1, &len);
-                       has_utf8 = TRUE;
+                       to_be_utf8 = TRUE;
                    }
                    s = e + 1;
                }
@@ -1415,8 +1394,8 @@ S_scan_const(pTHX_ char *start)
                 * There will always enough room in sv since such escapes will
                 * be longer than any utf8 sequence they can end up as
                 */
-               if (uv > 127 || has_utf8) {
-                   if (!this_utf8 && !has_utf8 && uv > 255) {
+               if (uv > 127) {
+                   if (!has_utf8 && (to_be_utf8 || uv > 255)) {
                        /* might need to recode whatever we have accumulated so far
                         * if it contains any hibit chars
                         */
@@ -1448,7 +1427,7 @@ S_scan_const(pTHX_ char *start)
                         }
                     }
 
-                    if (has_utf8 || uv > 255) {
+                    if (to_be_utf8 || uv > 255) {
                        d = (char*)uv_to_utf8((U8*)d, uv);
                        has_utf8 = TRUE;
                     }
@@ -1560,6 +1539,29 @@ S_scan_const(pTHX_ char *start)
            continue;
        } /* end if (backslash) */
 
+       /* (now in tr/// code again) */
+
+       if (*s & 0x80 && (this_utf8 || has_utf8)) {
+           STRLEN len = (STRLEN) -1;
+           UV uv;
+           if (this_utf8) {
+               uv = utf8_to_uv((U8*)s, send - s, &len, 0);
+           }
+           if (len == (STRLEN)-1) {
+               /* Illegal UTF8 (a high-bit byte), make it valid. */
+               char *old_pvx = SvPVX(sv);
+               /* need space for one extra char (NOTE: SvCUR() not set here) */
+               d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+               d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+           }
+           else {
+               while (len--)
+                   *d++ = *s++;
+           }
+           has_utf8 = TRUE;
+           continue;
+       }
+
        *d++ = *s++;
     } /* while loop to process each character */
 
@@ -3116,6 +3118,9 @@ Perl_yylex(pTHX)
                if (*d == '}') {
                    char minus = (PL_tokenbuf[0] == '-');
                    s = force_word(s + minus, WORD, FALSE, TRUE, FALSE);
+                   if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, 0) &&
+                       PL_nextval[PL_nexttoke-1].opval)
+                     SvUTF8_on(((SVOP*)PL_nextval[PL_nexttoke-1].opval)->op_sv);
                    if (minus)
                        force_next('-');
                }
@@ -3770,6 +3775,8 @@ Perl_yylex(pTHX)
            CLINE;
            yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0));
            yylval.opval->op_private = OPpCONST_BARE;
+           if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+             SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
            TERM(WORD);
        }
 
@@ -3929,6 +3936,8 @@ Perl_yylex(pTHX)
                if (*s == '=' && s[1] == '>') {
                    CLINE;
                    sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf);
+                   if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+                     SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
                    TERM(WORD);
                }
 
@@ -4630,6 +4639,7 @@ Perl_yylex(pTHX)
                int warned = 0;
                d = SvPV_force(PL_lex_stuff, len);
                while (len) {
+                   SV *sv;
                    for (; isSPACE(*d) && len; --len, ++d) ;
                    if (len) {
                        char *b = d;
@@ -4650,8 +4660,11 @@ Perl_yylex(pTHX)
                        else {
                            for (; !isSPACE(*d) && len; --len, ++d) ;
                        }
+                       sv = newSVpvn(b, d-b);
+                       if (DO_UTF8(PL_lex_stuff))
+                           SvUTF8_on(sv);
                        words = append_elem(OP_LIST, words,
-                                           newSVOP(OP_CONST, 0, tokeq(newSVpvn(b, d-b))));
+                                           newSVOP(OP_CONST, 0, tokeq(sv)));
                    }
                }
                if (words) {
@@ -6263,7 +6276,9 @@ S_scan_trans(pTHX_ char *start)
            squash = OPpTRANS_SQUASH;
        s++;
     }
-    o->op_private = del|squash|complement;
+    o->op_private = del|squash|complement|
+      (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
+      (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF   : 0);
 
     PL_lex_op = o;
     yylval.ival = OP_TRANS;
@@ -6458,6 +6473,8 @@ retval:
        Renew(SvPVX(tmpstr), SvLEN(tmpstr), char);
     }
     SvREFCNT_dec(herewas);
+    if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr)))
+       SvUTF8_on(tmpstr);
     PL_lex_stuff = tmpstr;
     yylval.ival = op_type;
     return s;
@@ -7228,7 +7245,8 @@ vstring:
                SvREADONLY_on(sv);
                if (utf8) {
                    SvUTF8_on(sv);
-                   sv_utf8_downgrade(sv, TRUE);
+                   if (!UTF||IN_BYTE)
+                     sv_utf8_downgrade(sv, TRUE);
                }
            }
        }
diff --git a/utf8.c b/utf8.c
index 0aa46f7..8e3ae1d 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -119,15 +119,15 @@ Perl_is_utf8_char(pTHX_ U8 *s)
     STRLEN slen, len;
     UV uv, ouv;
 
-    if (u <= 0x7f)
+    if (UTF8_IS_ASCII(u))
        return 1;
 
-    if (u >= 0x80 && u <= 0xbf)
+    if (!UTF8_IS_START(u))
        return 0;
 
     len = UTF8SKIP(s);
 
-    if (len < 2 || (u >= 0xc0 && u <= 0xfd && s[1] < 0x80))
+    if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
        return 0;
 
     slen = len - 1;
@@ -135,7 +135,7 @@ Perl_is_utf8_char(pTHX_ U8 *s)
     uv = u;
     ouv = uv;
     while (slen--) {
-       if ((*s & 0xc0) != 0x80)
+       if (!UTF8_IS_CONTINUATION(*s))
            return 0;
        uv = UTF8_ACCUMULATE(uv, *s);
        if (uv < ouv)
@@ -163,17 +163,21 @@ bool
 Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
 {
     U8* x = s;
-    U8* send = s + len;
+    U8* send;
     STRLEN c;
 
+    if (!len)
+       len = strlen((char *)s);
+    send = s + len;
+
     while (x < send) {
         c = is_utf8_char(x);
        if (!c)
            return FALSE;
         x += c;
-        if (x > send)
-            return FALSE;
     }
+    if (x != send)
+       return FALSE;
 
     return TRUE;
 }
@@ -210,11 +214,24 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
     bool dowarn = ckWARN_d(WARN_UTF8);
 #endif
     STRLEN expectlen = 0;
-    
-    if (curlen == 0) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (an empty string)");
+    U32 warning = 0;
+
+/* This list is a superset of the UTF8_ALLOW_XXX. */
+
+#define UTF8_WARN_EMPTY                                 1
+#define UTF8_WARN_CONTINUATION                  2
+#define UTF8_WARN_NON_CONTINUATION              3
+#define UTF8_WARN_FE_FF                                 4
+#define UTF8_WARN_SHORT                                 5
+#define UTF8_WARN_OVERFLOW                      6
+#define UTF8_WARN_SURROGATE                     7
+#define UTF8_WARN_BOM                           8
+#define UTF8_WARN_LONG                          9
+#define UTF8_WARN_FFFF                         10
+
+    if (curlen == 0 &&
+       !(flags & UTF8_ALLOW_EMPTY)) {
+       warning = UTF8_WARN_EMPTY;
        goto malformed;
     }
 
@@ -226,28 +243,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 
     if (UTF8_IS_CONTINUATION(uv) &&
        !(flags & UTF8_ALLOW_CONTINUATION)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")",
-                       uv);
+       warning = UTF8_WARN_CONTINUATION;
        goto malformed;
     }
 
     if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
        !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
-                       (UV)s[1], uv);
+       warning = UTF8_WARN_NON_CONTINUATION;
        goto malformed;
     }
     
     if ((uv == 0xfe || uv == 0xff) &&
        !(flags & UTF8_ALLOW_FE_FF)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (byte 0x%02"UVxf")",
-                       uv);
+       warning = UTF8_WARN_FE_FF;
        goto malformed;
     }
        
@@ -266,10 +274,7 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 
     if ((curlen < expectlen) &&
        !(flags & UTF8_ALLOW_SHORT)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (%d byte%s, need %d)",
-                       curlen, curlen == 1 ? "" : "s", expectlen);
+       warning = UTF8_WARN_SHORT;
        goto malformed;
     }
 
@@ -280,21 +285,25 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
     while (len--) {
        if (!UTF8_IS_CONTINUATION(*s) &&
            !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-           if (dowarn)
-               Perl_warner(aTHX_ WARN_UTF8,
-                           "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)",
-                           *s);
+           s--;
+           warning = UTF8_WARN_NON_CONTINUATION;
            goto malformed;
        }
        else
            uv = UTF8_ACCUMULATE(uv, *s);
-       if (uv < ouv) {
-           /* This cannot be allowed. */
-           if (dowarn)
-               Perl_warner(aTHX_ WARN_UTF8,
-                           "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
-                           ouv, *s);
-           goto malformed;
+       if (!(uv > ouv)) {
+           /* These cannot be allowed. */
+           if (uv == ouv) {
+               if (!(flags & UTF8_ALLOW_LONG)) {
+                   warning = UTF8_WARN_LONG;
+                   goto malformed;
+               }
+           }
+           else { /* uv < ouv */
+               /* This cannot be allowed. */
+               warning = UTF8_WARN_OVERFLOW;
+               goto malformed;
+           }
        }
        s++;
        ouv = uv;
@@ -302,31 +311,19 @@ Perl_utf8_to_uv(pTHX_ U8* s, STRLEN curlen, STRLEN* retlen, U32 flags)
 
     if (UNICODE_IS_SURROGATE(uv) &&
        !(flags & UTF8_ALLOW_SURROGATE)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
-                       uv);
+       warning = UTF8_WARN_SURROGATE;
        goto malformed;
     } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
               !(flags & UTF8_ALLOW_BOM)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
-                       uv);
+       warning = UTF8_WARN_BOM;
        goto malformed;
     } else if ((expectlen > UNISKIP(uv)) &&
               !(flags & UTF8_ALLOW_LONG)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (%d byte%s, need %d)",
-                       expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+       warning = UTF8_WARN_LONG;
        goto malformed;
     } else if (UNICODE_IS_ILLEGAL(uv) &&
               !(flags & UTF8_ALLOW_FFFF)) {
-       if (dowarn)
-           Perl_warner(aTHX_ WARN_UTF8,
-                       "Malformed UTF-8 character (character 0x%04"UVxf")",
-                       uv);
+       warning = UTF8_WARN_FFFF;
        goto malformed;
     }
 
@@ -340,6 +337,61 @@ malformed:
        return 0;
     }
 
+    if (dowarn) {
+       SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
+
+       switch (warning) {
+       case 0: /* Intentionally empty. */ break;
+       case UTF8_WARN_EMPTY:
+           Perl_sv_catpvf(aTHX_ sv, "(empty string)");
+           break;
+       case UTF8_WARN_CONTINUATION:
+           Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
+           break;
+       case UTF8_WARN_NON_CONTINUATION:
+           Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
+                           (UV)s[1], uv);
+           break;
+       case UTF8_WARN_FE_FF:
+           Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
+           break;
+       case UTF8_WARN_SHORT:
+           Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+                           curlen, curlen == 1 ? "" : "s", expectlen);
+           break;
+       case UTF8_WARN_OVERFLOW:
+           Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
+                           ouv, *s);
+           break;
+       case UTF8_WARN_SURROGATE:
+           Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
+           break;
+       case UTF8_WARN_BOM:
+           Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
+           break;
+       case UTF8_WARN_LONG:
+           Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+                          expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+           break;
+       case UTF8_WARN_FFFF:
+           Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
+           break;
+       default:
+           Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
+           break;
+       }
+       
+       if (warning) {
+           char *s = SvPVX(sv);
+
+           if (PL_op)
+               Perl_warner(aTHX_ WARN_UTF8,
+                           "%s in %s", s,  PL_op_desc[PL_op->op_type]);
+           else
+               Perl_warner(aTHX_ WARN_UTF8, "%s", s);
+       }
+    }
+
     if (retlen)
        *retlen = expectlen ? expectlen : len;
 
diff --git a/utf8.h b/utf8.h
index 8d46aa9..a43b945 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -33,16 +33,18 @@ END_EXTERN_C
 #define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
 
-#define UTF8_ALLOW_CONTINUATION                0x0001
-#define UTF8_ALLOW_NON_CONTINUATION    0x0002
-#define UTF8_ALLOW_FE_FF               0x0004
-#define UTF8_ALLOW_SHORT               0x0008
-#define UTF8_ALLOW_SURROGATE           0x0010
-#define UTF8_ALLOW_BOM                 0x0020
-#define UTF8_ALLOW_FFFF                        0x0040
-#define UTF8_ALLOW_LONG                        0x0080
-#define UTF8_ALLOW_ANYUV               (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
-                                       |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
+#define UTF8_ALLOW_EMPTY               0x0001
+#define UTF8_ALLOW_CONTINUATION                0x0002
+#define UTF8_ALLOW_NON_CONTINUATION    0x0004
+#define UTF8_ALLOW_FE_FF               0x0008
+#define UTF8_ALLOW_SHORT               0x0010
+#define UTF8_ALLOW_SURROGATE           0x0020
+#define UTF8_ALLOW_BOM                 0x0040
+#define UTF8_ALLOW_FFFF                        0x0080
+#define UTF8_ALLOW_LONG                        0x0100
+#define UTF8_ALLOW_ANYUV               (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
+                                        UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
+                                        UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
 #define UTF8_ALLOW_ANY                 0x00ff
 #define UTF8_CHECK_ONLY                        0x0100