This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate changes #9259,9260 from maintperl into mainline.
[perl5.git] / doop.c
diff --git a/doop.c b/doop.c
index 7e2b52f..823c88d 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -66,10 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv)
         UV c;
 
         /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv(s, send - s, &ulen, 0);
+       c = utf8n_to_uvchr(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[c]) >= 0) {
             matches++;
-           d = uv_to_utf8(d, ch);
+           d = uvchr_to_utf8(d, ch);
             s += ulen;
         }
        else { /* No match -> copy */
@@ -117,7 +117,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
        while (s < send) {
            UV c;
            STRLEN ulen;
-           c = utf8_to_uv(s, send - s, &ulen, 0);
+           c = utf8n_to_uvchr(s, send - s, &ulen, 0);
            if (c < 0x100) {
                if (tbl[c] >= 0)
                    matches++;
@@ -203,7 +203,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
            UV pch = 0xfeedface;
            while (s < send) {
                STRLEN len;
-               UV comp = utf8_to_uv_simple(s, &len);
+               UV comp = utf8_to_uvchr(s, &len);
 
                if (comp > 0xff) {
                    if (!complement) {
@@ -213,10 +213,11 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
                    else {
                        matches++;
                        if (!del) {
-                           ch = (comp - 0x100 < rlen) ?
+                           ch = (rlen == 0) ? comp :
+                               (comp - 0x100 < rlen) ?
                                tbl[comp+1] : tbl[0x100+rlen];
                            if (ch != pch) {
-                               d = uv_to_utf8(d, ch);
+                               d = uvchr_to_utf8(d, ch);
                                pch = ch;
                            }
                            s += len;
@@ -227,7 +228,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
                else if ((ch = tbl[comp]) >= 0) {
                    matches++;
                    if (ch != pch) {
-                       d = uv_to_utf8(d, ch);
+                       d = uvchr_to_utf8(d, ch);
                        pch = ch;
                    }
                    s += len;
@@ -246,7 +247,7 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        else {
            while (s < send) {
                STRLEN len;
-               UV comp = utf8_to_uv_simple(s, &len);
+               UV comp = utf8_to_uvchr(s, &len);
                if (comp > 0xff) {
                    if (!complement) {
                        Copy(s, d, len, U8);
@@ -255,15 +256,15 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
                    else {
                        matches++;
                        if (!del) {
-                           if (comp - 0x100 < rlen) 
-                               d = uv_to_utf8(d, tbl[comp+1]);
+                           if (comp - 0x100 < rlen)
+                               d = uvchr_to_utf8(d, tbl[comp+1]);
                            else
-                               d = uv_to_utf8(d, tbl[0x100+rlen]);
+                               d = uvchr_to_utf8(d, tbl[0x100+rlen]);
                        }
                    }
                }
                else if ((ch = tbl[comp]) >= 0) {
-                   d = uv_to_utf8(d, ch);
+                   d = uvchr_to_utf8(d, ch);
                    matches++;
                }
                else if (ch == -1) {    /* -1 is unmapped character */
@@ -315,9 +316,11 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     isutf8 = SvUTF8(sv);
     if (!isutf8) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = UTF8_IS_CONTINUED(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            s = bytes_to_utf8(s, &len);
     }
@@ -343,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
        if ((uv = swash_fetch(rv, s)) < none) {
            s += UTF8SKIP(s);
            matches++;
-           d = uv_to_utf8(d, uv);
+           d = uvchr_to_utf8(d, uv);
        }
        else if (uv == none) {
            int i = UTF8SKIP(s);
@@ -355,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
            int i = UTF8SKIP(s);
            s += i;
            matches++;
-           d = uv_to_utf8(d, final);
+           d = uvchr_to_utf8(d, final);
        }
        else
            s += UTF8SKIP(s);
@@ -382,6 +385,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     }
     SvSETMAGIC(sv);
     SvUTF8_on(sv);
+    /* Downgrading just 'cos it will is suspect - NI-S */
     if (!isutf8 && !(PL_hints & HINT_UTF8))
        sv_utf8_downgrade(sv, TRUE);
 
@@ -400,22 +404,25 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
     HV* hv = (HV*)SvRV(rv);
     SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
     UV uv;
     U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
     if (!SvUTF8(sv)) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = !UTF8_IS_ASCII(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            start = s = bytes_to_utf8(s, &len);
     }
     send = s + len;
 
     while (s < send) {
-       if ((uv = swash_fetch(rv, s)) < none)
+       if ((uv = swash_fetch(rv, s)) < none || uv == extra)
            matches++;
        s += UTF8SKIP(s);
     }
@@ -441,6 +448,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
     UV extra = none + 1;
     UV final;
+    bool havefinal = FALSE;
     UV uv;
     STRLEN len;
     U8 *dstart, *dend;
@@ -451,9 +459,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     isutf8 = SvUTF8(sv);
     if (!isutf8) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = !UTF8_IS_ASCII(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            s = bytes_to_utf8(s, &len);
     }
@@ -461,8 +471,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     start = s;
 
     svp = hv_fetch(hv, "FINAL", 5, FALSE);
-    if (svp)
+    if (svp) {
        final = SvUV(*svp);
+       havefinal = TRUE;
+    }
 
     if (grows) {
        /* d needs to be bigger than s, in case e.g. upgrading is required */
@@ -479,7 +491,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
        UV puv = 0xfeedface;
        while (s < send) {
            uv = swash_fetch(rv, s);
-           
+       
            if (d > dend) {
                STRLEN clen = d - dstart;
                STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
@@ -491,11 +503,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            if (uv < none) {
                matches++;
+               s += UTF8SKIP(s);
                if (uv != puv) {
-                   d = uv_to_utf8(d, uv);
+                   d = uvchr_to_utf8(d, uv);
                    puv = uv;
                }
-               s += UTF8SKIP(s);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
@@ -508,11 +520,23 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == extra && !del) {
                matches++;
-               if (uv != puv) {
-                   d = uv_to_utf8(d, final);
-                   puv = final;
+               if (havefinal) {
+                   s += UTF8SKIP(s);
+                   if (puv != final) {
+                       d = uvchr_to_utf8(d, final);
+                       puv = final;
+                   }
+               }
+               else {
+                   STRLEN len;
+                   uv = utf8_to_uvchr(s, &len);
+                   if (uv != puv) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                       puv = uv;
+                   }
+                   s += len;
                }
-               s += UTF8SKIP(s);
                continue;
            }
            matches++;                  /* "none+1" is delete character */
@@ -534,7 +558,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            if (uv < none) {
                matches++;
                s += UTF8SKIP(s);
-               d = uv_to_utf8(d, uv);
+               d = uvchr_to_utf8(d, uv);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
@@ -546,8 +570,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == extra && !del) {
                matches++;
-               d = uv_to_utf8(d, final);
                s += UTF8SKIP(s);
+               d = uvchr_to_utf8(d, final);
                continue;
            }
            matches++;                  /* "none+1" is delete character */
@@ -600,6 +624,7 @@ Perl_do_trans(pTHX_ SV *sv)
            return do_trans_simple(sv);
 
     case OPpTRANS_IDENTICAL:
+    case OPpTRANS_IDENTICAL|OPpTRANS_COMPLEMENT:
        if (hasutf)
            return do_trans_count_utf8(sv);
        else
@@ -934,7 +959,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
            s = send - 1;
            while (s > start && UTF8_IS_CONTINUATION(*s))
                s--;
-           if (utf8_to_uv_simple((U8*)s, 0)) {
+           if (utf8_to_uvchr((U8*)s, 0)) {
                sv_setpvn(astr, s, send - s);
                *s = '\0';
                SvCUR_set(sv, s - start);
@@ -1100,14 +1125,14 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
            if (sv == left || sv == right)
                (void)sv_usepvn(sv, dcsave, needlen);
@@ -1115,26 +1140,26 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
          mop_up_utf:
            if (sv == left || sv == right)