This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate changes #9259,9260 from maintperl into mainline.
[perl5.git] / doop.c
diff --git a/doop.c b/doop.c
index f323069..823c88d 100644 (file)
--- a/doop.c
+++ b/doop.c
@@ -66,10 +66,10 @@ S_do_trans_simple(pTHX_ SV *sv)
         UV c;
 
         /* Need to check this, otherwise 128..255 won't match */
-       c = utf8_to_uv(s, send - s, &ulen, 0);
+       c = utf8n_to_uvchr(s, send - s, &ulen, 0);
         if (c < 0x100 && (ch = tbl[c]) >= 0) {
             matches++;
-           d = uv_to_utf8(d, ch);
+           d = uvchr_to_utf8(d, ch);
             s += ulen;
         }
        else { /* No match -> copy */
@@ -99,6 +99,7 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
     I32 matches = 0;
     STRLEN len;
     short *tbl;
+    I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
 
     tbl = (short*)cPVOP->op_pv;
     if (!tbl)
@@ -116,8 +117,11 @@ S_do_trans_count(pTHX_ SV *sv)/* SPC - OK */
        while (s < send) {
            UV c;
            STRLEN ulen;
-           c = utf8_to_uv(s, send - s, &ulen, 0);
-           if (c < 0x100 && tbl[c] >= 0)
+           c = utf8n_to_uvchr(s, send - s, &ulen, 0);
+           if (c < 0x100) {
+               if (tbl[c] >= 0)
+                   matches++;
+           } else if (complement)
                matches++;
            s += ulen;
        }
@@ -135,7 +139,9 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
     I32 isutf8;
     I32 matches = 0;
     I32 grows = PL_op->op_private & OPpTRANS_GROWS;
-    STRLEN len;
+    I32 complement = PL_op->op_private & OPpTRANS_COMPLEMENT;
+    I32 del = PL_op->op_private & OPpTRANS_DELETE;
+    STRLEN len, rlen;
     short *tbl;
     I32 ch;
 
@@ -186,6 +192,8 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        else
            d = s;
        dstart = d;
+       if (complement && !del)
+           rlen = tbl[0x100];
 
 #ifdef MACOS_TRADITIONAL
 #define comp CoMP   /* "comp" is a keyword in some compilers ... */
@@ -195,16 +203,32 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
            UV pch = 0xfeedface;
            while (s < send) {
                STRLEN len;
-               UV comp = utf8_to_uv_simple(s, &len);
+               UV comp = utf8_to_uvchr(s, &len);
 
-               if (comp > 0xff) {      /* always unmapped */   
-                   Copy(s, d, len, U8);
-                   d += len;
+               if (comp > 0xff) {
+                   if (!complement) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                   }
+                   else {
+                       matches++;
+                       if (!del) {
+                           ch = (rlen == 0) ? comp :
+                               (comp - 0x100 < rlen) ?
+                               tbl[comp+1] : tbl[0x100+rlen];
+                           if (ch != pch) {
+                               d = uvchr_to_utf8(d, ch);
+                               pch = ch;
+                           }
+                           s += len;
+                           continue;
+                       }
+                   }
                }
                else if ((ch = tbl[comp]) >= 0) {
                    matches++;
                    if (ch != pch) {
-                       d = uv_to_utf8(d, ch);
+                       d = uvchr_to_utf8(d, ch);
                        pch = ch;
                    }
                    s += len;
@@ -223,13 +247,24 @@ S_do_trans_complex(pTHX_ SV *sv)/* SPC - NOT OK */
        else {
            while (s < send) {
                STRLEN len;
-               UV comp = utf8_to_uv_simple(s, &len);
-               if (comp > 0xff) {      /* always unmapped */
-                   Copy(s, d, len, U8);
-                   d += len;
+               UV comp = utf8_to_uvchr(s, &len);
+               if (comp > 0xff) {
+                   if (!complement) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                   }
+                   else {
+                       matches++;
+                       if (!del) {
+                           if (comp - 0x100 < rlen)
+                               d = uvchr_to_utf8(d, tbl[comp+1]);
+                           else
+                               d = uvchr_to_utf8(d, tbl[0x100+rlen]);
+                       }
+                   }
                }
                else if ((ch = tbl[comp]) >= 0) {
-                   d = uv_to_utf8(d, ch);
+                   d = uvchr_to_utf8(d, ch);
                    matches++;
                }
                else if (ch == -1) {    /* -1 is unmapped character */
@@ -281,9 +316,11 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     isutf8 = SvUTF8(sv);
     if (!isutf8) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = UTF8_IS_CONTINUED(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            s = bytes_to_utf8(s, &len);
     }
@@ -309,7 +346,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
        if ((uv = swash_fetch(rv, s)) < none) {
            s += UTF8SKIP(s);
            matches++;
-           d = uv_to_utf8(d, uv);
+           d = uvchr_to_utf8(d, uv);
        }
        else if (uv == none) {
            int i = UTF8SKIP(s);
@@ -321,7 +358,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
            int i = UTF8SKIP(s);
            s += i;
            matches++;
-           d = uv_to_utf8(d, final);
+           d = uvchr_to_utf8(d, final);
        }
        else
            s += UTF8SKIP(s);
@@ -348,6 +385,7 @@ S_do_trans_simple_utf8(pTHX_ SV *sv)/* SPC - OK */
     }
     SvSETMAGIC(sv);
     SvUTF8_on(sv);
+    /* Downgrading just 'cos it will is suspect - NI-S */
     if (!isutf8 && !(PL_hints & HINT_UTF8))
        sv_utf8_downgrade(sv, TRUE);
 
@@ -366,22 +404,25 @@ S_do_trans_count_utf8(pTHX_ SV *sv)/* SPC - OK */
     HV* hv = (HV*)SvRV(rv);
     SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
+    UV extra = none + 1;
     UV uv;
     U8 hibit = 0;
 
     s = (U8*)SvPV(sv, len);
     if (!SvUTF8(sv)) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = !UTF8_IS_ASCII(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            start = s = bytes_to_utf8(s, &len);
     }
     send = s + len;
 
     while (s < send) {
-       if ((uv = swash_fetch(rv, s)) < none)
+       if ((uv = swash_fetch(rv, s)) < none || uv == extra)
            matches++;
        s += UTF8SKIP(s);
     }
@@ -407,6 +448,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     UV none = svp ? SvUV(*svp) : 0x7fffffff;
     UV extra = none + 1;
     UV final;
+    bool havefinal = FALSE;
     UV uv;
     STRLEN len;
     U8 *dstart, *dend;
@@ -417,9 +459,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     isutf8 = SvUTF8(sv);
     if (!isutf8) {
        U8 *t = s, *e = s + len;
-       while (t < e)
-           if ((hibit = !UTF8_IS_ASCII(*t++)))
+       while (t < e) {
+           U8 ch = *t++;
+           if ((hibit = !NATIVE_IS_INVARIANT(ch)))
                break;
+       }
        if (hibit)
            s = bytes_to_utf8(s, &len);
     }
@@ -427,8 +471,10 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
     start = s;
 
     svp = hv_fetch(hv, "FINAL", 5, FALSE);
-    if (svp)
+    if (svp) {
        final = SvUV(*svp);
+       havefinal = TRUE;
+    }
 
     if (grows) {
        /* d needs to be bigger than s, in case e.g. upgrading is required */
@@ -445,7 +491,7 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
        UV puv = 0xfeedface;
        while (s < send) {
            uv = swash_fetch(rv, s);
-           
+       
            if (d > dend) {
                STRLEN clen = d - dstart;
                STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
@@ -457,11 +503,11 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            if (uv < none) {
                matches++;
+               s += UTF8SKIP(s);
                if (uv != puv) {
-                   d = uv_to_utf8(d, uv);
+                   d = uvchr_to_utf8(d, uv);
                    puv = uv;
                }
-               s += UTF8SKIP(s);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
@@ -474,11 +520,23 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == extra && !del) {
                matches++;
-               if (uv != puv) {
-                   d = uv_to_utf8(d, final);
-                   puv = final;
+               if (havefinal) {
+                   s += UTF8SKIP(s);
+                   if (puv != final) {
+                       d = uvchr_to_utf8(d, final);
+                       puv = final;
+                   }
+               }
+               else {
+                   STRLEN len;
+                   uv = utf8_to_uvchr(s, &len);
+                   if (uv != puv) {
+                       Copy(s, d, len, U8);
+                       d += len;
+                       puv = uv;
+                   }
+                   s += len;
                }
-               s += UTF8SKIP(s);
                continue;
            }
            matches++;                  /* "none+1" is delete character */
@@ -499,8 +557,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            if (uv < none) {
                matches++;
-               d = uv_to_utf8(d, uv);
                s += UTF8SKIP(s);
+               d = uvchr_to_utf8(d, uv);
                continue;
            }
            else if (uv == none) {      /* "none" is unmapped character */
@@ -512,8 +570,8 @@ S_do_trans_complex_utf8(pTHX_ SV *sv) /* SPC - NOT OK */
            }
            else if (uv == extra && !del) {
                matches++;
-               d = uv_to_utf8(d, final);
                s += UTF8SKIP(s);
+               d = uvchr_to_utf8(d, final);
                continue;
            }
            matches++;                  /* "none+1" is delete character */
@@ -566,6 +624,7 @@ Perl_do_trans(pTHX_ SV *sv)
            return do_trans_simple(sv);
 
     case OPpTRANS_IDENTICAL:
+    case OPpTRANS_IDENTICAL|OPpTRANS_COMPLEMENT:
        if (hasutf)
            return do_trans_count_utf8(sv);
        else
@@ -900,7 +959,7 @@ Perl_do_chop(pTHX_ register SV *astr, register SV *sv)
            s = send - 1;
            while (s > start && UTF8_IS_CONTINUATION(*s))
                s--;
-           if (utf8_to_uv_simple((U8*)s, 0)) {
+           if (utf8_to_uvchr((U8*)s, 0)) {
                sv_setpvn(astr, s, send - s);
                *s = '\0';
                SvCUR_set(sv, s - start);
@@ -1066,14 +1125,14 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
        switch (optype) {
        case OP_BIT_AND:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc & ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
            if (sv == left || sv == right)
                (void)sv_usepvn(sv, dcsave, needlen);
@@ -1081,26 +1140,26 @@ Perl_do_vop(pTHX_ I32 optype, SV *sv, SV *left, SV *right)
            break;
        case OP_BIT_XOR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc ^ ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
            goto mop_up_utf;
        case OP_BIT_OR:
            while (lulen && rulen) {
-               luc = utf8_to_uv((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
+               luc = utf8n_to_uvchr((U8*)lc, lulen, &ulen, UTF8_ALLOW_ANYUV);
                lc += ulen;
                lulen -= ulen;
-               ruc = utf8_to_uv((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
+               ruc = utf8n_to_uvchr((U8*)rc, rulen, &ulen, UTF8_ALLOW_ANYUV);
                rc += ulen;
                rulen -= ulen;
                duc = luc | ruc;
-               dc = (char*)uv_to_utf8((U8*)dc, duc);
+               dc = (char*)uvchr_to_utf8((U8*)dc, duc);
            }
          mop_up_utf:
            if (sv == left || sv == right)
@@ -1195,7 +1254,7 @@ finish:
 OP *
 Perl_do_kv(pTHX)
 {
-    djSP;
+    dSP;
     HV *hv = (HV*)POPs;
     HV *keys;
     register HE *entry;