#include <signal.h>
#endif
-#define HALF_UTF8_UPGRADE(start,end) \
- STMT_START { \
- if ((start)<(end)) { \
- U8* NeWsTr; \
- STRLEN LeN = (end) - (start); \
- NeWsTr = bytes_to_utf8(start, &LeN); \
- Safefree(start); \
- (start) = NeWsTr; \
- (end) = (start) + LeN; \
- } \
- } STMT_END
-
STATIC I32
S_do_trans_simple(pTHX_ SV *sv)
{
U8 *send;
U8 *dstart;
I32 matches = 0;
- I32 sutf = SvUTF8(sv);
STRLEN len;
short *tbl;
I32 ch;
send = s + len;
/* First, take care of non-UTF8 input strings, because they're easy */
- if (!sutf) {
+ if (!SvUTF8(sv)) {
while (s < send) {
if ((ch = tbl[*s]) >= 0) {
matches++;
c = utf8_to_uv(s, send - s, &ulen, 0);
if (c < 0x100 && (ch = tbl[(short)c]) >= 0) {
matches++;
- d = uv_to_utf8(d, ch);
+ if (ch < 0x80)
+ *d++ = ch;
+ else
+ d = uv_to_utf8(d,ch);
s += ulen;
}
else { /* No match -> copy */
}
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
U8 *s;
U8 *send;
I32 matches = 0;
- I32 hasutf = SvUTF8(sv);
STRLEN len;
short *tbl;
s = (U8*)SvPV(sv, len);
send = s + len;
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- UV c;
- STRLEN ulen;
- ulen = 1;
- if (hasutf)
- c = utf8_to_uv(s, send - s, &ulen, 0);
- else
- c = *s;
- if (c < 0x100 && tbl[c] >= 0)
+ if (!SvUTF8(sv))
+ while (s < send) {
+ if (tbl[*s++] >= 0)
matches++;
- s += ulen;
- }
- }
+ }
+ else
+ while (s < send) {
+ UV c;
+ STRLEN ulen;
+ c = utf8_to_uv(s, send - s, &ulen, 0);
+ if (c < 0x100 && tbl[c] >= 0)
+ matches++;
+ s += ulen;
+ }
return matches;
}
U8 *send;
U8 *d;
U8 *dstart;
- I32 hasutf = SvUTF8(sv);
+ I32 isutf8;
I32 matches = 0;
STRLEN len;
short *tbl;
Perl_croak(aTHX_ "panic: do_trans_complex");
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
send = s + len;
- Newz(0, d, len*2+1, U8);
- dstart = d;
-
- if (PL_op->op_private & OPpTRANS_SQUASH) {
- U8* p = send;
-
- while (s < send) {
- if (hasutf && *s & 0x80)
- s += UTF8SKIP(s);
- else {
- if ((ch = tbl[*s]) >= 0) {
+ if (!isutf8) {
+ dstart = d = s;
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
*d = ch;
matches++;
- if (p != d - 1 || *p != *d)
- p = d++;
- }
- else if (ch == -1) /* -1 is unmapped character */
- *d++ = *s; /* -2 is delete character */
- s++;
- }
+ if (p != d - 1 || *p != *d)
+ p = d++;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
}
+ else {
+ while (s < send) {
+ if ((ch = tbl[*s]) >= 0) {
+ matches++;
+ *d++ = ch;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ *d++ = *s;
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s++;
+ }
+ }
+ SvCUR_set(sv, d - dstart);
}
- else {
- while (s < send) {
- UV comp;
- if (hasutf && *s & 0x80)
- comp = utf8_to_uv_simple(s, NULL);
- else
- comp = *s;
-
- ch = tbl[comp];
-
- if (ch == -1) { /* -1 is unmapped character */
- ch = comp;
- matches--;
- }
-
- if (ch >= 0)
- d = uv_to_utf8(d, ch);
-
- matches++;
-
- s += hasutf && *s & 0x80 ? UNISKIP(*s) : 1;
-
+ else { /* isutf8 */
+ Newz(0, d, len*2+1, U8);
+ dstart = d;
+
+ if (PL_op->op_private & OPpTRANS_SQUASH) {
+ U8* p = send;
+ UV pch = 0xfeedface;
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ matches++;
+ if (ch != pch) {
+ d = uv_to_utf8(d, ch);
+ pch = ch;
+ }
+ s += len;
+ continue;
+ }
+ else if (ch == -1) /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ pch = 0xfeedface;
+ }
+ }
+ else {
+ while (s < send) {
+ STRLEN len;
+ UV comp = utf8_to_uv_simple(s, &len);
+ if (comp > 0xff)
+ d = uv_to_utf8(d, comp); /* always unmapped */
+ else if ((ch = tbl[comp]) >= 0) {
+ d = uv_to_utf8(d, ch);
+ matches++;
+ }
+ else if (ch == -1) { /* -1 is unmapped character */
+ d = uv_to_utf8(d, comp);
+ }
+ else if (ch == -2) /* -2 is delete character */
+ matches++;
+ s += len;
+ }
}
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
}
-
- *d = '\0';
-
- sv_setpvn(sv, (const char*)dstart, d - dstart);
- Safefree(dstart);
- if (hasutf)
- SvUTF8_on(sv);
SvSETMAGIC(sv);
return matches;
-
}
STATIC I32
U8 *send;
U8 *d;
U8 *start;
- U8 *dstart;
+ U8 *dstart, *dend;
I32 matches = 0;
STRLEN len;
UV extra = none + 1;
UV final;
UV uv;
- I32 isutf;
- I32 howmany;
+ I32 isutf8;
+ U8 hibit = 0;
- isutf = SvUTF8(sv);
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
start = s;
final = SvUV(*svp);
/* d needs to be bigger than s, in case e.g. upgrading is required */
- Newz(0, d, len*2+1, U8);
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
dstart = d;
+
while (s < send) {
if ((uv = swash_fetch(rv, s)) < none) {
s += UTF8SKIP(s);
matches++;
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, uv);
}
else if (uv == none) {
- int i;
- i = UTF8SKIP(s);
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
+ int i = UTF8SKIP(s);
while(i--)
*d++ = *s++;
}
else if (uv == extra) {
- int i;
- i = UTF8SKIP(s);
+ int i = UTF8SKIP(s);
s += i;
matches++;
- if (i > 1 && !isutf++)
- HALF_UTF8_UPGRADE(dstart,d);
d = uv_to_utf8(d, final);
}
else
s += UTF8SKIP(s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart;
+ STRLEN nlen = dend - dstart + len + UTF8_MAXLEN;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
+ }
}
*d = '\0';
- sv_setpvn(sv, (const char*)dstart, d - dstart);
+ sv_setpvn(sv, (char*)dstart, d - dstart);
SvSETMAGIC(sv);
- if (isutf)
- SvUTF8_on(sv);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
return matches;
}
{
dTHR;
U8 *s;
- U8 *send;
+ U8 *start, *send;
I32 matches = 0;
STRLEN len;
SV** svp = hv_fetch(hv, "NONE", 4, FALSE);
UV none = svp ? SvUV(*svp) : 0x7fffffff;
UV uv;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
- if (!SvUTF8(sv))
- s = bytes_to_utf8(s, &len);
+ if (!SvUTF8(sv)) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ start = s = bytes_to_utf8(s, &len);
+ }
send = s + len;
while (s < send) {
matches++;
s += UTF8SKIP(s);
}
+ if (hibit)
+ Safefree(start);
return matches;
}
{
dTHR;
U8 *s;
- U8 *send;
+ U8 *start, *send;
U8 *d;
I32 matches = 0;
I32 squash = PL_op->op_private & OPpTRANS_SQUASH;
UV final;
UV uv;
STRLEN len;
- U8 *dst;
- I32 isutf = SvUTF8(sv);
+ U8 *dstart, *dend;
+ I32 isutf8;
+ U8 hibit = 0;
s = (U8*)SvPV(sv, len);
+ isutf8 = SvUTF8(sv);
+ if (!isutf8) {
+ U8 *t = s, *e = s + len;
+ while (t < e)
+ if ((hibit = *t++ & 0x80))
+ break;
+ if (hibit)
+ s = bytes_to_utf8(s, &len);
+ }
send = s + len;
+ start = s;
svp = hv_fetch(hv, "FINAL", 5, FALSE);
if (svp)
final = SvUV(*svp);
- Newz(0, d, len*2+1, U8);
- dst = d;
+ New(0, d, len*3+UTF8_MAXLEN, U8);
+ dend = d + len * 3;
+ dstart = d;
if (squash) {
UV puv = 0xfeedface;
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
-
if (uv < none) {
matches++;
if (uv != puv) {
- if ((uv & 0x80) && !isutf++)
- HALF_UTF8_UPGRADE(dst,d);
d = uv_to_utf8(d, uv);
puv = uv;
}
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
puv = 0xfeedface;
continue;
}
}
else {
while (s < send) {
- if (SvUTF8(sv))
- uv = swash_fetch(rv, s);
- else {
- U8 tmpbuf[2];
- uv = *s++;
- if (uv < 0x80)
- tmpbuf[0] = uv;
- else {
- tmpbuf[0] = (( uv >> 6) | 0xc0);
- tmpbuf[1] = (( uv & 0x3f) | 0x80);
- }
- uv = swash_fetch(rv, tmpbuf);
+ uv = swash_fetch(rv, s);
+ if (d >= dend) {
+ STRLEN clen = d - dstart, nlen = dend - dstart + len;
+ Renew(dstart, nlen+UTF8_MAXLEN, U8);
+ d = dstart + clen;
+ dend = dstart + nlen;
}
if (uv < none) {
matches++;
continue;
}
else if (uv == none) { /* "none" is unmapped character */
- STRLEN ulen;
- *d++ = (U8)utf8_to_uv(s, send - s, &ulen, 0);
- s += ulen;
+ int i = UTF8SKIP(s);
+ while(i--)
+ *d++ = *s++;
continue;
}
else if (uv == extra && !del) {
s += UTF8SKIP(s);
}
}
- if (dst)
- sv_usepvn(sv, (char*)dst, d - dst);
- else {
- *d = '\0';
- SvCUR_set(sv, d - (U8*)SvPVX(sv));
- }
+ *d = '\0';
+ sv_setpvn(sv, (char*)dstart, d - dstart);
+ SvUTF8_on(sv);
+ if (hibit)
+ Safefree(start);
+ if (!isutf8 && !(PL_hints & HINT_UTF8))
+ sv_utf8_downgrade(sv, TRUE);
SvSETMAGIC(sv);
return matches;
char *send = s + len;
char *start = s;
s = send - 1;
- while ((*s & 0xc0) == 0x80)
- --s;
- if (UTF8SKIP(s) != send - s && ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- sv_setpvn(astr, s, send - s);
- *s = '\0';
- SvCUR_set(sv, s - start);
- SvNIOK_off(sv);
- SvUTF8_on(astr);
+ while (s > start && UTF8_IS_CONTINUATION(*s))
+ s--;
+ if (utf8_to_uv_simple((U8*)s, 0)) {
+ sv_setpvn(astr, s, send - s);
+ *s = '\0';
+ SvCUR_set(sv, s - start);
+ SvNIOK_off(sv);
+ SvUTF8_on(astr);
+ }
}
else
sv_setpvn(astr, "", 0);
{
STRLEN len;
char *tmps = SvPV(sv,len);
- sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+ if (DO_UTF8(sv)) {
+ sv_utf8_upgrade(LvTARG(sv));
+ sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+ SvUTF8_on(LvTARG(sv));
+ }
+ else
+ sv_insert(LvTARG(sv),LvTARGOFF(sv),LvTARGLEN(sv), tmps, len);
+
return 0;
}
XPUSHs(TARG);
RETURN;
}
+ else {
+ SvUTF8_off(TARG);
+ }
SvGROW(TARG,2);
SvCUR_set(TARG, 1);
U8* s = (U8*)SvPVX(TARG);
U8* send = (U8*)(s + len);
while (s < send) {
- if (*s < 0x80) {
+ if (UTF8_IS_ASCII(*s)) {
s++;
continue;
}
else {
+ if (!utf8_to_uv_simple(s, 0))
+ break;
up = (char*)s;
s += UTF8SKIP(s);
down = (char*)(s - 1);
- if (s > send || !((*down & 0xc0) == 0x80)) {
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character");
- break;
- }
+ /* reverse this character */
while (down > up) {
tmp = *up;
*up++ = *down;
AV *ary;
register IV limit = POPi; /* note, negative is forever */
SV *sv = POPs;
- bool do_utf8 = DO_UTF8(sv);
STRLEN len;
register char *s = SvPV(sv, len);
+ bool do_utf8 = DO_UTF8(sv);
char *strend = s + len;
register PMOP *pm;
register REGEXP *rx;
SV *csv = CALLREG_INTUIT_STRING(aTHX_ rx);
len = rx->minlen;
- if (len == 1 && !tail) {
+ if (len == 1 && !(rx->reganch & ROPT_UTF8) && !tail) {
STRLEN n_a;
char c = *SvPV(csv, n_a);
while (--limit) {
XPUSHs(dstr);
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
- s = m + (do_utf8 ? SvCUR(csv) : len);
+ if (do_utf8)
+ s = (char*)utf8_hop((U8*)m, len);
+ else
+ s = m + len; /* Fake \n at the end */
}
}
else {
XPUSHs(dstr);
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
- s = m + (do_utf8 ? SvCUR(csv) : len); /* Fake \n at the end */
+ if (do_utf8)
+ s = (char*)utf8_hop((U8*)m, len);
+ else
+ s = m + len; /* Fake \n at the end */
}
}
}
sv_setpvn(TARG,s,len);
if (SvUTF8(TOPs) && !IN_BYTE)
SvUTF8_on(TARG);
+ else
+ SvUTF8_off(TARG);
SETTARG;
RETURN;
}
djSP; dATARGET; tryAMAGICbin(concat,opASSIGN);
{
dPOPTOPssrl;
- STRLEN len;
- U8 *s;
- bool left_utf8;
- bool right_utf8;
+ SV* rcopy = Nullsv;
- if (TARG == right && SvGMAGICAL(right))
- mg_get(right);
if (SvGMAGICAL(left))
mg_get(left);
+ if (TARG == right && SvGMAGICAL(right))
+ mg_get(right);
- left_utf8 = DO_UTF8(left);
- right_utf8 = DO_UTF8(right);
-
- if (left_utf8 != right_utf8) {
- if (TARG == right && !right_utf8) {
- sv_utf8_upgrade(TARG); /* Now straight binary copy */
- SvUTF8_on(TARG);
- }
- else {
- /* Set TARG to PV(left), then add right */
- U8 *l, *c, *olds = NULL;
- STRLEN targlen;
- s = (U8*)SvPV(right,len);
- right_utf8 |= DO_UTF8(right);
- if (TARG == right) {
- /* Take a copy since we're about to overwrite TARG */
- olds = s = (U8*)savepvn((char*)s, len);
- }
- if (!SvOK(left) && SvTYPE(left) <= SVt_PVMG) {
- if (SvREADONLY(left))
- left = sv_2mortal(newSVsv(left));
- else
- sv_setpv(left, ""); /* Suppress warning. */
- }
- l = (U8*)SvPV(left, targlen);
- left_utf8 |= DO_UTF8(left);
- if (TARG != left)
- sv_setpvn(TARG, (char*)l, targlen);
- if (!left_utf8)
- sv_utf8_upgrade(TARG);
- /* Extend TARG to length of right (s) */
- targlen = SvCUR(TARG) + len;
- if (!right_utf8) {
- /* plus one for each hi-byte char if we have to upgrade */
- for (c = s; c < s + len; c++) {
- if (UTF8_IS_CONTINUED(*c))
- targlen++;
- }
- }
- SvGROW(TARG, targlen+1);
- /* And now copy, maybe upgrading right to UTF8 on the fly */
- if (right_utf8)
- Copy(s, SvEND(TARG), len, U8);
- else {
- for (c = (U8*)SvEND(TARG); len--; s++)
- c = uv_to_utf8(c, *s);
- }
- SvCUR_set(TARG, targlen);
- *SvEND(TARG) = '\0';
- SvUTF8_on(TARG);
- SETs(TARG);
- Safefree(olds);
- RETURN;
- }
- }
-
- if (TARG != left) {
- s = (U8*)SvPV(left,len);
- if (TARG == right) {
- sv_insert(TARG, 0, 0, (char*)s, len);
- SETs(TARG);
- RETURN;
+ if (TARG == right && left != right)
+ /* Clone since otherwise we cannot prepend. */
+ rcopy = sv_2mortal(newSVsv(right));
+
+ if (TARG != left)
+ sv_setsv(TARG, left);
+
+ if (TARG == right) {
+ if (left == right) {
+ /* $right = $right . $right; */
+ STRLEN rlen;
+ char *rpv = SvPV(right, rlen);
+
+ sv_catpvn(TARG, rpv, rlen);
}
- sv_setpvn(TARG, (char *)s, len);
+ else /* $right = $left . $right; */
+ sv_catsv(TARG, rcopy);
}
- else if (!SvOK(TARG) && SvTYPE(TARG) <= SVt_PVMG)
- sv_setpv(TARG, ""); /* Suppress warning. */
- s = (U8*)SvPV(right,len);
- if (SvOK(TARG)) {
+ else {
+ if (!SvOK(TARG)) /* Avoid warning when concatenating to undef. */
+ sv_setpv(TARG, "");
+ /* $other = $left . $right; */
+ /* $left = $left . $right; */
+ sv_catsv(TARG, right);
+ }
+
#if defined(PERL_Y2KWARN)
- if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
- STRLEN n;
- char *s = SvPV(TARG,n);
- if (n >= 2 && s[n-2] == '1' && s[n-1] == '9'
- && (n == 2 || !isDIGIT(s[n-3])))
- {
- Perl_warner(aTHX_ WARN_Y2K, "Possible Y2K bug: %s",
- "about to append an integer to '19'");
- }
+ if ((SvIOK(right) || SvNOK(right)) && ckWARN(WARN_Y2K)) {
+ STRLEN n;
+ char *s = SvPV(TARG,n);
+ if (n >= 2 && s[n-2] == '1' && s[n-1] == '9'
+ && (n == 2 || !isDIGIT(s[n-3])))
+ {
+ Perl_warner(aTHX_ WARN_Y2K, "Possible Y2K bug: %s",
+ "about to append an integer to '19'");
}
-#endif
- sv_catpvn(TARG, (char *)s, len);
}
- else
- sv_setpvn(TARG, (char *)s, len); /* suppress warning */
- if (left_utf8)
- SvUTF8_on(TARG);
+#endif
+
SETTARG;
RETURN;
}
if (update_minmatch++)
minmatch = had_zerolen;
}
- if (rx->reganch & RE_USE_INTUIT) {
+ if (rx->reganch & RE_USE_INTUIT &&
+ DO_UTF8(TARG) == ((rx->reganch & ROPT_UTF8) != 0)) {
s = CALLREG_INTUIT_START(aTHX_ rx, TARG, s, strend, r_flags, NULL);
if (!s)
TARG = DEFSV;
EXTEND(SP,1);
}
- PL_reg_sv = TARG;
- do_utf8 = DO_UTF8(PL_reg_sv);
+ do_utf8 = DO_UTF8(TARG);
if (SvFAKE(TARG) && SvREADONLY(TARG))
sv_force_normal(TARG);
if (SvREADONLY(TARG)
if (CALLREGEXEC(aTHX_ rx, s, strend, orig, 0, TARG, NULL,
r_flags | REXEC_CHECKED))
{
+ bool isutf8;
+
if (force_on_match) {
force_on_match = 0;
s = SvPV_force(TARG, len);
SvPVX(TARG) = SvPVX(dstr);
SvCUR_set(TARG, SvCUR(dstr));
SvLEN_set(TARG, SvLEN(dstr));
+ isutf8 = DO_UTF8(dstr);
SvPVX(dstr) = 0;
sv_free(dstr);
PUSHs(sv_2mortal(newSViv((I32)iters)));
(void)SvPOK_only(TARG);
+ if (isutf8)
+ SvUTF8_on(TARG);
TAINT_IF(rxtainted);
SvSETMAGIC(TARG);
SvTAINT(TARG);
s = (U8*)SvPV(sv, len);
if (len < *offsetp)
- Perl_croak(aTHX_ "panic: bad byte offset");
+ Perl_croak(aTHX_ "panic: sv_pos_b2u: bad byte offset");
send = s + *offsetp;
len = 0;
while (s < send) {
- s += UTF8SKIP(s);
- ++len;
- }
- if (s != send) {
- dTHR;
- if (ckWARN_d(WARN_UTF8))
- Perl_warner(aTHX_ WARN_UTF8, "Malformed UTF-8 character");
- --len;
+ STRLEN n;
+
+ if (utf8_to_uv(s, UTF8SKIP(s), &n, 0)) {
+ s += n;
+ len++;
+ }
+ else
+ break;
}
*offsetp = len;
return;
#!./perl
-print "1..30\n";
+print "1..33\n";
# optimized
$/ = \3;
print chomp() == 0 ? "ok 29\n" : "not ok 29\n";
print $_ eq "ab\n" ? "ok 30\n" : "not ok 30\n";
+
+# Go Unicode.
+
+$_ = "abc\x{1234}";
+chop;
+print $_ eq "abc" ? "ok 31\n" : "not ok 31\n";
+
+$_ = "abc\x{1234}d";
+chop;
+print $_ eq "abc\x{1234}" ? "ok 32\n" : "not ok 32\n";
+
+$_ = "\x{1234}\x{2345}";
+chop;
+print $_ eq "\x{1234}" ? "ok 33\n" : "not ok 33\n";
break; /* in regexp, $ might be tail anchor */
}
- /* (now in tr/// code again) */
-
- if (*s & 0x80 && (this_utf8 || has_utf8)) {
- STRLEN len = (STRLEN) -1;
- UV uv;
- if (this_utf8) {
- uv = utf8_to_uv((U8*)s, send - s, &len, UTF8_CHECK_ONLY);
- }
- if (len == (STRLEN)-1) {
- /* Illegal UTF8 (a high-bit byte), make it valid. */
- char *old_pvx = SvPVX(sv);
- /* need space for one extra char (NOTE: SvCUR() not set here) */
- d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
- d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
- }
- else {
- while (len--)
- *d++ = *s++;
- }
- has_utf8 = TRUE;
- continue;
- }
-
/* backslashes */
if (*s == '\\' && s+1 < send) {
+ bool to_be_utf8 = FALSE;
+
s++;
/* some backslashes we leave behind */
else {
STRLEN len = 1; /* allow underscores */
uv = (UV)scan_hex(s + 1, e - s - 1, &len);
- has_utf8 = TRUE;
+ to_be_utf8 = TRUE;
}
s = e + 1;
}
* There will always enough room in sv since such escapes will
* be longer than any utf8 sequence they can end up as
*/
- if (uv > 127 || has_utf8) {
- if (!this_utf8 && !has_utf8 && uv > 255) {
+ if (uv > 127) {
+ if (!has_utf8 && (to_be_utf8 || uv > 255)) {
/* might need to recode whatever we have accumulated so far
* if it contains any hibit chars
*/
}
}
- if (has_utf8 || uv > 255) {
+ if (to_be_utf8 || uv > 255) {
d = (char*)uv_to_utf8((U8*)d, uv);
has_utf8 = TRUE;
}
continue;
} /* end if (backslash) */
+ /* (now in tr/// code again) */
+
+ if (*s & 0x80 && (this_utf8 || has_utf8)) {
+ STRLEN len = (STRLEN) -1;
+ UV uv;
+ if (this_utf8) {
+ uv = utf8_to_uv((U8*)s, send - s, &len, 0);
+ }
+ if (len == (STRLEN)-1) {
+ /* Illegal UTF8 (a high-bit byte), make it valid. */
+ char *old_pvx = SvPVX(sv);
+ /* need space for one extra char (NOTE: SvCUR() not set here) */
+ d = SvGROW(sv, SvLEN(sv) + 1) + (d - old_pvx);
+ d = (char*)uv_to_utf8((U8*)d, (U8)*s++);
+ }
+ else {
+ while (len--)
+ *d++ = *s++;
+ }
+ has_utf8 = TRUE;
+ continue;
+ }
+
*d++ = *s++;
} /* while loop to process each character */
if (*d == '}') {
char minus = (PL_tokenbuf[0] == '-');
s = force_word(s + minus, WORD, FALSE, TRUE, FALSE);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, 0) &&
+ PL_nextval[PL_nexttoke-1].opval)
+ SvUTF8_on(((SVOP*)PL_nextval[PL_nexttoke-1].opval)->op_sv);
if (minus)
force_next('-');
}
CLINE;
yylval.opval = (OP*)newSVOP(OP_CONST, 0, newSVpv(PL_tokenbuf,0));
yylval.opval->op_private = OPpCONST_BARE;
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+ SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
TERM(WORD);
}
if (*s == '=' && s[1] == '>') {
CLINE;
sv_setpv(((SVOP*)yylval.opval)->op_sv, PL_tokenbuf);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)PL_tokenbuf, len))
+ SvUTF8_on(((SVOP*)yylval.opval)->op_sv);
TERM(WORD);
}
int warned = 0;
d = SvPV_force(PL_lex_stuff, len);
while (len) {
+ SV *sv;
for (; isSPACE(*d) && len; --len, ++d) ;
if (len) {
char *b = d;
else {
for (; !isSPACE(*d) && len; --len, ++d) ;
}
+ sv = newSVpvn(b, d-b);
+ if (DO_UTF8(PL_lex_stuff))
+ SvUTF8_on(sv);
words = append_elem(OP_LIST, words,
- newSVOP(OP_CONST, 0, tokeq(newSVpvn(b, d-b))));
+ newSVOP(OP_CONST, 0, tokeq(sv)));
}
}
if (words) {
squash = OPpTRANS_SQUASH;
s++;
}
- o->op_private = del|squash|complement;
+ o->op_private = del|squash|complement|
+ (DO_UTF8(PL_lex_stuff)? OPpTRANS_FROM_UTF : 0)|
+ (DO_UTF8(PL_lex_repl) ? OPpTRANS_TO_UTF : 0);
PL_lex_op = o;
yylval.ival = OP_TRANS;
Renew(SvPVX(tmpstr), SvLEN(tmpstr), char);
}
SvREFCNT_dec(herewas);
+ if (UTF && !IN_BYTE && is_utf8_string((U8*)SvPVX(tmpstr), SvCUR(tmpstr)))
+ SvUTF8_on(tmpstr);
PL_lex_stuff = tmpstr;
yylval.ival = op_type;
return s;
SvREADONLY_on(sv);
if (utf8) {
SvUTF8_on(sv);
- sv_utf8_downgrade(sv, TRUE);
+ if (!UTF||IN_BYTE)
+ sv_utf8_downgrade(sv, TRUE);
}
}
}
STRLEN slen, len;
UV uv, ouv;
- if (u <= 0x7f)
+ if (UTF8_IS_ASCII(u))
return 1;
- if (u >= 0x80 && u <= 0xbf)
+ if (!UTF8_IS_START(u))
return 0;
len = UTF8SKIP(s);
- if (len < 2 || (u >= 0xc0 && u <= 0xfd && s[1] < 0x80))
+ if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
return 0;
slen = len - 1;
uv = u;
ouv = uv;
while (slen--) {
- if ((*s & 0xc0) != 0x80)
+ if (!UTF8_IS_CONTINUATION(*s))
return 0;
uv = UTF8_ACCUMULATE(uv, *s);
if (uv < ouv)
Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
{
U8* x = s;
- U8* send = s + len;
+ U8* send;
STRLEN c;
+ if (!len)
+ len = strlen((char *)s);
+ send = s + len;
+
while (x < send) {
c = is_utf8_char(x);
if (!c)
return FALSE;
x += c;
- if (x > send)
- return FALSE;
}
+ if (x != send)
+ return FALSE;
return TRUE;
}
bool dowarn = ckWARN_d(WARN_UTF8);
#endif
STRLEN expectlen = 0;
-
- if (curlen == 0) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (an empty string)");
+ U32 warning = 0;
+
+/* This list is a superset of the UTF8_ALLOW_XXX. */
+
+#define UTF8_WARN_EMPTY 1
+#define UTF8_WARN_CONTINUATION 2
+#define UTF8_WARN_NON_CONTINUATION 3
+#define UTF8_WARN_FE_FF 4
+#define UTF8_WARN_SHORT 5
+#define UTF8_WARN_OVERFLOW 6
+#define UTF8_WARN_SURROGATE 7
+#define UTF8_WARN_BOM 8
+#define UTF8_WARN_LONG 9
+#define UTF8_WARN_FFFF 10
+
+ if (curlen == 0 &&
+ !(flags & UTF8_ALLOW_EMPTY)) {
+ warning = UTF8_WARN_EMPTY;
goto malformed;
}
if (UTF8_IS_CONTINUATION(uv) &&
!(flags & UTF8_ALLOW_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected continuation byte 0x%02"UVxf")",
- uv);
+ warning = UTF8_WARN_CONTINUATION;
goto malformed;
}
if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
- (UV)s[1], uv);
+ warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
if ((uv == 0xfe || uv == 0xff) &&
!(flags & UTF8_ALLOW_FE_FF)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (byte 0x%02"UVxf")",
- uv);
+ warning = UTF8_WARN_FE_FF;
goto malformed;
}
if ((curlen < expectlen) &&
!(flags & UTF8_ALLOW_SHORT)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (%d byte%s, need %d)",
- curlen, curlen == 1 ? "" : "s", expectlen);
+ warning = UTF8_WARN_SHORT;
goto malformed;
}
while (len--) {
if (!UTF8_IS_CONTINUATION(*s) &&
!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (unexpected non-continuation byte 0x%02x)",
- *s);
+ s--;
+ warning = UTF8_WARN_NON_CONTINUATION;
goto malformed;
}
else
uv = UTF8_ACCUMULATE(uv, *s);
- if (uv < ouv) {
- /* This cannot be allowed. */
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (overflow at 0x%"UVxf", byte 0x%02x)",
- ouv, *s);
- goto malformed;
+ if (!(uv > ouv)) {
+ /* These cannot be allowed. */
+ if (uv == ouv) {
+ if (!(flags & UTF8_ALLOW_LONG)) {
+ warning = UTF8_WARN_LONG;
+ goto malformed;
+ }
+ }
+ else { /* uv < ouv */
+ /* This cannot be allowed. */
+ warning = UTF8_WARN_OVERFLOW;
+ goto malformed;
+ }
}
s++;
ouv = uv;
if (UNICODE_IS_SURROGATE(uv) &&
!(flags & UTF8_ALLOW_SURROGATE)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (UTF-16 surrogate 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_SURROGATE;
goto malformed;
} else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
!(flags & UTF8_ALLOW_BOM)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (byte order mark 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_BOM;
goto malformed;
} else if ((expectlen > UNISKIP(uv)) &&
!(flags & UTF8_ALLOW_LONG)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (%d byte%s, need %d)",
- expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+ warning = UTF8_WARN_LONG;
goto malformed;
} else if (UNICODE_IS_ILLEGAL(uv) &&
!(flags & UTF8_ALLOW_FFFF)) {
- if (dowarn)
- Perl_warner(aTHX_ WARN_UTF8,
- "Malformed UTF-8 character (character 0x%04"UVxf")",
- uv);
+ warning = UTF8_WARN_FFFF;
goto malformed;
}
return 0;
}
+ if (dowarn) {
+ SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
+
+ switch (warning) {
+ case 0: /* Intentionally empty. */ break;
+ case UTF8_WARN_EMPTY:
+ Perl_sv_catpvf(aTHX_ sv, "(empty string)");
+ break;
+ case UTF8_WARN_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
+ break;
+ case UTF8_WARN_NON_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
+ (UV)s[1], uv);
+ break;
+ case UTF8_WARN_FE_FF:
+ Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
+ break;
+ case UTF8_WARN_SHORT:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+ curlen, curlen == 1 ? "" : "s", expectlen);
+ break;
+ case UTF8_WARN_OVERFLOW:
+ Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
+ ouv, *s);
+ break;
+ case UTF8_WARN_SURROGATE:
+ Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
+ break;
+ case UTF8_WARN_BOM:
+ Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
+ break;
+ case UTF8_WARN_LONG:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
+ expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
+ break;
+ case UTF8_WARN_FFFF:
+ Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
+ break;
+ default:
+ Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
+ break;
+ }
+
+ if (warning) {
+ char *s = SvPVX(sv);
+
+ if (PL_op)
+ Perl_warner(aTHX_ WARN_UTF8,
+ "%s in %s", s, PL_op_desc[PL_op->op_type]);
+ else
+ Perl_warner(aTHX_ WARN_UTF8, "%s", s);
+ }
+ }
+
if (retlen)
*retlen = expectlen ? expectlen : len;
#define IN_BYTE (PL_curcop->op_private & HINT_BYTE)
#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTE)
-#define UTF8_ALLOW_CONTINUATION 0x0001
-#define UTF8_ALLOW_NON_CONTINUATION 0x0002
-#define UTF8_ALLOW_FE_FF 0x0004
-#define UTF8_ALLOW_SHORT 0x0008
-#define UTF8_ALLOW_SURROGATE 0x0010
-#define UTF8_ALLOW_BOM 0x0020
-#define UTF8_ALLOW_FFFF 0x0040
-#define UTF8_ALLOW_LONG 0x0080
-#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_FE_FF|UTF8_ALLOW_FFFF \
- |UTF8_ALLOW_BOM|UTF8_ALLOW_SURROGATE)
+#define UTF8_ALLOW_EMPTY 0x0001
+#define UTF8_ALLOW_CONTINUATION 0x0002
+#define UTF8_ALLOW_NON_CONTINUATION 0x0004
+#define UTF8_ALLOW_FE_FF 0x0008
+#define UTF8_ALLOW_SHORT 0x0010
+#define UTF8_ALLOW_SURROGATE 0x0020
+#define UTF8_ALLOW_BOM 0x0040
+#define UTF8_ALLOW_FFFF 0x0080
+#define UTF8_ALLOW_LONG 0x0100
+#define UTF8_ALLOW_ANYUV (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
+ UTF8_ALLOW_SURROGATE|UTF8_ALLOW_BOM|\
+ UTF8_ALLOW_FFFF|UTF8_ALLOW_LONG)
#define UTF8_ALLOW_ANY 0x00ff
#define UTF8_CHECK_ONLY 0x0100