when the source isn't utf8, as for
example when it is entirely composed
of hex constants */
+ STRLEN utf8_variant_count = 0; /* When not in UTF-8, this counts the
+ number of characters found so far
+ that will expand (into 2 bytes)
+ should we have to convert to
+ UTF-8 */
SV *res; /* result from charnames */
STRLEN offset_to_max; /* The offset in the output to where the range
high-end character is temporarily placed */
* Ranges entirely within Latin1 are expanded out entirely, in
* order to avoid the significant overhead of making a swash.
* Ranges that extend above Latin1 have to have a swash, so there
- * is no advantage to abbreviating them here, so they are stored
+ * is no advantage to expanding them here, so they are stored
* here as Min, ILLEGAL_UTF8_BYTE, Max. The illegal byte signifies
* a hyphen without any possible ambiguity. On EBCDIC machines, if
* the range is expressed as Unicode, the Latin1 portion is
* Unicode value (\N{...}), or if the range is a subset of
* [A-Z] or [a-z], and both ends are literal characters,
* like 'A', and not like \x{C1} */
- if ((convert_unicode
- = cBOOL(backslash_N) /* \N{} forces Unicode, hence
+ convert_unicode =
+ cBOOL(backslash_N) /* \N{} forces Unicode, hence
portable range */
|| ( ! non_portable_endpoint
&& (( isLOWER_A(range_min) && isLOWER_A(range_max))
- || (isUPPER_A(range_min) && isUPPER_A(range_max))))
- )) {
+ || (isUPPER_A(range_min) && isUPPER_A(range_max))));
+ if (convert_unicode) {
/* Special handling is needed for these portable ranges.
* They are defined to all be in Unicode terms, which
if (!esc)
in_charclass = TRUE;
}
-
- else if (*s == ']' && PL_lex_inpat && in_charclass) {
+ else if (*s == ']' && PL_lex_inpat && in_charclass) {
char *s1 = s-1;
int esc = 0;
while (s1 >= start && *s1-- == '\\')
}
else {
if (!has_utf8 && uv > 255) {
- /* Might need to recode whatever we have accumulated so
- * far if it contains any chars variant in utf8 or
- * utf-ebcdic. */
- SvCUR_set(sv, d - SvPVX_const(sv));
- SvPOK_on(sv);
- *d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(
- sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
- /* Above-latin1 in string
- * implies no encoding */
- |SV_UTF8_NO_ENCODING,
+ /* Here, 'uv' won't fit unless we convert to UTF-8.
+ * If we've only seen invariants so far, all we have to
+ * do is turn on the flag */
+ if (utf8_variant_count == 0) {
+ SvUTF8_on(sv);
+ }
+ else {
+ SvCUR_set(sv, d - SvPVX_const(sv));
+ SvPOK_on(sv);
+ *d = '\0';
+
+ sv_utf8_upgrade_flags_grow(
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+
/* Since we're having to grow here,
* make sure we have enough room for
* this escape and a NUL, so the
* code immediately below won't have
* to actually grow again */
- UVCHR_SKIP(uv) + (STRLEN)(send - s) + 1);
- d = SvPVX(sv) + SvCUR(sv);
- has_utf8 = TRUE;
+ UVCHR_SKIP(uv)
+ + (STRLEN)(send - s) + 1);
+ d = SvPVX(sv) + SvCUR(sv);
+ }
+
+ has_utf8 = TRUE;
}
- if (has_utf8) {
+ if (! has_utf8) {
+ *d++ = (char)uv;
+ utf8_variant_count++;
+ }
+ else {
/* Usually, there will already be enough room in 'sv'
* since such escapes are likely longer than any UTF-8
* sequence they can end up as. This isn't the case on
(PL_lex_repl ? OPpTRANS_FROM_UTF
: OPpTRANS_TO_UTF);
}
- }
- else {
- *d++ = (char)uv;
}
}
#ifdef EBCDIC
if (! has_utf8 && ( uv > 0xFF
|| PL_lex_inwhat != OP_TRANS))
{
+ /* See Note on sizing above. */
+ const STRLEN extra = OFFUNISKIP(uv) + (send - e) + 1;
+
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(
- sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- OFFUNISKIP(uv) + (STRLEN)(send - e) + 1);
- d = SvPVX(sv) + SvCUR(sv);
+
+ if (utf8_variant_count == 0) {
+ SvUTF8_on(sv);
+ d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+ }
+ else {
+ sv_utf8_upgrade_flags_grow(
+ sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ extra);
+ d = SvPVX(sv) + SvCUR(sv);
+ }
+
has_utf8 = TRUE;
}
* \N{} implies Unicode semantics, and scalars have
* to be in utf8 to guarantee those semantics; but
* not needed in tr/// */
- sv_utf8_upgrade_flags(res, SV_UTF8_NO_ENCODING);
+ sv_utf8_upgrade_flags(res, 0);
str = SvPV_const(res, len);
}
/* Upgrade destination to be utf8 if this new
* component is */
if (! has_utf8 && SvUTF8(res)) {
+ /* See Note on sizing above. */
+ const STRLEN extra = len + (send - s) + 1;
+
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(sv,
+
+ if (utf8_variant_count == 0) {
+ SvUTF8_on(sv);
+ d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + extra);
+ }
+ else {
+ sv_utf8_upgrade_flags_grow(sv,
SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- len + (STRLEN)(send - s) + 1);
- d = SvPVX(sv) + SvCUR(sv);
+ extra);
+ d = SvPVX(sv) + SvCUR(sv);
+ }
has_utf8 = TRUE;
} else if (len > (STRLEN)(e - s + 4)) { /* I _guess_ 4 is \N{} --jhi */
* to/from UTF-8.
*
* If the input has the same representation in UTF-8 as not, it will be
- * a single byte, and we don't care about UTF8ness; or if neither
- * source nor output is UTF-8, just copy the byte */
- if (NATIVE_BYTE_IS_INVARIANT((U8)(*s)) || (! this_utf8 && ! has_utf8))
- {
+ * a single byte, and we don't care about UTF8ness; just copy the byte */
+ if (NATIVE_BYTE_IS_INVARIANT((U8)(*s))) {
*d++ = *s++;
}
+ else if (! this_utf8 && ! has_utf8) {
+ /* If neither source nor output is UTF-8, the character is also a
+ * single byte, so just copy it; but count it, because it is one that
+ * would expand should we later have to convert to UTF-8 */
+ *d++ = *s++;
+ utf8_variant_count++;
+ }
else if (this_utf8 && has_utf8) { /* Both UTF-8, can just copy */
const STRLEN len = UTF8SKIP(s);
const UV nextuv = (this_utf8)
? utf8n_to_uvchr((U8*)s, send - s, &len, 0)
: (UV) ((U8) *s);
- const STRLEN need = UVCHR_SKIP(nextuv);
+ STRLEN need = UVCHR_SKIP(nextuv);
+
if (!has_utf8) {
SvCUR_set(sv, d - SvPVX_const(sv));
SvPOK_on(sv);
*d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- need + (STRLEN)(send - s) + 1);
- d = SvPVX(sv) + SvCUR(sv);
+
+ /* See Note on sizing above. */
+ need += (STRLEN)(send - s) + 1;
+
+ if (utf8_variant_count == 0) {
+ SvUTF8_on(sv);
+ d = SvCUR(sv) + SvGROW(sv, SvCUR(sv) + need);
+ }
+ else {
+ sv_utf8_upgrade_flags_grow(sv,
+ SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
+ need);
+ d = SvPVX(sv) + SvCUR(sv);
+ }
has_utf8 = TRUE;
} else if (need > len) {
/* encoded value larger than old, may need extra space (NOTE: