if (flags & LEX_STUFF_UTF8) {
goto plain_copy;
} else {
- STRLEN highhalf = 0;
+ STRLEN highhalf = 0; /* Count of variants */
const char *p, *e = pv+len;
- for (p = pv; p != e; p++)
- highhalf += !!(((U8)*p) & 0x80);
+ for (p = pv; p != e; p++) {
+ if (! UTF8_IS_INVARIANT(*p)) {
+ highhalf++;
+ }
+ }
if (!highhalf)
goto plain_copy;
lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf);
PL_parser->bufend += len+highhalf;
for (p = pv; p != e; p++) {
U8 c = (U8)*p;
- if (c & 0x80) {
- *bufptr++ = (char)(0xc0 | (c >> 6));
- *bufptr++ = (char)(0x80 | (c & 0x3f));
+ if (! UTF8_IS_INVARIANT(c)) {
+ *bufptr++ = UTF8_TWO_BYTE_HI(c);
+ *bufptr++ = UTF8_TWO_BYTE_LO(c);
} else {
*bufptr++ = (char)c;
}
const char *p, *e = pv+len;
for (p = pv; p != e; p++) {
U8 c = (U8)*p;
- if (c >= 0xc4) {
+ if (UTF8_IS_ABOVE_LATIN1(c)) {
Perl_croak(aTHX_ "Lexing code attempted to stuff "
"non-Latin-1 character into Latin-1 input");
- } else if (c >= 0xc2 && p+1 != e &&
- (((U8)p[1]) & 0xc0) == 0x80) {
+ } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) {
p++;
highhalf++;
- } else if (c >= 0x80) {
+ } else if (! UTF8_IS_INVARIANT(c)) {
/* malformed UTF-8 */
ENTER;
SAVESPTR(PL_warnhook);
SvCUR_set(PL_parser->linestr,
SvCUR(PL_parser->linestr) + len-highhalf);
PL_parser->bufend += len-highhalf;
- for (p = pv; p != e; p++) {
- U8 c = (U8)*p;
- if (c & 0x80) {
- *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f));
- p++;
- } else {
- *bufptr++ = (char)c;
+ p = pv;
+ while (p < e) {
+ if (UTF8_IS_INVARIANT(*p)) {
+ *bufptr++ = *p;
+ p++;
}
+ else {
+ assert(p < e -1 );
+ *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1));
+ p += 2;
+ }
}
} else {
- plain_copy:
+ plain_copy:
lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len);
bufptr = PL_parser->bufptr;
Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char);
bufend = PL_parser->bufend;
}
head = (U8)*s;
- if (!(head & 0x80))
+ if (UTF8_IS_INVARIANT(head))
return head;
- if (head & 0x40) {
- len = PL_utf8skip[head];
+ if (UTF8_IS_START(head)) {
+ len = UTF8SKIP(&head);
while ((STRLEN)(bufend-s) < len) {
if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS))
break;
}
}
- /* A custom translator can leave res not in UTF-8, so make sure. XXX This
- * can be revisited to not use utf8 for characters that don't need it when
- * regexes don't have to be in utf8 for Unicode semantics. If doing so,
- * remember EBCDIC */
- if (! SvUTF8(res)) {
- sv_utf8_upgrade(res);
- }
- else { /* Don't accept malformed input */
+ if (SvUTF8(res)) { /* Don't accept malformed input */
const U8* first_bad_char_loc;
STRLEN len;
const char* const str = SvPV_const(res, len);
#ifdef EBCDIC
&& !native_range
#endif
- ) {
+ ) {
char * const c = (char*)utf8_hop((U8*)d, -1);
char *e = d++;
while (e-- > c)
/* Here it looks like a named character */
- if (PL_lex_inpat) {
-
- /* XXX This block is temporary code. \N{} implies that the
- * pattern is to have Unicode semantics, and therefore
- * currently has to be encoded in utf8. By putting it in
- * utf8 now, we save a whole pass in the regular expression
- * compiler. Once that code is changed so Unicode
- * semantics doesn't necessarily have to be in utf8, this
- * block should be removed. However, the code that parses
- * the output of this would have to be changed to not
- * necessarily expect utf8 */
- if (!has_utf8) {
- SvCUR_set(sv, d - SvPVX_const(sv));
- SvPOK_on(sv);
- *d = '\0';
- /* See Note on sizing above. */
- sv_utf8_upgrade_flags_grow(sv,
- SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- /* 5 = '\N{' + cur char + NUL */
- (STRLEN)(send - s) + 5);
- d = SvPVX(sv) + SvCUR(sv);
- has_utf8 = TRUE;
- }
- }
-
if (*s == 'U' && s[1] == '+') { /* \N{U+...} */
I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX;
* returned by charnames */
const char *str_end = str + len;
- STRLEN char_length; /* cur char's byte length */
- STRLEN output_length; /* and the number of bytes
- after this is translated
- into hex digits */
const STRLEN off = d - SvPVX_const(sv);
- /* 2 hex per byte; 2 chars for '\N'; 2 chars for
- * max('U+', '.'); and 1 for NUL */
- char hex_string[2 * UTF8_MAXBYTES + 5];
-
- /* Get the first character of the result. */
- U32 uv = utf8n_to_uvuni((U8 *) str,
- len,
- &char_length,
- UTF8_ALLOW_ANYUV);
-
- /* The call to is_utf8_string() above hopefully
- * guarantees that there won't be an error. But
- * it's easy here to make sure. The function just
- * above warns and returns 0 if invalid utf8, but
- * it can also return 0 if the input is validly a
- * NUL. Disambiguate */
- if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
- uv = UNICODE_REPLACEMENT;
- }
-
- /* Convert first code point to hex, including the
- * boiler plate before it. For all these, we
- * convert to native format so that downstream code
- * can continue to assume the input is native */
- output_length =
- my_snprintf(hex_string, sizeof(hex_string),
- "\\N{U+%X",
- (unsigned int) UNI_TO_NATIVE(uv));
-
- /* Make sure there is enough space to hold it */
- d = off + SvGROW(sv, off
- + output_length
- + (STRLEN)(send - e)
- + 2); /* '}' + NUL */
- /* And output it */
- Copy(hex_string, d, output_length, char);
- d += output_length;
-
- /* For each subsequent character, append dot and
- * its ordinal in hex */
- while ((str += char_length) < str_end) {
- const STRLEN off = d - SvPVX_const(sv);
- U32 uv = utf8n_to_uvuni((U8 *) str,
- str_end - str,
- &char_length,
- UTF8_ALLOW_ANYUV);
- if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') {
- uv = UNICODE_REPLACEMENT;
- }
-
- output_length =
- my_snprintf(hex_string, sizeof(hex_string),
- ".%X",
- (unsigned int) UNI_TO_NATIVE(uv));
-
- d = off + SvGROW(sv, off
- + output_length
- + (STRLEN)(send - e)
- + 2); /* '}' + NUL */
- Copy(hex_string, d, output_length, char);
- d += output_length;
+ if (! SvUTF8(res)) {
+ /* For the non-UTF-8 case, we can determine the
+ * exact length needed without having to parse
+ * through the string. Each character takes up
+ * 2 hex digits plus either a trailing dot or
+ * the "}" */
+ d = off + SvGROW(sv, off
+ + 3 * len
+ + 6 /* For the "\N{U+", and
+ trailing NUL */
+ + (STRLEN)(send - e));
+ Copy("\\N{U+", d, 5, char);
+ d += 5;
+ while (str < str_end) {
+ char hex_string[4];
+ my_snprintf(hex_string, sizeof(hex_string),
+ "%02X.", (U8) *str);
+ Copy(hex_string, d, 3, char);
+ d += 3;
+ str++;
+ }
+ d--; /* We will overwrite below the final
+ dot with a right brace */
+ }
+ else {
+ STRLEN char_length; /* cur char's byte length */
+
+ /* and the number of bytes after this is
+ * translated into hex digits */
+ STRLEN output_length;
+
+ /* 2 hex per byte; 2 chars for '\N'; 2 chars
+ * for max('U+', '.'); and 1 for NUL */
+ char hex_string[2 * UTF8_MAXBYTES + 5];
+
+ /* Get the first character of the result. */
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ len,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+ /* Convert first code point to hex, including
+ * the boiler plate before it. For all these,
+ * we convert to native format so that
+ * downstream code can continue to assume the
+ * input is native */
+ output_length =
+ my_snprintf(hex_string, sizeof(hex_string),
+ "\\N{U+%X",
+ (unsigned int) UNI_TO_NATIVE(uv));
+
+ /* Make sure there is enough space to hold it */
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ /* And output it */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+
+ /* For each subsequent character, append dot and
+ * its ordinal in hex */
+ while ((str += char_length) < str_end) {
+ const STRLEN off = d - SvPVX_const(sv);
+ U32 uv = utf8n_to_uvuni((U8 *) str,
+ str_end - str,
+ &char_length,
+ UTF8_ALLOW_ANYUV);
+ output_length =
+ my_snprintf(hex_string,
+ sizeof(hex_string),
+ ".%X",
+ (unsigned int) UNI_TO_NATIVE(uv));
+
+ d = off + SvGROW(sv, off
+ + output_length
+ + (STRLEN)(send - e)
+ + 2); /* '}' + NUL */
+ Copy(hex_string, d, output_length, char);
+ d += output_length;
+ }
}
*d++ = '}'; /* Done. Add the trailing brace */