X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/bde9e88d4f9dd5fc838744a75cfc6b9c1a2cfd82..4915c7eeb41998a4845c8cc396a0b95fcd49dbbd:/toke.c diff --git a/toke.c b/toke.c index ea6318e..0389417 100644 --- a/toke.c +++ b/toke.c @@ -994,10 +994,13 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) if (flags & LEX_STUFF_UTF8) { goto plain_copy; } else { - STRLEN highhalf = 0; + STRLEN highhalf = 0; /* Count of variants */ const char *p, *e = pv+len; - for (p = pv; p != e; p++) - highhalf += !!(((U8)*p) & 0x80); + for (p = pv; p != e; p++) { + if (! UTF8_IS_INVARIANT(*p)) { + highhalf++; + } + } if (!highhalf) goto plain_copy; lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len+highhalf); @@ -1008,9 +1011,9 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) PL_parser->bufend += len+highhalf; for (p = pv; p != e; p++) { U8 c = (U8)*p; - if (c & 0x80) { - *bufptr++ = (char)(0xc0 | (c >> 6)); - *bufptr++ = (char)(0x80 | (c & 0x3f)); + if (! UTF8_IS_INVARIANT(c)) { + *bufptr++ = UTF8_TWO_BYTE_HI(c); + *bufptr++ = UTF8_TWO_BYTE_LO(c); } else { *bufptr++ = (char)c; } @@ -1022,14 +1025,13 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) const char *p, *e = pv+len; for (p = pv; p != e; p++) { U8 c = (U8)*p; - if (c >= 0xc4) { + if (UTF8_IS_ABOVE_LATIN1(c)) { Perl_croak(aTHX_ "Lexing code attempted to stuff " "non-Latin-1 character into Latin-1 input"); - } else if (c >= 0xc2 && p+1 != e && - (((U8)p[1]) & 0xc0) == 0x80) { + } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e)) { p++; highhalf++; - } else if (c >= 0x80) { + } else if (! UTF8_IS_INVARIANT(c)) { /* malformed UTF-8 */ ENTER; SAVESPTR(PL_warnhook); @@ -1046,17 +1048,20 @@ Perl_lex_stuff_pvn(pTHX_ const char *pv, STRLEN len, U32 flags) SvCUR_set(PL_parser->linestr, SvCUR(PL_parser->linestr) + len-highhalf); PL_parser->bufend += len-highhalf; - for (p = pv; p != e; p++) { - U8 c = (U8)*p; - if (c & 0x80) { - *bufptr++ = (char)(((c & 0x3) << 6) | (p[1] & 0x3f)); - p++; - } else { - *bufptr++ = (char)c; + p = pv; + while (p < e) { + if (UTF8_IS_INVARIANT(*p)) { + *bufptr++ = *p; + p++; } + else { + assert(p < e -1 ); + *bufptr++ = TWO_BYTE_UTF8_TO_UNI(*p, *(p+1)); + p += 2; + } } } else { - plain_copy: + plain_copy: lex_grow_linestr(SvCUR(PL_parser->linestr)+1+len); bufptr = PL_parser->bufptr; Move(bufptr, bufptr+len, PL_parser->bufend+1-bufptr, char); @@ -1404,10 +1409,10 @@ Perl_lex_peek_unichar(pTHX_ U32 flags) bufend = PL_parser->bufend; } head = (U8)*s; - if (!(head & 0x80)) + if (UTF8_IS_INVARIANT(head)) return head; - if (head & 0x40) { - len = PL_utf8skip[head]; + if (UTF8_IS_START(head)) { + len = UTF8SKIP(&head); while ((STRLEN)(bufend-s) < len) { if (!lex_next_chunk(flags | LEX_KEEP_PREVIOUS)) break; @@ -2772,14 +2777,7 @@ S_get_and_check_backslash_N_name(pTHX_ const char* s, const char* const e) } } - /* A custom translator can leave res not in UTF-8, so make sure. XXX This - * can be revisited to not use utf8 for characters that don't need it when - * regexes don't have to be in utf8 for Unicode semantics. If doing so, - * remember EBCDIC */ - if (! SvUTF8(res)) { - sv_utf8_upgrade(res); - } - else { /* Don't accept malformed input */ + if (SvUTF8(res)) { /* Don't accept malformed input */ const U8* first_bad_char_loc; STRLEN len; const char* const str = SvPV_const(res, len); @@ -2977,7 +2975,7 @@ S_scan_const(pTHX_ char *start) #ifdef EBCDIC && !native_range #endif - ) { + ) { char * const c = (char*)utf8_hop((U8*)d, -1); char *e = d++; while (e-- > c) @@ -3393,31 +3391,6 @@ S_scan_const(pTHX_ char *start) /* Here it looks like a named character */ - if (PL_lex_inpat) { - - /* XXX This block is temporary code. \N{} implies that the - * pattern is to have Unicode semantics, and therefore - * currently has to be encoded in utf8. By putting it in - * utf8 now, we save a whole pass in the regular expression - * compiler. Once that code is changed so Unicode - * semantics doesn't necessarily have to be in utf8, this - * block should be removed. However, the code that parses - * the output of this would have to be changed to not - * necessarily expect utf8 */ - if (!has_utf8) { - SvCUR_set(sv, d - SvPVX_const(sv)); - SvPOK_on(sv); - *d = '\0'; - /* See Note on sizing above. */ - sv_utf8_upgrade_flags_grow(sv, - SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, - /* 5 = '\N{' + cur char + NUL */ - (STRLEN)(send - s) + 5); - d = SvPVX(sv) + SvCUR(sv); - has_utf8 = TRUE; - } - } - if (*s == 'U' && s[1] == '+') { /* \N{U+...} */ I32 flags = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX; @@ -3498,73 +3471,88 @@ S_scan_const(pTHX_ char *start) * returned by charnames */ const char *str_end = str + len; - STRLEN char_length; /* cur char's byte length */ - STRLEN output_length; /* and the number of bytes - after this is translated - into hex digits */ const STRLEN off = d - SvPVX_const(sv); - /* 2 hex per byte; 2 chars for '\N'; 2 chars for - * max('U+', '.'); and 1 for NUL */ - char hex_string[2 * UTF8_MAXBYTES + 5]; - - /* Get the first character of the result. */ - U32 uv = utf8n_to_uvuni((U8 *) str, - len, - &char_length, - UTF8_ALLOW_ANYUV); - - /* The call to is_utf8_string() above hopefully - * guarantees that there won't be an error. But - * it's easy here to make sure. The function just - * above warns and returns 0 if invalid utf8, but - * it can also return 0 if the input is validly a - * NUL. Disambiguate */ - if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { - uv = UNICODE_REPLACEMENT; - } - - /* Convert first code point to hex, including the - * boiler plate before it. For all these, we - * convert to native format so that downstream code - * can continue to assume the input is native */ - output_length = - my_snprintf(hex_string, sizeof(hex_string), - "\\N{U+%X", - (unsigned int) UNI_TO_NATIVE(uv)); - - /* Make sure there is enough space to hold it */ - d = off + SvGROW(sv, off - + output_length - + (STRLEN)(send - e) - + 2); /* '}' + NUL */ - /* And output it */ - Copy(hex_string, d, output_length, char); - d += output_length; - - /* For each subsequent character, append dot and - * its ordinal in hex */ - while ((str += char_length) < str_end) { - const STRLEN off = d - SvPVX_const(sv); - U32 uv = utf8n_to_uvuni((U8 *) str, - str_end - str, - &char_length, - UTF8_ALLOW_ANYUV); - if (uv == 0 && NATIVE_TO_ASCII(*str) != '\0') { - uv = UNICODE_REPLACEMENT; - } - - output_length = - my_snprintf(hex_string, sizeof(hex_string), - ".%X", - (unsigned int) UNI_TO_NATIVE(uv)); - - d = off + SvGROW(sv, off - + output_length - + (STRLEN)(send - e) - + 2); /* '}' + NUL */ - Copy(hex_string, d, output_length, char); - d += output_length; + if (! SvUTF8(res)) { + /* For the non-UTF-8 case, we can determine the + * exact length needed without having to parse + * through the string. Each character takes up + * 2 hex digits plus either a trailing dot or + * the "}" */ + d = off + SvGROW(sv, off + + 3 * len + + 6 /* For the "\N{U+", and + trailing NUL */ + + (STRLEN)(send - e)); + Copy("\\N{U+", d, 5, char); + d += 5; + while (str < str_end) { + char hex_string[4]; + my_snprintf(hex_string, sizeof(hex_string), + "%02X.", (U8) *str); + Copy(hex_string, d, 3, char); + d += 3; + str++; + } + d--; /* We will overwrite below the final + dot with a right brace */ + } + else { + STRLEN char_length; /* cur char's byte length */ + + /* and the number of bytes after this is + * translated into hex digits */ + STRLEN output_length; + + /* 2 hex per byte; 2 chars for '\N'; 2 chars + * for max('U+', '.'); and 1 for NUL */ + char hex_string[2 * UTF8_MAXBYTES + 5]; + + /* Get the first character of the result. */ + U32 uv = utf8n_to_uvuni((U8 *) str, + len, + &char_length, + UTF8_ALLOW_ANYUV); + /* Convert first code point to hex, including + * the boiler plate before it. For all these, + * we convert to native format so that + * downstream code can continue to assume the + * input is native */ + output_length = + my_snprintf(hex_string, sizeof(hex_string), + "\\N{U+%X", + (unsigned int) UNI_TO_NATIVE(uv)); + + /* Make sure there is enough space to hold it */ + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + /* And output it */ + Copy(hex_string, d, output_length, char); + d += output_length; + + /* For each subsequent character, append dot and + * its ordinal in hex */ + while ((str += char_length) < str_end) { + const STRLEN off = d - SvPVX_const(sv); + U32 uv = utf8n_to_uvuni((U8 *) str, + str_end - str, + &char_length, + UTF8_ALLOW_ANYUV); + output_length = + my_snprintf(hex_string, + sizeof(hex_string), + ".%X", + (unsigned int) UNI_TO_NATIVE(uv)); + + d = off + SvGROW(sv, off + + output_length + + (STRLEN)(send - e) + + 2); /* '}' + NUL */ + Copy(hex_string, d, output_length, char); + d += output_length; + } } *d++ = '}'; /* Done. Add the trailing brace */