From 6c12993c67cedb7be68d2e7c7f52fbd9a39b92c8 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sun, 1 Jul 2018 19:23:35 -0600 Subject: [PATCH] Use strict dfa to translate from UTF-8 to code point With this commit, if a sequence passes the dfa, the result can be returned immediately. Previously, some rare, potentially problematic sequences could pass the dfa and then needed further checking, which meant that the further checking always had to be done. So this speeds up the general case. --- regcharclass.h | 11 ++++++++++- regen/regcharclass.pl | 4 ++++ utf8.c | 51 +++++++++++++++++++++++++++++++-------------------- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index 7727bcc..bded7d5 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -615,6 +615,15 @@ ( 0x200F == cp || ( 0x200F < cp && \ ( 0x2028 == cp || 0x2029 == cp ) ) ) ) ) ) ) ) ) ) ) +/* + HANGUL_ED: Hangul syllables whose first character is \xED + + 0xD000 - 0xD7FF +*/ +/*** GENERATED CODE ***/ +#define is_HANGUL_ED_utf8_safe(s,e) \ +( ( ( ( ( ((e) - (s)) >= 3 ) && ( 0xED == ((const U8*)s)[0] ) ) && ( ( ((const U8*)s)[1] & 0xE0 ) == 0x80 ) ) && ( ( ((const U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 
3 : 0 ) + #endif /* ASCII/Latin1 */ #if 'A' == 193 /* EBCDIC 1047 */ \ @@ -1901,6 +1910,6 @@ * 6aaacc29ce24746bcb2bf82a920fcf90e07cf92d75325199c50f40754d39bb72 lib/unicore/mktables * 21653d2744fdd071f9ef138c805393901bb9547cf3e777ebf50215a191f986ea lib/unicore/version * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl - * 069232ed937edb5a8f6a2e0e6e2d56e76ecc8d4580804f4f1ee98c828905434c regen/regcharclass.pl + * 0a1a1fad4b43cd9338269aa8cd46d246a33546c5409aa7e75a147e5350cd39ee regen/regcharclass.pl * 393f8d882713a3ba227351ad0f00ea4839fda74fcf77dcd1cdf31519925adba5 regen/regcharclass_multi_char_folds.pl * ex: set ro: */ diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index b837152..4884d1a 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -1736,3 +1736,7 @@ PROBLEMATIC_LOCALE_FOLDEDS_START : The first folded character of folds which are PATWS: pattern white space => generic cp : safe \p{_Perl_PatWS} + +HANGUL_ED: Hangul syllables whose first character is \xED +=> UTF8 :only_ascii_platform safe +0xD000 - 0xD7FF diff --git a/utf8.c b/utf8.c index 58745b1..f04f17a 100644 --- a/utf8.c +++ b/utf8.c @@ -1561,38 +1561,51 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, #define PERL_UTF8_DECODE_REJECT 1 while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) { - UV type = perl_extended_utf8_dfa_tab[*s]; + UV type = strict_utf8_dfa_tab[*s]; uv = (state == 0) ? 
((0xff >> type) & NATIVE_UTF8_TO_I8(*s)) : UTF8_ACCUMULATE(uv, *s); - state = perl_extended_utf8_dfa_tab[256 + state + type]; + state = strict_utf8_dfa_tab[256 + state + type]; if (state == 0) { - - /* If this could be a code point that the flags don't allow (the first - * surrogate is the first such possible one), delve further, but we already - * have calculated 'uv' */ - if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE - |UTF8_DISALLOW_PERL_EXTENDED - |UTF8_WARN_ILLEGAL_INTERCHANGE - |UTF8_WARN_PERL_EXTENDED)) - && uv >= UNICODE_SURROGATE_FIRST) - { - curlen = s + 1 - s0; - goto got_uv; - } - - return UNI_TO_NATIVE(uv); + return uv; } s++; } - /* Here, is some sort of failure. Use the full mechanism */ + /* Here, the input is one of: a) malformed; b) a problematic code point (surrogate, + * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul + * syllables that the dfa doesn't properly handle. Quickly dispose of the + * final case. + * + * Each of the affected Hanguls starts with \xED */ + +#ifndef EBCDIC + if (is_HANGUL_ED_utf8_safe(s0, send)) { + return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT)) + | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT) + | (s0[2] & UTF_CONTINUATION_MASK); + } + +#endif + + /* Here, the input is potentially problematic. Use the full mechanism */ uv = *s0; + /* In conjunction with the exhaustive tests that can be enabled in + * APItest/t/utf8_warn_base.pl, the assert below (commented out here) can + * make sure the dfa does precisely what it is intended to do, and that no + * flaws in it are masked by dropping down and executing the code below. + + assert(! 
isUTF8_CHAR(s0, send) + || UTF8_IS_SURROGATE(s0, send) + || UTF8_IS_SUPER(s0, send) + || UTF8_IS_NONCHAR(s0,send)); + */ + /* A continuation character can't start a valid sequence */ if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) { possible_problems |= UTF8_GOT_CONTINUATION; @@ -1712,8 +1725,6 @@ Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, } } - got_uv: - /* Here, we have found all the possible problems, except for when the input * is for a problematic code point not allowed by the input parameters. */ -- 1.8.3.1