From dd9bc2b0af8e838ed989897601a0ee36eeed092f Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Tue, 6 May 2014 13:18:28 -0600 Subject: [PATCH 1/1] Add some (UN)?LIKELY() to UTF8 handling It's very rare actually for code to be presented with malformed UTF-8, so give the compiler a hint about the likely branches. --- regcharclass.h | 16 ++++++++-------- regen/regcharclass.pl | 7 ++++--- utf8.h | 6 +++--- 3 files changed, 15 insertions(+), 14 deletions(-) diff --git a/regcharclass.h b/regcharclass.h index ebda2f7..7de537b 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -708,11 +708,11 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\ : (! is_utf8 ) ? \ ( 0x85 == ((U8*)s)[0] ) \ - : (((e) - (s)) >= UTF8SKIP(s)) ? \ + : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ? \ ( ( 0xC2 == ((U8*)s)[0] ) ? \ ( ( 0x85 == ((U8*)s)[1] ) ? 2 : 0 ) \ : ( ( ( 0xE2 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( ( ((U8*)s)[2] & 0xFE ) == 0x8E || ( ((U8*)s)[2] & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\ @@ -1427,9 +1427,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) @@ -2152,9 +2152,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? 
\ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) @@ -2877,9 +2877,9 @@ */ /*** GENERATED CODE ***/ #define is_PATWS_safe(s,e,is_utf8) \ -( ((e) > (s)) ? \ +( ( LIKELY((e) > (s)) ) ? \ ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\ - : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ + : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\ ( ( ( ((U8*)s)[2] & 0xFE ) == 0x56 ) ? 3 : 0 ) \ : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x51 == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\ : 0 ) diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl index 1f453e8..7919041 100755 --- a/regen/regcharclass.pl +++ b/regen/regcharclass.pl @@ -671,8 +671,9 @@ sub length_optree { # have only a few things that can match past this, so I (khw) # don't think it is worth it. (Even better would be to use # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it - # if it saves a bunch. - my $cond = "(((e) - (s)) >= UTF8SKIP(s))"; + # if it saves a bunch. We assume that input text is likely to be + # well-formed. 
+ my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))"; $else = __cond_join($cond, $utf8, $else); # For 'generic', we also will want the latin1 UTF-8 variants for @@ -715,7 +716,7 @@ sub length_optree { } # We need at least one byte available to start off the tests - $else = __cond_join("((e) > (s))", $else, 0); + $else = __cond_join("LIKELY((e) > (s))", $else, 0); } else { # Here, we don't want or there aren't any variants. A single # byte available is enough. diff --git a/utf8.h b/utf8.h index 74e7d48..52671eb 100644 --- a/utf8.h +++ b/utf8.h @@ -639,13 +639,13 @@ machines) is a valid UTF-8 character. =cut */ -#define isUTF8_CHAR(s, e) (((e) <= (s)) \ +#define isUTF8_CHAR(s, e) (UNLIKELY((e) <= (s)) \ ? 0 \ : (UTF8_IS_INVARIANT(*s)) \ ? 1 \ - : (((e) - (s)) < UTF8SKIP(s)) \ + : UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \ ? 0 \ - : (IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ + : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \ ? is_UTF8_CHAR_utf8_no_length_checks(s) \ : _is_utf8_char_slow(s, e)) -- 1.8.3.1