Add some (UN)?LIKELY() to UTF8 handling

author Karl Williamson <khw@cpan.org>

Tue, 6 May 2014 19:18:28 +0000 (13:18 -0600)

committer Karl Williamson <khw@cpan.org>

Sat, 31 May 2014 17:56:56 +0000 (11:56 -0600)
author Karl Williamson <khw@cpan.org>
Tue, 6 May 2014 19:18:28 +0000 (13:18 -0600)
committer Karl Williamson <khw@cpan.org>
Sat, 31 May 2014 17:56:56 +0000 (11:56 -0600)
diff --git a/regcharclass.h b/regcharclass.h

index ebda2f7..7de537b 100644 (file)
--- a/regcharclass.h
+++ b/regcharclass.h
@@ -708,11 +708,11 @@
  */
  /*** GENERATED CODE ***/
  #define is_PATWS_safe(s,e,is_utf8)                                          \
-( ((e) > (s)) ?                                                             \
+( ( LIKELY((e) > (s)) ) ?                                                   \
      ( ( ( 0x09 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0x0D ) || 0x20 == ((U8*)s)[0] ) ? 1\
      : (! is_utf8 ) ?                                                        \
             ( 0x85 == ((U8*)s)[0] )                                         \
-       : (((e) - (s)) >= UTF8SKIP(s)) ?                                    \
+       : ( LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) ?                          \
             ( ( 0xC2 == ((U8*)s)[0] ) ?                                     \
                 ( ( 0x85 == ((U8*)s)[1] ) ? 2 : 0 )                         \
             : ( ( ( 0xE2 == ((U8*)s)[0] ) && ( 0x80 == ((U8*)s)[1] ) ) && ( ( ((U8*)s)[2] & 0xFE ) == 0x8E || ( ((U8*)s)[2] & 0xFE ) == 0xA8 ) ) ? 3 : 0 )\
@@ -1427,9 +1427,9 @@
  */
  /*** GENERATED CODE ***/
  #define is_PATWS_safe(s,e,is_utf8)                                          \
-( ((e) > (s)) ?                                                             \
+( ( LIKELY((e) > (s)) ) ?                                                   \
      ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\
-    : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
+    : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
                     ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\
                 : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\
  : 0 )
@@ -2152,9 +2152,9 @@
  */
  /*** GENERATED CODE ***/
  #define is_PATWS_safe(s,e,is_utf8)                                          \
-( ((e) > (s)) ?                                                             \
+( ( LIKELY((e) > (s)) ) ?                                                   \
      ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\
-    : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
+    : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
                     ( ( 0x55 == ((U8*)s)[2] || 0x56 == ((U8*)s)[2] ) ? 3 : 0 )\
                 : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x4A == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\
  : 0 )
@@ -2877,9 +2877,9 @@
  */
  /*** GENERATED CODE ***/
  #define is_PATWS_safe(s,e,is_utf8)                                          \
-( ((e) > (s)) ?                                                             \
+( ( LIKELY((e) > (s)) ) ?                                                   \
      ( ( ( ( ((U8*)s)[0] & 0xEF ) == 0x05 ) || ((U8*)s)[0] == 0x0B || ( ( ((U8*)s)[0] & 0xFE ) == 0x0C ) || ((U8*)s)[0] == 0x25 || ((U8*)s)[0] == 0x40 ) ? 1\
-    : ( ( is_utf8 && (((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
+    : ( ( is_utf8 && LIKELY(((e) - (s)) >= UTF8SKIP(s)) ) && ( 0xCA == ((U8*)s)[0] ) ) ? ( ( 0x41 == ((U8*)s)[1] ) ?\
                     ( ( ( ((U8*)s)[2] & 0xFE ) == 0x56 ) ? 3 : 0 )          \
                 : ( ( 0x42 == ((U8*)s)[1] ) && ( 0x49 == ((U8*)s)[2] || 0x51 == ((U8*)s)[2] ) ) ? 3 : 0 ) : 0 )\
  : 0 )
diff --git a/regen/regcharclass.pl b/regen/regcharclass.pl

index 1f453e8..7919041 100755 (executable)
--- a/regen/regcharclass.pl
+++ b/regen/regcharclass.pl
@@ -671,8 +671,9 @@ sub length_optree {
              # have only a few things that can match past this, so I (khw)
              # don't think it is worth it.  (Even better would be to use
              # calculate_mask(keys %$utf8) instead of UTF8_IS_START, and use it
-            # if it saves a bunch.
-            my $cond = "(((e) - (s)) >= UTF8SKIP(s))";
+            # if it saves a bunch.)  We assume that input text is likely to be
+            # well-formed.
+            my $cond = "LIKELY(((e) - (s)) >= UTF8SKIP(s))";
              $else = __cond_join($cond, $utf8, $else);
  
              # For 'generic', we also will want the latin1 UTF-8 variants for
@@ -715,7 +716,7 @@ sub length_optree {
              }
  
              # We need at least one byte available to start off the tests
-            $else = __cond_join("((e) > (s))", $else, 0);
+            $else = __cond_join("LIKELY((e) > (s))", $else, 0);
          }
          else {  # Here, we don't want or there aren't any variants.  A single
                  # byte available is enough.
diff --git a/utf8.h b/utf8.h

index 74e7d48..52671eb 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -639,13 +639,13 @@ machines) is a valid UTF-8 character.
  =cut
  */
  
-#define isUTF8_CHAR(s, e)   (((e) <= (s))                                   \
+#define isUTF8_CHAR(s, e)   (UNLIKELY((e) <= (s))                           \
                               ? 0                                            \
                               : (UTF8_IS_INVARIANT(*s))                      \
                                 ? 1                                          \
-                               : (((e) - (s)) < UTF8SKIP(s))                \
+                               : UNLIKELY(((e) - (s)) < UTF8SKIP(s))        \
                                   ? 0                                        \
-                                 : (IS_UTF8_CHAR_FAST(UTF8SKIP(s)))         \
+                                 : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s)))   \
                                     ? is_UTF8_CHAR_utf8_no_length_checks(s)  \
                                     : _is_utf8_char_slow(s, e))
author	Karl Williamson <khw@cpan.org>
	Tue, 6 May 2014 19:18:28 +0000 (13:18 -0600)
committer	Karl Williamson <khw@cpan.org>
	Sat, 31 May 2014 17:56:56 +0000 (11:56 -0600)
regcharclass.h		patch \| blob \| blame \| history
regen/regcharclass.pl		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history