This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.h: Remove some EBCDIC dependencies
authorKarl Williamson <public@khwilliamson.com>
Thu, 6 Sep 2012 02:32:29 +0000 (20:32 -0600)
committerKarl Williamson <public@khwilliamson.com>
Fri, 14 Sep 2012 03:14:04 +0000 (21:14 -0600)
regen/regcharclass.pl has been enhanced in previous commits so that it
generates as good code as these hand-defined macro definitions for
various UTF-8 constructs.  And, it should be able to generate EBCDIC
ones as well.  By using its definitions, we can remove the EBCDIC
dependencies for them.  It is quite possible that the EBCDIC versions
were wrong, since they have never been tested.  Even if
regcharclass.pl has bugs under EBCDIC, it is easier to find and fix
those in one place, than all the sundry definitions.

regcharclass.h
regen/regcharclass.pl
utf8.h

index 840caf1..f9c828d 100644 (file)
 0x2029 == cp ) ) ) ) ) )
 
 /*
+       REPLACEMENT: Unicode REPLACEMENT CHARACTER
+
+       0xFFFD
+*/
+/*** GENERATED CODE ***/
+#define is_REPLACEMENT_utf8_safe(s,e)                                       \
+( ( ( ( ((e)-(s) > 2) && ( 0xEF == ((U8*)s)[0] ) ) && ( 0xBF == ((U8*)s)[1] ) ) && ( 0xBD == ((U8*)s)[2] ) ) ? 3 : 0 )
+
+/*
+       NONCHAR: Non character code points
+
+       \p{Nchar}
+*/
+/*** GENERATED CODE ***/
+#define is_NONCHAR_utf8(s)                                                  \
+( ( 0xEF == ((U8*)s)[0] ) ?                                                 \
+    ( ( 0xB7 == ((U8*)s)[1] ) ?                                             \
+       ( ( 0x90 <= ((U8*)s)[2] && ((U8*)s)[2] <= 0xAF ) ? 3 : 0 )          \
+    : ( 0xBF == ((U8*)s)[1] ) ?                                             \
+       ( ( ((U8*)s)[2] >= 0xBE ) ? 3 : 0 )                                 \
+    : 0 )                                                                   \
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( 0x9F == ((U8*)s)[1] || 0xAF == ((U8*)s)[1] || 0xBF == ((U8*)s)[1] ) && ( 0xBF == ((U8*)s)[2] ) ) && ( ((U8*)s)[3] >= 0xBE ) ) ? 4 : 0 )\
+: ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF3 ) ?                          \
+    ( ( ( ( ( ((U8*)s)[1] & 0xCF ) == 0x8F ) && ( 0xBF == ((U8*)s)[2] ) ) && ( ((U8*)s)[3] >= 0xBE ) ) ? 4 : 0 )\
+: ( 0xF4 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( 0x8F == ((U8*)s)[1] ) && ( 0xBF == ((U8*)s)[2] ) ) && ( ((U8*)s)[3] >= 0xBE ) ) ? 4 : 0 )\
+: 0 )
+
+/*
+       SURROGATE: Surrogate characters
+
+       \p{Gc=Cs}
+*/
+/*** GENERATED CODE ***/
+#define is_SURROGATE_utf8(s)                                                \
+( ( ( 0xED == ((U8*)s)[0] ) && ( ((U8*)s)[1] >= 0xA0 ) ) ? 3 : 0 )
+
+/*
        GCB_L: Grapheme_Cluster_Break=L
 
        \p{_X_GCB_L}
index 70f46b0..81ac13c 100755 (executable)
@@ -1112,6 +1112,18 @@ VERTWS: Vertical Whitespace: \v \V
 => generic UTF8 LATIN1 cp :fast safe
 \p{VertSpace}
 
+REPLACEMENT: Unicode REPLACEMENT CHARACTER
+=> UTF8 :safe
+0xFFFD
+
+NONCHAR: Non character code points
+=> UTF8 :fast
+\p{Nchar}
+
+SURROGATE: Surrogate characters
+=> UTF8 :fast
+\p{Gc=Cs}
+
 GCB_L: Grapheme_Cluster_Break=L
 => UTF8 :fast
 \p{_X_GCB_L}
diff --git a/utf8.h b/utf8.h
index e312d87..a6af557 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -93,6 +93,7 @@ EXTCONST unsigned char PL_utf8skip[];
 
 END_EXTERN_C
 
+#include "regcharclass.h"
 #include "unicode_constants.h"
 
 /* Native character to iso-8859-1 */
@@ -346,35 +347,11 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * problematic in some contexts.  This allows code that needs to check for
  * those to to quickly exclude the vast majority of code points it will
  * encounter */
-#ifdef EBCDIC
-#   define UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE UTF_TO_NATIVE(0xF1)
-#else
-#   define UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE 0xED
-#endif
+#define UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE \
+                                    FIRST_SURROGATE_UTF8_FIRST_BYTE
 
-/*             ASCII              EBCDIC I8
- * U+D7FF:   \xED\x9F\xBF      \xF1\xB5\xBF\xBF    last before surrogates
- * U+D800:   \xED\xA0\x80      \xF1\xB6\xA0\xA0    1st surrogate
- * U+DFFF:   \xED\xBF\xBF      \xF1\xB7\xBF\xBF    final surrogate
- * U+E000:   \xEE\x80\x80      \xF1\xB8\xA0\xA0    next after surrogates
- */
-#ifdef EBCDIC /* Both versions assume well-formed UTF8 */
-#   define UTF8_IS_SURROGATE(s)  (*(s) == UTF_TO_NATIVE(0xF1)                 \
-                                 && ((*((s) +1) == UTF_TO_NATIVE(0xB6))       \
-                                    || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
-    /* <send> points to one beyond the end of the string that starts at <s> */
-#   define UTF8_IS_REPLACEMENT(s, send) (*(s) == UTF_TO_NATIVE(0xEF)          \
-                                        && (send - s) >= 4                   \
-                                        && *((s) + 1) == UTF_TO_NATIVE(0xBF) \
-                                        && *((s) + 2) == UTF_TO_NATIVE(0xBF) \
-                                        && *((s) + 3) == UTF_TO_NATIVE(0xBD)
-#else
-#   define UTF8_IS_SURROGATE(s) (*(s) == 0xED && *((s) + 1) >= 0xA0)
-#   define UTF8_IS_REPLACEMENT(s, send) (*(s) == 0xEF          \
-                                         && (send - s) >= 3    \
-                                        && *((s) + 1) == 0xBF \
-                                        && *((s) + 2) == 0xBD)
-#endif
+#define UTF8_IS_SURROGATE(s) cBOOL(is_SURROGATE_utf8(s))
+#define UTF8_IS_REPLACEMENT(s, send) cBOOL(is_REPLACEMENT_utf8_safe(s,send))
 
 /*               ASCII              EBCDIC I8
  * U+10FFFF: \xF4\x8F\xBF\xBF  \xF9\xA1\xBF\xBF\xBF    max legal Unicode
@@ -389,60 +366,12 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                                        && (*(s) > 0xF4 || (*((s) + 1) >= 0x90)))
 #endif
 
-/*        ASCII                     EBCDIC I8
- * U+FDCF: \xEF\xB7\x8F                \xF1\xBF\xAE\xAF        last before non-char block
- * U+FDD0: \xEF\xB7\x90                \xF1\xBF\xAE\xB0        first non-char in block
- * U+FDEF: \xEF\xB7\xAF                \xF1\xBF\xAF\xAF        last non-char in block
- * U+FDF0: \xEF\xB7\xB0                \xF1\xBF\xAF\xB0        first after non-char block
- * U+FFFF: \xEF\xBF\xBF                \xF1\xBF\xBF\xBF
- * U+1FFFF: \xF0\x9F\xBF\xBF   \xF3\xBF\xBF\xBF
- * U+2FFFF: \xF0\xAF\xBF\xBF   \xF5\xBF\xBF\xBF
- * U+3FFFF: \xF0\xBF\xBF\xBF   \xF7\xBF\xBF\xBF
- * U+4FFFF: \xF1\x8F\xBF\xBF   \xF8\xA9\xBF\xBF\xBF
- * U+5FFFF: \xF1\x9F\xBF\xBF   \xF8\xAB\xBF\xBF\xBF
- * U+6FFFF: \xF1\xAF\xBF\xBF   \xF8\xAD\xBF\xBF\xBF
- * U+7FFFF: \xF1\xBF\xBF\xBF   \xF8\xAF\xBF\xBF\xBF
- * U+8FFFF: \xF2\x8F\xBF\xBF   \xF8\xB1\xBF\xBF\xBF
- * U+9FFFF: \xF2\x9F\xBF\xBF   \xF8\xB3\xBF\xBF\xBF
- * U+AFFFF: \xF2\xAF\xBF\xBF   \xF8\xB5\xBF\xBF\xBF
- * U+BFFFF: \xF2\xBF\xBF\xBF   \xF8\xB7\xBF\xBF\xBF
- * U+CFFFF: \xF3\x8F\xBF\xBF   \xF8\xB9\xBF\xBF\xBF
- * U+DFFFF: \xF3\x9F\xBF\xBF   \xF8\xBB\xBF\xBF\xBF
- * U+EFFFF: \xF3\xAF\xBF\xBF   \xF8\xBD\xBF\xBF\xBF
- * U+FFFFF: \xF3\xBF\xBF\xBF   \xF8\xBF\xBF\xBF\xBF
- * U+10FFFF: \xF4\x8F\xBF\xBF  \xF9\xA1\xBF\xBF\xBF
- */
-#define UTF8_IS_NONCHAR_(s) (                                                   \
-    *(s) >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE                        \
-    && ! UTF8_IS_SUPER(s)                                                       \
-    && UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_FIRST_PROBLEMATIC(s)         \
-
-#ifdef EBCDIC /* Both versions assume well-formed UTF8 */
-#   define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)           \
-    ((*(s) == UTF_TO_NATIVE(0xF1)                                               \
-       && (*((s) + 1) == UTF_TO_NATIVE(0xBF)                                    \
-       &&    ((*((s) + 2) == UTF_TO_NATIVE(0xAE)                                \
-           && *((s) + 3) >= UTF_TO_NATIVE(0xB0))                               \
-         || (*((s) + 2) == UTF_TO_NATIVE(0xAF)                                 \
-           && *((s) + 3) <= UTF_TO_NATIVE(0xAF)))))                            \
-    || (UTF8SKIP(*(s)) > 3                                                      \
-       /* (These were all derived by inspection and experimentation with an */ \
-       /* editor)  The next line checks the next to final byte in the char */  \
-       && *((s) + UTF8SKIP(*(s)) - 2) == UTF_TO_NATIVE(0xBF)                   \
-       && *((s) + UTF8SKIP(*(s)) - 3) == UTF_TO_NATIVE(0xBF)                   \
-        && (NATIVE_TO_UTF(*((s) + UTF8SKIP(*(s)) - 4)) & 0x81) == 0x81          \
-        && (NATIVE_TO_UTF(*((s) + UTF8SKIP(*(s)) - 1)) & 0xBE) == 0XBE))
-#else
-#   define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)           \
-    ((*(s) == 0xEF                                                              \
-       && ((*((s) + 1) == 0xB7 && (*((s) + 2) >= 0x90 && (*((s) + 2) <= 0xAF)))\
-               /* Gets U+FFF[EF] */                                            \
-           || (*((s) + 1) == 0xBF && ((*((s) + 2) & 0xBE) == 0xBE))))          \
- || ((*((s) + 2) == 0xBF                                                        \
-        && (*((s) + 3) & 0xBE) == 0xBE                                         \
-           /* Excludes things like U+10FFE = \xF0\x90\xBF\xBE */               \
-        && (*((s) + 1) & 0x8F) == 0x8F)))
-#endif
+/* These are now machine generated, and the 'given' clause is no longer
+ * applicable */
+#define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)             \
+                                                    cBOOL(is_NONCHAR_utf8(s))
+#define UTF8_IS_NONCHAR_(s)                                                    \
+                    UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)
 
 #define UNICODE_SURROGATE_FIRST                0xD800
 #define UNICODE_SURROGATE_LAST         0xDFFF