mktables: Calculate legal chars in algorithmic names

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index f60202b..fb83507 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -380,11 +380,19 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
       ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
  
  /* Is the byte 'c' the first byte of a multi-byte UTF-8 encoded sequence?
- * This doesn't catch invariants (they are single-byte).  It also excludes the
+ * This excludes invariants (they are single-byte).  It also excludes the
   * illegal overlong sequences that begin with C0 and C1 on ASCII platforms, and
- * C0-C4 I8 start bytes on EBCDIC ones */
-#define UTF8_IS_START(c)    (__ASSERT_(FITS_IN_8_BITS(c))                   \
+ * C0-C4 I8 start bytes on EBCDIC ones.  On EBCDIC, the I8 byte E0 can't
+ * start a non-overlong sequence either, so we define a base macro and, on
+ * those platforms, extend it to also exclude E0 */
+#define UTF8_IS_START_base(c)    (__ASSERT_(FITS_IN_8_BITS(c))              \
                               (NATIVE_UTF8_TO_I8(c) >= UTF_MIN_START_BYTE))
+#ifdef EBCDIC
+#  define UTF8_IS_START(c)                                                  \
+                (UTF8_IS_START_base(c) && (c) != I8_TO_NATIVE_UTF8(0xE0))
+#else
+#  define UTF8_IS_START(c)  UTF8_IS_START_base(c)
+#endif
  
  #define UTF_MIN_ABOVE_LATIN1_BYTE                                           \
                      ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
@@ -684,32 +692,16 @@ with a ptr argument.
   * beginning of a utf8 character.  Now that foo_utf8() determines that itself,
   * no need to do it again here
   */
-#define isIDFIRST_lazy_if(p,UTF)                                            \
-            _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isIDFIRST_lazy_if",  \
-                         "isIDFIRST_lazy_if_safe",                          \
-                         cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
  #define isIDFIRST_lazy_if_safe(p, e, UTF)                                   \
                     ((IN_BYTES || !UTF)                                      \
                       ? isIDFIRST(*(p))                                      \
                       : isIDFIRST_utf8_safe(p, e))
-
-#define isWORDCHAR_lazy_if(p,UTF)                                           \
-            _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isWORDCHAR_lazy_if", \
-                         "isWORDCHAR_lazy_if_safe",                         \
-                         cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
  #define isWORDCHAR_lazy_if_safe(p, e, UTF)                                  \
                     ((IN_BYTES || !UTF)                                      \
                       ? isWORDCHAR(*(p))                                     \
                       : isWORDCHAR_utf8_safe((U8 *) p, (U8 *) e))
  #define isALNUM_lazy_if_safe(p, e, UTF) isWORDCHAR_lazy_if_safe(p, e, UTF)
  
-#define isALNUM_lazy_if(p,UTF)                                              \
-            _is_utf8_FOO(_CC_IDFIRST, (const U8 *) p, "isALNUM_lazy_if",    \
-                         "isWORDCHAR_lazy_if_safe",                         \
-                         cBOOL(UTF && ! IN_BYTES), 0, __FILE__,__LINE__)
-
  #define UTF8_MAXLEN UTF8_MAXBYTES
  
  /* A Unicode character can fold to up to 3 characters */
@@ -1017,7 +1009,13 @@ Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER
  
  #define UNI_DISPLAY_ISPRINT    0x0001
  #define UNI_DISPLAY_BACKSLASH  0x0002
-#define UNI_DISPLAY_QQ         (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
+#define UNI_DISPLAY_BACKSPACE  0x0004  /* Allow \b, but only when
+                                           UNI_DISPLAY_BACKSLASH is also set */
+#define UNI_DISPLAY_QQ         (UNI_DISPLAY_ISPRINT                \
+                                |UNI_DISPLAY_BACKSLASH              \
+                                |UNI_DISPLAY_BACKSPACE)
+
+/* Character classes could also allow \b, but not patterns in general */
  #define UNI_DISPLAY_REGEX      (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
  
  #define ANYOF_FOLD_SHARP_S(node, input, end)   \