utf8.h: Correct macros for EBCDIC

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index f990f37..4738648 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -105,14 +105,20 @@ END_EXTERN_C
  #include "regcharclass.h"
  #include "unicode_constants.h"
  
-/* Native character to iso-8859-1 */
-#define NATIVE_TO_ASCII(ch)      (ch)
-#define ASCII_TO_NATIVE(ch)      (ch)
-/* Transform after encoding */
-#define NATIVE_TO_UTF(ch)        (ch)
-#define NATIVE_TO_I8(ch) NATIVE_TO_UTF(ch)     /* a clearer synonym */
-#define UTF_TO_NATIVE(ch)        (ch)
-#define I8_TO_NATIVE(ch) UTF_TO_NATIVE(ch)
+/* Native character to/from iso-8859-1.  These are the identity functions on
+ * ASCII platforms */
+#define NATIVE_TO_LATIN1(ch)     (ch)
+#define LATIN1_TO_NATIVE(ch)     (ch)
+
+/* I8 is an intermediate version of UTF-8 used only in UTF-EBCDIC.  We thus
+ * consider it to be identical to UTF-8 on ASCII platforms.  Strictly speaking
+ * UTF-8 and UTF-EBCDIC are two different things, but we often conflate them
+ * because they are 8-bit encodings that serve the same purpose in Perl, and
+ * rarely do we need to distinguish them.  The term "NATIVE_UTF8" applies to
+ * whichever one is applicable on the current platform */
+#define NATIVE_UTF8_TO_I8(ch) (ch)
+#define I8_TO_NATIVE_UTF8(ch) (ch)
+
  /* Transforms in wide UV chars */
  #define UNI_TO_NATIVE(ch)        (ch)
  #define NATIVE_TO_UNI(ch)        (ch)
@@ -136,7 +142,7 @@ END_EXTERN_C
     U+0800..U+0FFF      E0      * A0..BF    80..BF
     U+1000..U+CFFF       E1..EC    80..BF    80..BF
     U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
+   U+D800..U+DFFF       ED        A0..BF    80..BF  (surrogates)
     U+E000..U+FFFF       EE..EF    80..BF    80..BF
    U+10000..U+3FFFF     F0      * 90..BF    80..BF    80..BF
    U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
@@ -144,7 +150,7 @@ END_EXTERN_C
      Below are non-Unicode code points
   U+110000..U+13FFFF    F4        90..BF    80..BF    80..BF
   U+110000..U+1FFFFF    F5..F7    80..BF    80..BF    80..BF
- U+200000:              F8..    * 88..BF    80..BF    80..BF    80..BF
+ U+200000..:            F8..    * 88..BF    80..BF    80..BF    80..BF
  
  Note the gaps before several of the byte entries above marked by '*'.  These are
  caused by legal UTF-8 avoiding non-shortest encodings: it is technically
@@ -273,8 +279,21 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
   * this level; the macros that some of these call may have different
   * definitions in the two encodings */
  
-#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* a clearer synonym */
-
+/* When the domain is restricted to ASCII, these may make more sense to the
+ * reader than the ones with Latin1 in the name */
+#define NATIVE_TO_ASCII(ch)      NATIVE_TO_LATIN1(ch)
+#define ASCII_TO_NATIVE(ch)      LATIN1_TO_NATIVE(ch)
+
+/* More or less misleadingly-named defines, retained for back compat */
+#define NATIVE_TO_UTF(ch)        NATIVE_UTF8_TO_I8(ch)
+#define NATIVE_TO_I8(ch)         NATIVE_UTF8_TO_I8(ch)
+#define UTF_TO_NATIVE(ch)        I8_TO_NATIVE_UTF8(ch)
+#define I8_TO_NATIVE(ch)         I8_TO_NATIVE_UTF8(ch)
+#define NATIVE8_TO_UNI(ch)       NATIVE_TO_LATIN1(ch)
+
+/* Adds a UTF8 continuation byte 'new' of information to a running total code
+ * point 'old' of all the continuation bytes so far.  This is designed to be
+ * used in a loop to convert from UTF-8 to the code point represented */
  #define UTF8_ACCUMULATE(old, new)      (((old) << UTF_ACCUMULATION_SHIFT)     \
                                          | (((U8)new) & UTF_CONTINUATION_MASK))
  
@@ -300,25 +319,30 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
   * works on both UTF-8 encoded strings and non-encoded, as it returns TRUE in
   * each for the exact same set of bit patterns.  (And it works on any byte in a
   * UTF-8 encoded string) */
-#define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
  
-#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
  
  #define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
  
  /* The macros in the next sets are used to generate the two utf8 or utfebcdic
   * bytes from an ordinal that is known to fit into two bytes; it must be less
   * than 0x3FF to work across both encodings. */
-/* Nocast allows these to be used in the case label of a switch statement */
-#define UTF8_TWO_BYTE_HI_nocast(c)     NATIVE_TO_I8(((c)                       \
+/* Nocast allows these to be used in the case label of a switch statement;
+ * however this won't work for ebcdic, and should be avoided.  Use
+ * regen/unicode_constants instead */
+#define UTF8_TWO_BYTE_HI_nocast(c)     I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)     \
                          >> UTF_ACCUMULATION_SHIFT) | (0xFF & UTF_START_MARK(2)))
-#define UTF8_TWO_BYTE_LO_nocast(c)  NATIVE_TO_I8(((c) & UTF_CONTINUATION_MASK)  \
-                                    | UTF_CONTINUATION_MARK)
+#define UTF8_TWO_BYTE_LO_nocast(c)  I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)         \
+                                                  & UTF_CONTINUATION_MASK)      \
+                                                | UTF_CONTINUATION_MARK)
  
  #define UTF8_TWO_BYTE_HI(c)    ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
  #define UTF8_TWO_BYTE_LO(c)    ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
  
-/* This name is used when the source is a single byte */
+/* This name is used when the source is a single byte.  For EBCDIC these could
+ * be more efficiently written; the reason is that things above 0xFF have to be
+ * special-cased, which is done by the EBCDIC version of NATIVE_TO_UNI() */
  #define UTF8_EIGHT_BIT_HI(c)   UTF8_TWO_BYTE_HI((U8)(c))
  #define UTF8_EIGHT_BIT_LO(c)   UTF8_TWO_BYTE_LO((U8)(c))
  
@@ -412,9 +436,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
   * U+110001: \xF4\x90\x80\x81  \xF9\xA2\xA0\xA0\xA1
   */
  #ifdef EBCDIC /* Both versions assume well-formed UTF8 */
-#   define UTF8_IS_SUPER(s)  (NATIVE_TO_I8(* (U8*) (s)) >= 0xF9                 \
-                              && (NATIVE_TO_I8(* (U8*) (s)) > 0xF9              \
-                                  || (NATIVE_TO_I8(* (U8*) ((s)) + 1 >= 0xA2))))
+#   define UTF8_IS_SUPER(s) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9             \
+                         && (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9              \
+                             || (NATIVE_UTF8_TO_I8(* ((U8*) (s) + 1)) >= 0xA2)))
  #else
  #   define UTF8_IS_SUPER(s) (*(U8*) (s) >= 0xF4                                 \
                              && (*(U8*) (s) > 0xF4 || (*((U8*) (s) + 1) >= 0x90)))
@@ -501,8 +525,8 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
          (ANYOF_NONBITMAP(node)) && \
          (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
          ((end) > (input) + 1) && \
-        toLOWER((input)[0]) == 's' && \
-        toLOWER((input)[1]) == 's')
+        toFOLD((input)[0]) == 's' && \
+        toFOLD((input)[1]) == 's')
  #define SHARP_S_SKIP 2
  
  /* If you want to exclude surrogates, and beyond legal Unicode, see the blame