Deprecate NATIVE_TO_NEED and ASCII_TO_NEED

[perl5.git] / utfebcdic.h
diff --git a/utfebcdic.h b/utfebcdic.h

index 3eba83d..766c977 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -152,7 +152,7 @@ unsigned char PL_utf8skip[] = {
   * remains 'A' */
  
  #if '^' == 95   /* if defined(__MVS__) || defined(??) (VM/ESA?) 1047 */
-EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */
+EXTCONST unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-1047) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -171,7 +171,7 @@ EXTCONST unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-1047) */
   0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE
  };
  
-EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */
+EXTCONST unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-1047) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
@@ -192,7 +192,7 @@ EXTCONST unsigned char PL_e2utf[] = { /* EBCDIC (IBM-1047) to I8 */
  #endif /* 1047 */
  
  #if '^' == 106  /* if defined(_OSD_POSIX) POSIX-BC */
-unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */
+unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (POSIX-BC) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x15, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -211,7 +211,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (POSIX-BC) */
   0xDC, 0xC0, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xDD, 0xFC, 0xE0, 0xFE
  };
  
-unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */
+unsigned char PL_e2utf[] = { /* UTFEBCDIC (POSIX-BC) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x0A, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
@@ -232,7 +232,7 @@ unsigned char PL_e2utf[] = { /* EBCDIC (POSIX-BC) to I8 */
  #endif /* POSIX-BC */
  
  #if '^' == 176  /* if defined(??) (OS/400?) 037 */
-unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */
+unsigned char PL_utf2e[] = { /* I8 to UTFEBCDIC (IBM-037) */
   0x00, 0x01, 0x02, 0x03, 0x37, 0x2D, 0x2E, 0x2F, 0x16, 0x05, 0x25, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x3C, 0x3D, 0x32, 0x26, 0x18, 0x19, 0x3F, 0x27, 0x1C, 0x1D, 0x1E, 0x1F,
   0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D, 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
@@ -251,7 +251,7 @@ unsigned char PL_utf2e[] = { /* I8 to EBCDIC (IBM-037) */
   0xDC, 0xDD, 0xDE, 0xDF, 0xE1, 0xEA, 0xEB, 0xEC, 0xED, 0xEE, 0xEF, 0xFA, 0xFB, 0xFC, 0xFD, 0xFE
  };
  
-unsigned char PL_e2utf[] = { /* EBCDIC (IBM-037) to I8 */
+unsigned char PL_e2utf[] = { /* UTFEBCDIC (IBM-037) to I8 */
   0x00, 0x01, 0x02, 0x03, 0x9C, 0x09, 0x86, 0x7F, 0x97, 0x8D, 0x8E, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
   0x10, 0x11, 0x12, 0x13, 0x9D, 0x85, 0x08, 0x87, 0x18, 0x19, 0x92, 0x8F, 0x1C, 0x1D, 0x1E, 0x1F,
   0x80, 0x81, 0x82, 0x83, 0x84, 0x0A, 0x17, 0x1B, 0x88, 0x89, 0x8A, 0x8B, 0x8C, 0x05, 0x06, 0x07,
@@ -503,6 +503,10 @@ EXTCONST unsigned char PL_fold[] = { /* fast EBCDIC case folding table, 'A' =>
  };
  #endif          /* 037 */
  
+/* Since the EBCDIC code pages are isomorphic to Latin1, that table is merely a
+ * duplicate */
+EXTCONST unsigned char * PL_fold_latin1 = PL_fold;
+
  #else
  EXTCONST unsigned char PL_utf8skip[];
  EXTCONST unsigned char PL_e2utf[];
@@ -510,30 +514,22 @@ EXTCONST unsigned char PL_utf2e[];
  EXTCONST unsigned char PL_e2a[];
  EXTCONST unsigned char PL_a2e[];
  EXTCONST unsigned char PL_fold[];
+EXTCONST unsigned char * PL_fold_latin1;
  #endif
  
-/* Since the EBCDIC code pages are isomorphic to Latin1, that table is merely a
- * duplicate */
-EXTCONST unsigned char * PL_fold_latin1 = PL_fold;
-
  END_EXTERN_C
  
  /* EBCDIC-happy ways of converting native code to UTF-8 */
  
-/* Native to iso-8859-1 */
-#define NATIVE_TO_ASCII(ch)      PL_e2a[(U8)(ch)]
-#define ASCII_TO_NATIVE(ch)      PL_a2e[(U8)(ch)]
-/* Transform after encoding, essentially converts to/from I8 */
-#define NATIVE_TO_UTF(ch)        PL_e2utf[(U8)(ch)]    /* to I8 */
-#define NATIVE_TO_I8(ch)         NATIVE_TO_UTF(ch)     /* synonym */
-#define UTF_TO_NATIVE(ch)        PL_utf2e[(U8)(ch)]    /* from I8 */
-#define I8_TO_NATIVE(ch)         UTF_TO_NATIVE(ch)     /* synonym */
-/* Transform in wide UV char space */
-#define NATIVE_TO_UNI(ch)        (((ch) > 255) ? (ch) : NATIVE_TO_ASCII(ch))
-#define UNI_TO_NATIVE(ch)        (((ch) > 255) ? (ch) : ASCII_TO_NATIVE(ch))
-/* Transform in invariant..byte space */
-#define NATIVE_TO_NEED(enc,ch)   ((enc) ? UTF_TO_NATIVE(NATIVE_TO_ASCII(ch)) : (ch))
-#define ASCII_TO_NEED(enc,ch)    ((enc) ? UTF_TO_NATIVE(ch) : ASCII_TO_NATIVE(ch))
+#define NATIVE_TO_LATIN1(ch)            PL_e2a[(U8)(ch)]
+#define LATIN1_TO_NATIVE(ch)            PL_a2e[(U8)(ch)]
+
+#define NATIVE_UTF8_TO_I8(ch) (ch)      PL_e2utf[(U8)(ch)]
+#define I8_TO_NATIVE_UTF8(ch) (ch)      PL_utf2e[(U8)(ch)]
+
+/* Transforms in wide UV chars */
+#define NATIVE_TO_UNI(ch)        (((ch) > 255) ? (ch) : NATIVE_TO_LATIN1(ch))
+#define UNI_TO_NATIVE(ch)        (((ch) > 255) ? (ch) : LATIN1_TO_NATIVE(ch))
  
  /*
    The following table is adapted from tr16, it shows I8 encoding of Unicode code points.
@@ -541,8 +537,6 @@ END_EXTERN_C
          Unicode                             Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte
      U+0000..U+007F                     000000000xxxxxxx 0xxxxxxx
      U+0080..U+009F                     00000000100xxxxx 100xxxxx
-    U+00A0..U+00FF                     00000000yyyxxxxx 11000yyy 101xxxxx
-
      U+00A0..U+03FF                     000000yyyyyxxxxx 110yyyyy 101xxxxx
      U+0400..U+3FFF                     00zzzzyyyyyxxxxx 1110zzzz 101yyyyy 101xxxxx
      U+4000..U+3FFFF                 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx
@@ -562,14 +556,19 @@ END_EXTERN_C
                       (uv) < 0x400000       ? 5 : \
                       (uv) < 0x4000000      ? 6 : 7 )
  
-
  #define UNI_IS_INVARIANT(c)            ((c) <  0xA0)
-/* UTF-EBCDIC semantic macros - transform back into I8 and then compare */
  
-#define UTF8_IS_START(c)               (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) != 0xE0)
-#define UTF8_IS_CONTINUATION(c)                ((NATIVE_TO_UTF(c) & 0xE0) == 0xA0)
-#define UTF8_IS_CONTINUED(c)           (NATIVE_TO_UTF(c) >= 0xA0)
-#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_TO_UTF(c) >= 0xC5 && NATIVE_TO_UTF(c) <= 0xC7)
+/* UTF-EBCDIC semantic macros - transform back into I8 and then compare
+ * Comments as to the meaning of each are given at their corresponding utf8.h
+ * definitions */
+
+#define UTF8_IS_START(c)               (NATIVE_UTF8_TO_I8(c) >= 0xC5     \
+                                         && NATIVE_UTF8_TO_I8(c) != 0xE0)
+#define UTF8_IS_CONTINUATION(c)                ((NATIVE_UTF8_TO_I8(c) & 0xE0) == 0xA0)
+#define UTF8_IS_CONTINUED(c)           (NATIVE_UTF8_TO_I8(c) >= 0xA0)
+#define UTF8_IS_DOWNGRADEABLE_START(c) (NATIVE_UTF8_TO_I8(c) >= 0xC5     \
+                                         && NATIVE_UTF8_TO_I8(c) <= 0xC7)
+#define UTF8_IS_ABOVE_LATIN1(c)        (NATIVE_UTF8_TO_I8(c) >= 0xC8)
  
  #define UTF_START_MARK(len) (((len) >  7) ? 0xFF : ((U8)(0xFE << (7-(len)))))
  #define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2)))
@@ -577,6 +576,19 @@ END_EXTERN_C
  #define UTF_CONTINUATION_MASK          ((U8)0x1f)
  #define UTF_ACCUMULATION_SHIFT         5
  
+/* How wide can a single UTF-8 encoded character become in bytes. */
+/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
+ * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
+ * expressed with 5 bytes.  However, Perl thinks of UTF-8 as a way to encode
+ * non-negative integers in a binary format, even those above Unicode */
+#define UTF8_MAXBYTES 7
+
+/* The maximum number of UTF-8 bytes a single Unicode character can
+ * uppercase/lowercase/fold into.  Unicode guarantees that the maximum
+ * expansion is 3 characters.  On EBCDIC platforms, the highest Unicode
+ * character occupies 5 bytes, therefore this number is 15 */
+#define UTF8_MAXBYTES_CASE     15
+
  /*
   * Local variables:
   * c-indentation-style: bsd