X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2d1545e53e75b1c3ae16ad055ae011e2e015e0c3..a7db2e35a577614d7a6eaf05114b2d6224f0ab33:/utfebcdic.h

diff --git a/utfebcdic.h b/utfebcdic.h
index 5912b3a..10b666a 100644
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -27,17 +27,25 @@
  *	invariant byte	    starts with 0	starts with 0 or 100
  *	continuation byte   starts with 10	starts with 101
  *	start byte	    same in both: if the code point requires N bytes,
- *			    then the leading N bits are 1, followed by a 0.  (No
- *			    trailing 0 for the very largest possible allocation
- *			    in I8, far beyond the current Unicode standard's
- *			    max, as shown in the comment later in this file.)
+ *			    then the leading N bits are 1, followed by a 0.  If
+ *			    all 8 bits in the first byte are 1, the code point
+ *			    will occupy 14 bytes (compared to 13 in Perl's
+ *			    extended UTF-8).  This is incompatible with what
+ *			    tr16 implies should be the representation of code
+ *			    points 2**30 and above, but allows Perl to be able
+ *			    to represent all code points that fit in a 64-bit
+ *			    word in either our extended UTF-EBCDIC or UTF-8.
  *  3)	Use the algorithm in tr16 to convert each byte from step 2 into
  *	final UTF-EBCDIC.  This is done by table lookup from a table
  *	constructed from the algorithm, reproduced in ebcdic_tables.h as
  *	PL_utf2e, with its inverse being PL_e2utf.  They are constructed so that
  *	all EBCDIC invariants remain invariant, but no others do, and the first
  *	byte of a variant will always have its upper bit set.  But note that
- *	the upper bit of some invariants is also 1.
+ *	the upper bit of some invariants is also 1.  The table also is designed
+ *	so that lexically comparing two UTF-EBCDIC-variant characters yields
+ *	the Unicode code point order.  (To get native code point order, one has
+ *	to convert the latin1-range characters to their native code point
+ *	value.)
  *
  *  For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in
  *  UTF-EBCDIC.  Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3
@@ -142,13 +150,24 @@ END_EXTERN_C
 #define I8_TO_NATIVE_UTF8(b)           (__ASSERT_(FITS_IN_8_BITS(b)) PL_utf2e[(U8)(b)])
 
 /* Transforms in wide UV chars */
-#define NATIVE_TO_UNI(ch)    (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (ch))
-#define UNI_TO_NATIVE(ch)    (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (ch))
+#define NATIVE_TO_UNI(ch)    (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (UV) (ch))
+#define UNI_TO_NATIVE(ch)    (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (UV) (ch))
+
+/* How wide can a single UTF-8 encoded character become in bytes. */
+/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
+ * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
+ * expressed with 5 bytes.  However, Perl thinks of UTF-8 as a way to encode
+ * non-negative integers in a binary format, even those above Unicode.  14 is
+ * the smallest number that covers 2**64
+ *
+ * WARNING: This number must be in sync with the value in
+ * regen/charset_translations.pl. */
+#define UTF8_MAXBYTES 14
 
 /*
-  The following table is adapted from tr16, it shows I8 encoding of Unicode code points.
+  The following table is adapted from tr16, it shows the I8 encoding of Unicode code points.
 
-        Unicode                         U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th byte
+        Unicode                         U32 Bit pattern 1st Byte 2nd Byte 3rd Byte 4th Byte 5th Byte 6th Byte 7th Byte
     U+0000..U+007F                     000000000xxxxxxx 0xxxxxxx
     U+0080..U+009F                     00000000100xxxxx 100xxxxx
     U+00A0..U+03FF                     000000yyyyyxxxxx 110yyyyy 101xxxxx
@@ -156,21 +175,20 @@ END_EXTERN_C
     U+4000..U+3FFFF                 0wwwzzzzzyyyyyxxxxx 11110www 101zzzzz 101yyyyy 101xxxxx
    U+40000..U+3FFFFF            0vvwwwwwzzzzzyyyyyxxxxx 111110vv 101wwwww 101zzzzz 101yyyyy 101xxxxx
   U+400000..U+3FFFFFF       0uvvvvvwwwwwzzzzzyyyyyxxxxx 1111110u 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
- U+4000000..U+7FFFFFFF 0tuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 1111111t 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
+ U+4000000..U+3FFFFFFF 00uuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111110 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
 
-  Note: The I8 transformation is valid for UCS-4 values X'0' to
-  X'7FFFFFFF' (the full extent of ISO/IEC 10646 coding space).
+Beyond this, Perl uses an incompatible extension, similar to the one used in
+regular UTF-8.  There are now 14 bytes.  A full 32 bits of information thus looks like this:
+                                                        1st Byte  2nd-7th 8th Byte 9th Byte 10th B   11th B   12th B   13th B   14th B
+U+40000000..U+FFFFFFFF ttuuuuuvvvvvwwwwwzzzzzyyyyyxxxxx 11111111 10100000 101000tt 101uuuuu 101vvvvv 101wwwww 101zzzzz 101yyyyy 101xxxxx
 
- */
+For 32-bit words, the 2nd through 7th bytes effectively function as leading
+zeros.  Above 32 bits, these fill up, with each byte yielding 5 bits of
+information, so that with 13 continuation bytes, we can handle 65 bits, just
+above what a 64 bit word can hold */
 
-/* Input is a true Unicode (not-native) code point */
-#define OFFUNISKIP(uv) ( (uv) < 0xA0        ? 1 : \
-		         (uv) < 0x400       ? 2 : \
-		         (uv) < 0x4000      ? 3 : \
-		         (uv) < 0x40000     ? 4 : \
-		         (uv) < 0x400000    ? 5 : \
-		         (uv) < 0x4000000   ? 6 : 7 )
 
+/* This is a fundamental property of UTF-EBCDIC */
 #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) <  0xA0)
 
 /* It turns out that on EBCDIC platforms, the invariants are the characters
@@ -180,13 +198,6 @@ END_EXTERN_C
 #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv)                        \
    && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
 
-#define UVCHR_SKIP(uv) (UVCHR_IS_INVARIANT(uv)  ? 1 :                       \
-                        (uv) < 0x400            ? 2 :                       \
-		        (uv) < 0x4000           ? 3 :                       \
-		        (uv) < 0x40000          ? 4 :                       \
-		        (uv) < 0x400000         ? 5 :                       \
-		        (uv) < 0x4000000        ? 6 : 7 )
-
 /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
  * compare, but now only have to do a single lookup by using a bit in
  * l1_char_class_tab.h.
@@ -194,8 +205,17 @@ END_EXTERN_C
  * definitions. */
 
 #define UTF8_IS_START(c)		_generic_isCC(c, _CC_UTF8_IS_START)
+
+#define UTF_IS_CONTINUATION_MASK    0xE0
+
 #define UTF8_IS_CONTINUATION(c)		_generic_isCC(c, _CC_UTF8_IS_CONTINUATION)
 
+/* The above instead could be written as this:
+#define UTF8_IS_CONTINUATION(c)                                                 \
+            (((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK)                 \
+                                                == UTF_CONTINUATION_MARK)
+ */
+
 /* Equivalent to ! UVCHR_IS_INVARIANT(c) */
 #define UTF8_IS_CONTINUED(c) 		cBOOL(FITS_IN_8_BITS(c)                 \
    && ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
@@ -213,33 +233,13 @@ END_EXTERN_C
 #define isUTF8_POSSIBLY_PROBLEMATIC(c)                                          \
                 _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
 
-/* Can't exceed 7 on EBCDIC platforms */
-#define UTF_START_MARK(len) (0xFF & (0xFE << (7-(len))))
-
-#define UTF_START_MASK(len) (((len) >= 6) ? 0x01 : (0x1F >> ((len)-2)))
 #define UTF_CONTINUATION_MARK		0xA0
-#define UTF_CONTINUATION_MASK		((U8)0x1f)
 #define UTF_ACCUMULATION_SHIFT		5
 
-/* How wide can a single UTF-8 encoded character become in bytes. */
-/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
- * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
- * expressed with 5 bytes.  However, Perl thinks of UTF-8 as a way to encode
- * non-negative integers in a binary format, even those above Unicode */
-#define UTF8_MAXBYTES 7
-
-/* The maximum number of UTF-8 bytes a single Unicode character can
- * uppercase/lowercase/fold into.  Unicode guarantees that the maximum
- * expansion is 3 characters.  On EBCDIC platforms, the highest Unicode
- * character occupies 5 bytes, therefore this number is 15 */
-#define UTF8_MAXBYTES_CASE	15
-
 /* ^? is defined to be APC on EBCDIC systems.  See the definition of toCTRL()
  * for more */
 #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
 
-#define MAX_UTF8_TWO_BYTE 0x3FF
-
 /*
  * ex: set ts=8 sts=4 sw=4 et:
  */