* PL_utf2e, with its inverse being PL_e2utf. They are constructed so that
* all EBCDIC invariants remain invariant, but no others do, and the first
* byte of a variant will always have its upper bit set. But note that
- * the upper bit of some invariants is also 1.
+ * the upper bit of some invariants is also 1. The table is also designed
+ * so that lexically comparing two UTF-EBCDIC-variant characters yields
+ * their Unicode code point order. (To get native code point order, one has
+ * to convert the Latin-1-range characters to their native code point
+ * values.)
*
* For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in
* UTF-EBCDIC. Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3
#define I8_TO_NATIVE_UTF8(b) (__ASSERT_(FITS_IN_8_BITS(b)) PL_utf2e[(U8)(b)])
/* Transforms in wide UV chars */
-#define NATIVE_TO_UNI(ch) (FITS_IN_8_BITS(ch) ? NATIVE_TO_LATIN1(ch) : (ch))
-#define UNI_TO_NATIVE(ch) (FITS_IN_8_BITS(ch) ? LATIN1_TO_NATIVE(ch) : (ch))
+/* Native <-> Unicode conversion for wide (UV) values.  Only a value that
+ * fits in 8 bits is remapped, through the corresponding Latin-1 macro;
+ * anything wider is handed back as-is, cast to UV.  'ch' appears twice in
+ * each expansion, so pass an expression without side effects. */
+#define NATIVE_TO_UNI(ch)                                                  \
+            (! FITS_IN_8_BITS(ch) ? (UV) (ch) : NATIVE_TO_LATIN1(ch))
+#define UNI_TO_NATIVE(ch)                                                  \
+            (! FITS_IN_8_BITS(ch) ? (UV) (ch) : LATIN1_TO_NATIVE(ch))
/* How wide can a single UTF-8 encoded character become in bytes. */
/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
above what a 64 bit word can hold */
+/* This is a fundamental property of UTF-EBCDIC: every Unicode (not native)
+ * code point below 0xA0 is an invariant, encoded as a single byte */
#define OFFUNI_IS_INVARIANT(c) (((UV)(c)) < 0xA0)
/* It turns out that on EBCDIC platforms, the invariants are the characters
#define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv) \
&& (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
-/* Internal macro to be used only in the definitions of the next two */
-#define __BASE_UNI_SKIP(uv) ((uv) < 0x400 ? 2 : \
- (uv) < 0x4000 ? 3 : \
- (uv) < 0x40000 ? 4 : \
- (uv) < 0x400000 ? 5 : \
- (uv) < 0x4000000 ? 6 : \
- (uv) < 0x40000000 ? 7 : UTF8_MAXBYTES )
-
-/* Input is a true Unicode (not-native) code point */
-#define OFFUNISKIP(uv) ( OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
-
-#define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : _BASE_UNI_SKIP(uv))
-
/* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
* compare, but now only have to do a single lookup by using a bit in
* l1_char_class_tab.h.
* definitions. */
#define UTF8_IS_START(c) _generic_isCC(c, _CC_UTF8_IS_START)
+/* Mask that selects the top three bits of a byte in its I8 form.  A byte is
+ * a continuation byte when those three bits equal UTF_CONTINUATION_MARK. */
+#define UTF_IS_CONTINUATION_MASK 0xE0
+
#define UTF8_IS_CONTINUATION(c) _generic_isCC(c, _CC_UTF8_IS_CONTINUATION)
+/* The above instead could be written as this:
+#define UTF8_IS_CONTINUATION(c) \
+ ((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \
+ == UTF_CONTINUATION_MARK)
+ */
+
/* Equivalent to ! UVCHR_IS_INVARIANT(c) */
#define UTF8_IS_CONTINUED(c) cBOOL(FITS_IN_8_BITS(c) \
&& ! (PL_charclass[(U8) (c)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
_generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
#define UTF_CONTINUATION_MARK 0xA0
-#define UTF_CONTINUATION_MASK ((U8)0x1f)
#define UTF_ACCUMULATION_SHIFT 5
/* ^? is defined to be APC on EBCDIC systems. See the definition of toCTRL()
* for more */
#define QUESTION_MARK_CTRL LATIN1_TO_NATIVE(0x9F)
-#define MAX_UTF8_TWO_BYTE 0x3FF
-
/*
* ex: set ts=8 sts=4 sw=4 et:
*/