/* NATIVE8_TO_UNI(ch) is a plain alias: it expands to NATIVE_TO_LATIN1(ch),
 * passing its argument through unchanged. */
#define NATIVE8_TO_UNI(ch) NATIVE_TO_LATIN1(ch)
/* This defines the 1-bits that are to be in the first byte of a multi-byte
- * UTF-8 encoded character that give the number of bytes that comprise the
- * character. 'len' is the number of bytes in the multi-byte sequence. */
+ * UTF-8 encoded character that mark it as a start byte and give the number of
+ * bytes that comprise the character. 'len' is the number of bytes in the
+ * multi-byte sequence. */
#define UTF_START_MARK(len)                                                 \
    (((len) <= 7) ? ((0xFE << (7 - (len))) & 0xFF) : 0xFF)
/* Masks out the initial one bits in a start byte, leaving the real data ones.
* beginning of a utf8 character. Now that foo_utf8() determines that itself,
* no need to do it again here
*/
-#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF ) \
- ? isIDFIRST(*(p)) \
+/* Yields isIDFIRST(*(p)) when IN_BYTES is true or UTF is false, and
+ * isIDFIRST_utf8() applied to p cast to const U8* otherwise. Both arguments
+ * are parenthesized in the expansion so that expression arguments keep their
+ * intended grouping. */
+#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !(UTF)) \
+ ? isIDFIRST(*(p)) \
: isIDFIRST_utf8((const U8*)(p)))
-#define isWORDCHAR_lazy_if(p,UTF) ((IN_BYTES || (!UTF )) \
- ? isWORDCHAR(*(p)) \
+/* Yields isWORDCHAR(*(p)) when IN_BYTES is true or UTF is false, and
+ * isWORDCHAR_utf8() applied to p cast to const U8* otherwise. Both arguments
+ * are parenthesized in the expansion so that expression arguments keep their
+ * intended grouping. */
+#define isWORDCHAR_lazy_if(p,UTF) ((IN_BYTES || (!(UTF))) \
+ ? isWORDCHAR(*(p)) \
: isWORDCHAR_utf8((const U8*)(p)))
/* isALNUM_lazy_if(p,UTF) is a synonym: it expands to isWORDCHAR_lazy_if with
 * the same two arguments. */
#define isALNUM_lazy_if(p,UTF) isWORDCHAR_lazy_if(p,UTF)
* PL_utf2e, with its inverse being PL_e2utf. They are constructed so that
* all EBCDIC invariants remain invariant, but no others do, and the first
* byte of a variant will always have its upper bit set. But note that
- * the upper bit of some invariants is also 1.
+ * the upper bit of some invariants is also 1. The table is also designed
+ * so that lexically comparing two UTF-EBCDIC-variant characters yields
+ * the Unicode code point order. (To get native code point order, one has
+ * to convert the latin1-range characters to their native code point
+ * value.)
*
* For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in
* UTF-EBCDIC. Step 1) converts it to 65, Step 2) leaves it at 65, and Step 3)
above what a 64 bit word can hold */
+/* This is a fundamental property of UTF-EBCDIC */
#define OFFUNI_IS_INVARIANT(c) ((UV) (c) <= 0x9F) /* true iff (UV)(c) is in 0x00..0x9F */
/* It turns out that on EBCDIC platforms, the invariants are the characters