utf8.h, utfebcdic.h: Comments, white-space only

author Karl Williamson <khw@cpan.org>

Fri, 6 Nov 2015 20:21:48 +0000 (13:21 -0700)

committer Karl Williamson <khw@cpan.org>

Sun, 6 Dec 2015 16:58:06 +0000 (09:58 -0700)
author Karl Williamson <khw@cpan.org>
Fri, 6 Nov 2015 20:21:48 +0000 (13:21 -0700)
committer Karl Williamson <khw@cpan.org>
Sun, 6 Dec 2015 16:58:06 +0000 (09:58 -0700)
diff --git a/utf8.h b/utf8.h

index cd7e5bc..d792a93 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -356,8 +356,9 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
  #define NATIVE8_TO_UNI(ch)       NATIVE_TO_LATIN1(ch)
  
  /* This defines the 1-bits that are to be in the first byte of a multi-byte
- * UTF-8 encoded character that give the number of bytes that comprise the
- * character. 'len' is the number of bytes in the multi-byte sequence. */
+ * UTF-8 encoded character that mark it as a start byte and give the number of
+ * bytes that comprise the character. 'len' is the number of bytes in the
+ * multi-byte sequence. */
  #define UTF_START_MARK(len) (((len) >  7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
  
  /* Masks out the initial one bits in a start byte, leaving the real data ones.
@@ -509,11 +510,11 @@ only) byte is pointed to by C<s>.
   * beginning of a utf8 character.  Now that foo_utf8() determines that itself,
   * no need to do it again here
   */
-#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF ) \
-                                ? isIDFIRST(*(p)) \
+#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !UTF)                \
+                                ? isIDFIRST(*(p))                  \
                                  : isIDFIRST_utf8((const U8*)p))
-#define isWORDCHAR_lazy_if(p,UTF)   ((IN_BYTES || (!UTF )) \
-                                ? isWORDCHAR(*(p)) \
+#define isWORDCHAR_lazy_if(p,UTF)   ((IN_BYTES || (!UTF))           \
+                                ? isWORDCHAR(*(p))                 \
                                  : isWORDCHAR_utf8((const U8*)p))
  #define isALNUM_lazy_if(p,UTF)   isWORDCHAR_lazy_if(p,UTF)
  
diff --git a/utfebcdic.h b/utfebcdic.h

index 97c0c9d..3a4fcc2 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -41,7 +41,11 @@
   *     PL_utf2e, with its inverse being PL_e2utf.  They are constructed so that
   *     all EBCDIC invariants remain invariant, but no others do, and the first
   *     byte of a variant will always have its upper bit set.  But note that
- *     the upper bit of some invariants is also 1.
+ *     the upper bit of some invariants is also 1.  The table also is designed
+ *     so that lexically comparing two UTF-EBCDIC-variant characters yields
+ *     the Unicode code point order.  (To get native code point order, one has
+ *     to convert the latin1-range characters to their native code point
+ *     value.)
   *
   *  For example, the ordinal value of 'A' is 193 in EBCDIC, and also is 193 in
   *  UTF-EBCDIC.  Step 1) converts it to 65, Step 2 leaves it at 65, and Step 3
@@ -184,6 +188,7 @@ information, so that with 13 continuation bytes, we can handle 65 bits, just
  above what a 64 bit word can hold */
  
  
+/* This is a fundamental property of UTF-EBCDIC */
  #define OFFUNI_IS_INVARIANT(c) (((UV)(c)) <  0xA0)
  
  /* It turns out that on EBCDIC platforms, the invariants are the characters
author	Karl Williamson <khw@cpan.org>
	Fri, 6 Nov 2015 20:21:48 +0000 (13:21 -0700)
committer	Karl Williamson <khw@cpan.org>
	Sun, 6 Dec 2015 16:58:06 +0000 (09:58 -0700)
utf8.h		patch | blob | blame | history
utfebcdic.h		patch | blob | blame | history