perlapi: Consolidate SvPVX-ish entries

[perl5.git] / utfebcdic.h
diff --git a/utfebcdic.h b/utfebcdic.h

index 4a66637..97b8f70 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -202,17 +202,33 @@ possible to UTF-8-encode a single code point in different ways, but that is
  explicitly forbidden, and the shortest possible encoding should always be used
  (and that is what Perl does). */
  
-/* Comments as to the meaning of each are given at their corresponding utf8.h
- * definitions. */
+/* It turns out that just this one number is sufficient to derive all the basic
+ * macros for UTF-8 and UTF-EBCDIC.  Everything follows from the fact that
+ * there are 6 bits of real information in a UTF-8 continuation byte vs. 5 bits
+ * in a UTF-EBCDIC one. */
+
+#define UTF_ACCUMULATION_SHIFT         5
+
+/* Also needed is how perl handles a start byte of 8 one bits.  The decision
+ * was made to just append the minimal number of bytes after that so that code
+ * points up to 64 bits wide could be represented.  In UTF-8, that is an extra
+ * 5 bytes, and in UTF-EBCDIC it's 6.  The result is in UTF8_MAXBYTES defined
+ * above.  This implementation has the advantage that you have everything you
+ * need in the first byte.  Other ways of extending UTF-8 have been devised,
+ * some to arbitrarily high code points.  But they require looking at the next
+ * byte(s) when the first one is 8 one bits. */
+
+/* These others are for efficiency or for other decisions we've made */
  
  #define isUTF8_POSSIBLY_PROBLEMATIC(c)                                          \
                  _generic_isCC(c, _CC_UTF8_START_BYTE_IS_FOR_AT_LEAST_SURROGATE)
  
-#define UTF_ACCUMULATION_SHIFT         5
  /* ^? is defined to be APC on EBCDIC systems.  See the definition of toCTRL()
   * for more */
  #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
  
+#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
+
  /* Helper macros for isUTF8_CHAR_foo, so use those instead of this.  These were
   * generated by regen/regcharclass.pl, and then moved here.  Then they were
   * hand-edited to add some LIKELY() calls, presuming that malformations are
@@ -456,8 +472,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
   * has this start byte (expressed in I8) as the maximum */
  #define _IS_UTF8_CHAR_HIGHEST_START_BYTE 0xF9
  
-#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
-
  /*
   * ex: set ts=8 sts=4 sw=4 et:
   */