infnan: Notes on the nan payload.

[perl5.git] / utfebcdic.h
diff --git a/utfebcdic.h b/utfebcdic.h

index 7eec66f..24101ed 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -51,8 +51,7 @@
   * The EBCDIC invariants have been chosen to be those characters whose Unicode
   * equivalents have ordinal numbers less than 160, that is the same characters
   * that are expressible in ASCII, plus the C1 controls.  So there are 160
- * invariants instead of the 128 in UTF-8.  (My guess is that this is because
- * the C1 control NEL (and maybe others) is important in IBM.)
+ * invariants instead of the 128 in UTF-8.
   *
   * The purpose of Step 3 is to make the encoding be invariant for the chosen
   * characters.  This messes up the convenient patterns found in step 2, so
@@ -84,6 +83,20 @@
   * pages.  Best is to convert to I8 before sending them, as the I8
   * representation is the same no matter what the underlying code page is.
   *
+ * Because of the way UTF-EBCDIC is constructed, the lowest 32 code points that
+ * aren't equivalent to ASCII characters or C1 controls form the set of
+ * continuation bytes; the remaining 64 non-ASCII, non-control code points form
+ * the potential start bytes, in order.  (However, the first 5 of these lead to
+ * malformed overlongs, so there really are only 59 start bytes.) Hence the
+ * UTF-EBCDIC for the smallest variant code point, 0xA0, will likely have 0x41
+ * as its continuation byte, provided 0x41 isn't an ASCII or C1 equivalent.
+ * And its start byte will be the code point that is 37 (32+5) non-ASCII,
+ * non-control code points past it.  (0 - 3F are controls, and 40 is SPACE,
+ * leaving 41 as the first potentially available one.)  In contrast, on ASCII
+ * platforms, the first 64 (not 32) non-ASCII code points are the continuation
+ * bytes.  And the first 2 (not 5) potential start bytes form overlong
+ * malformed sequences.
+ *
   * EBCDIC characters above 0xFF are the same as Unicode in Perl's
   * implementation of all 3 encodings, so for those Step 1 is trivial.
   *
@@ -201,11 +214,5 @@ END_EXTERN_C
  #define MAX_UTF8_TWO_BYTE 0x3FF
  
  /*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
   * ex: set ts=8 sts=4 sw=4 et:
   */