utf8.h: Remove an EBCDIC dependency

author Karl Williamson <khw@cpan.org>

Mon, 14 Jun 2021 12:13:41 +0000 (06:13 -0600)

committer Karl Williamson <khw@cpan.org>

Sat, 7 Aug 2021 11:14:43 +0000 (05:14 -0600)
author Karl Williamson <khw@cpan.org>
Mon, 14 Jun 2021 12:13:41 +0000 (06:13 -0600)
committer Karl Williamson <khw@cpan.org>
Sat, 7 Aug 2021 11:14:43 +0000 (05:14 -0600)
diff --git a/utf8.h b/utf8.h

index 1cb0b68..86340ad 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -274,8 +274,6 @@ are in the character. */
  #define isUTF8_POSSIBLY_PROBLEMATIC(c) (__ASSERT_(FITS_IN_8_BITS(c))        \
                                          (U8) c >= 0xED)
  
-#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x7FFFFFFF)
-
  #endif /* EBCDIC vs ASCII */
  
  /* It turns out that in a number of cases, that handling ASCII vs EBCDIC is a
@@ -764,6 +762,25 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
                && (! IN_BYTES))
  
  
+/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
+ * UTF-EBCDIC) any 64-bit value.  No standard known to khw ever encoded higher
+ * than a 31 bit value.  On ASCII platforms this just meant arbitrarily saying
+ * nothing could be higher than this.  On these the start byte FD gets you to
+ * 31 bits, and FE and FF are forbidden as start bytes.  On EBCDIC platforms,
+ * FD gets you only to 26 bits; adding FE to mean 7 total bytes gets you to 30
+ * bits.  To get to 31 bits, an initial FF byte was treated idiosyncratically:
+ * it was considered to be the start byte FE, meaning 7 total bytes, with its
+ * final 1 bit treated as an information bit, getting you to 31 bits.
+ *
+ * Perl used to accept this idiosyncratic interpretation of FF, but now rejects
+ * it in order to get to being able to encode 64 bits.  The bottom line is that
+ * it is a Perl extension to use the start bytes FE and FF on ASCII platforms,
+ * and the start byte FF on EBCDIC ones.  In other words, it is a Perl
+ * extension to represent anything occupying more than 31 bits on ASCII
+ * platforms, or more than 30 bits on EBCDIC ones. */
+#define UNICODE_IS_PERL_EXTENDED(uv)                                        \
+          UNLIKELY((UV) (uv) > nBIT_UMAX(31 - ONE_IF_EBCDIC_ZERO_IF_NOT))
+
  #define UTF8_ALLOW_EMPTY               0x0001  /* Allow a zero length string */
  #define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
  
diff --git a/utfebcdic.h b/utfebcdic.h

index 1b9b35a..a9691bb 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -222,8 +222,6 @@ explicitly forbidden, and the shortest possible encoding should always be used
   * for more */
  #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
  
-#define UNICODE_IS_PERL_EXTENDED(uv)    UNLIKELY((UV) (uv) > 0x3FFFFFFF)
-
  /*
   * ex: set ts=8 sts=4 sw=4 et:
   */
author	Karl Williamson <khw@cpan.org>
	Mon, 14 Jun 2021 12:13:41 +0000 (06:13 -0600)
committer	Karl Williamson <khw@cpan.org>
	Sat, 7 Aug 2021 11:14:43 +0000 (05:14 -0600)
utf8.h		patch \| blob \| blame \| history
utfebcdic.h		patch \| blob \| blame \| history