utf8.h, utfebcdic.h: Fix-up UTF8_MAXBYTES_CASE defn

author Karl Williamson <khw@cpan.org>

Fri, 30 Oct 2015 03:19:40 +0000 (21:19 -0600)

committer Karl Williamson <khw@cpan.org>

Sun, 6 Dec 2015 05:06:49 +0000 (22:06 -0700)
author Karl Williamson <khw@cpan.org>
Fri, 30 Oct 2015 03:19:40 +0000 (21:19 -0600)
committer Karl Williamson <khw@cpan.org>
Sun, 6 Dec 2015 05:06:49 +0000 (22:06 -0700)
diff --git a/utf8.h b/utf8.h

index 84766b7..77b6a6e 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -265,24 +265,12 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
                       (uv) < 0x80000000     ? 6 : 7 )
  #endif
  
-/* The maximum number of UTF-8 bytes a single Unicode character can
- * uppercase/lowercase/fold into.  Unicode guarantees that the maximum
- * expansion is 3 characters.  On ASCIIish platforms, the highest Unicode
- * character occupies 4 bytes, therefore this number would be 12, but this is
- * smaller than the maximum width a single above-Unicode character can occupy,
- * so use that instead */
-#if UTF8_MAXBYTES < 12
-#error UTF8_MAXBYTES must be at least 12
-#endif
-
  /* ^? is defined to be DEL on ASCII systems.  See the definition of toCTRL()
   * for more */
  #define QUESTION_MARK_CTRL  DEL_NATIVE
  
  #define MAX_UTF8_TWO_BYTE 0x7FF
  
-#define UTF8_MAXBYTES_CASE     UTF8_MAXBYTES
-
  /*
  
  =for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
@@ -302,6 +290,19 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
  
  #endif /* EBCDIC vs ASCII */
  
+/* The maximum number of UTF-8 bytes that the uppercase, lowercase, or fold of
+ * a single character can occupy.  Unicode guarantees that a code point expands
+ * to at most UTF8_MAX_FOLD_CHAR_EXPAND characters, and above-Unicode code
+ * points fold to themselves, so the worst-case expansion is that count times
+ * OFFUNISKIP(0x10FFFF), the byte length of the highest Unicode code point.  A
+ * very large code point under Perl's extended UTF-8 can be longer than that,
+ * though, so use whichever of the two is larger: the result must be able to
+ * hold any single character.  (Which one wins differs on ASCII vs EBCDIC.) */
+#define UTF8_MAXBYTES_CASE                                                     \
+        (UTF8_MAXBYTES >= (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF))    \
+                           ? UTF8_MAXBYTES                                      \
+                           : (UTF8_MAX_FOLD_CHAR_EXPAND * OFFUNISKIP(0x10FFFF)))
+
  /* Rest of these are attributes of Unicode and perl's internals rather than the
   * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
   * this level; the macros that some of these call may have different
diff --git a/utfebcdic.h b/utfebcdic.h

index bf54d4c..c5a7859 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -239,12 +239,6 @@ above what a 64 bit word can hold */
  #define UTF_CONTINUATION_MASK          ((U8)0x1f)
  #define UTF_ACCUMULATION_SHIFT         5
  
-/* The maximum number of UTF-8 bytes a single Unicode character can
- * uppercase/lowercase/fold into.  Unicode guarantees that the maximum
- * expansion is 3 characters.  On EBCDIC platforms, the highest Unicode
- * character occupies 5 bytes, therefore this number is 15 */
-#define UTF8_MAXBYTES_CASE     15
-
  /* ^? is defined to be APC on EBCDIC systems.  See the definition of toCTRL()
   * for more */
  #define QUESTION_MARK_CTRL   LATIN1_TO_NATIVE(0x9F)
author	Karl Williamson <khw@cpan.org>
	Fri, 30 Oct 2015 03:19:40 +0000 (21:19 -0600)
committer	Karl Williamson <khw@cpan.org>
	Sun, 6 Dec 2015 05:06:49 +0000 (22:06 -0700)
utf8.h		patch | blob | blame | history
utfebcdic.h		patch | blob | blame | history