utf8.h: Combine EBCDIC and ASCII macros

author Karl Williamson <khw@cpan.org>

Sun, 6 Dec 2015 04:56:43 +0000 (21:56 -0700)

committer Karl Williamson <khw@cpan.org>

Sun, 6 Dec 2015 05:06:50 +0000 (22:06 -0700)
author Karl Williamson <khw@cpan.org>
Sun, 6 Dec 2015 04:56:43 +0000 (21:56 -0700)
committer Karl Williamson <khw@cpan.org>
Sun, 6 Dec 2015 05:06:50 +0000 (22:06 -0700)
diff --git a/utf8.h b/utf8.h

index df106c1..c41d51c 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -245,6 +245,21 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  #  define UTF8_QUAD_MAX UINT64_C(0x1000000000)
  #endif
  
+/* ^? is defined to be DEL on ASCII systems.  See the definition of toCTRL()
+ * for more */
+#define QUESTION_MARK_CTRL  DEL_NATIVE
+
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts.  This allows code that needs to check for
+ * those to quickly exclude the vast majority of code points it will
+ * encounter */
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
+
+#endif /* EBCDIC vs ASCII */
+
+/* 2**UTF_ACCUMULATION_SHIFT - 1 */
+#define UTF_CONTINUATION_MASK  ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
+
  /* Internal macro to be used only in this file to aid in constructing other
   * publicly accessible macros.
   * The number of bytes required to express this uv in UTF-8, for just those
@@ -275,26 +290,23 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  
  /* Internal macro to be used only in this file.
   * This adds to __COMMON_UNI_SKIP the details at this platform's upper range.
- * For 64-bit ASCII platforms, we need one more test
+ * For any-sized EBCDIC platforms, or 64-bit ASCII ones, we need one more test
   * to see if just 7 bytes is needed, or if the maximum is needed.  For 32-bit
   * ASCII platforms, everything is representable by 7 bytes */
-#ifdef UV_IS_QUAD
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
  #   define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv)                       \
       (UV) (uv) < ((UV) 1U << (6 * UTF_ACCUMULATION_SHIFT)) ? 7 : UTF8_MAXBYTES)
  #else
  #   define __BASE_UNI_SKIP(uv) (__COMMON_UNI_SKIP(uv) 7)
  #endif
  
-/* ^? is defined to be DEL on ASCII systems.  See the definition of toCTRL()
- * for more */
-#define QUESTION_MARK_CTRL  DEL_NATIVE
+/* The next two macros use the base macro defined above, and add in the tests
+ * at the low-end of the range, for just 1 byte, yielding complete macros,
+ * publicly accessible. */
+
+/* Input is a true Unicode (not-native) code point */
+#define OFFUNISKIP(uv) (OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
  
-/* Surrogates, non-character code points and above-Unicode code points are
- * problematic in some contexts.  This allows code that needs to check for
- * those to to quickly exclude the vast majority of code points it will
- * encounter */
-#define isUTF8_POSSIBLY_PROBLEMATIC(c) ((U8) c >= 0xED)
-#define OFFUNISKIP(uv) ( OFFUNI_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
  /*
  
  =for apidoc Am|STRLEN|UVCHR_SKIP|UV cp
@@ -306,13 +318,8 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
   */
  #define UVCHR_SKIP(uv) ( UVCHR_IS_INVARIANT(uv) ? 1 : __BASE_UNI_SKIP(uv))
  
-
-#endif /* EBCDIC vs ASCII */
-
-/* 2**UTF_ACCUMULATION_SHIFT - 1 */
-#define UTF_CONTINUATION_MASK  ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
-
-/* 32 start bytes with UTF_ACCUMULATION_SHIFT bits of information each */
+/* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
+ * UTF_ACCUMULATION_SHIFT bits of information each */
  #define MAX_UTF8_TWO_BYTE (32 * (1U << UTF_ACCUMULATION_SHIFT) - 1)
  
  /* constrained by EBCDIC which has 5 bits per continuation byte */
diff --git a/utfebcdic.h b/utfebcdic.h

index e306122..97c0c9d 100644 (file)
--- a/utfebcdic.h
+++ b/utfebcdic.h
@@ -193,14 +193,6 @@ above what a 64 bit word can hold */
  #define UVCHR_IS_INVARIANT(uv) cBOOL(FITS_IN_8_BITS(uv)                        \
     && (PL_charclass[(U8) (uv)] & (_CC_mask(_CC_ASCII) | _CC_mask(_CC_CNTRL))))
  
-/* Internal macro to be used only in the definitions of the next two */
-#define __BASE_UNI_SKIP(uv) ((uv) < 0x400       ? 2 :                  \
-                            (uv) < 0x4000      ? 3 :                  \
-                            (uv) < 0x40000     ? 4 :                  \
-                            (uv) < 0x400000    ? 5 :                  \
-                            (uv) < 0x4000000   ? 6 :                  \
-                            (uv) < 0x40000000  ? 7 : UTF8_MAXBYTES )
-
  /* UTF-EBCDIC semantic macros - We used to transform back into I8 and then
   * compare, but now only have to do a single lookup by using a bit in
   * l1_char_class_tab.h.
author	Karl Williamson <khw@cpan.org>
	Sun, 6 Dec 2015 04:56:43 +0000 (21:56 -0700)
committer	Karl Williamson <khw@cpan.org>
	Sun, 6 Dec 2015 05:06:50 +0000 (22:06 -0700)
utf8.h		patch \| blob \| blame \| history
utfebcdic.h		patch \| blob \| blame \| history