Data::Dumper: Generalize for EBCDIC platforms

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index 8945663..8418055 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -39,6 +39,20 @@
  #define _CORE_SWASH_INIT_RETURN_IF_UNDEF       0x2
  #define _CORE_SWASH_INIT_ACCEPT_INVLIST        0x4
  
+/*
+=head1 Unicode Support
+
+=for apidoc is_ascii_string
+
+This is a misleadingly-named synonym for L</is_invariant_string>.
+On ASCII-ish platforms, the name isn't misleading: the ASCII-range characters
+are exactly the UTF-8 invariants.  But EBCDIC machines have more invariants
+than just the ASCII characters, so C<is_invariant_string> is preferred.
+
+=cut
+*/
+#define is_ascii_string(s, len)     is_invariant_string(s, len)
+
  #define uvchr_to_utf8(a,b)          uvchr_to_utf8_flags(a,b,0)
  #define uvchr_to_utf8_flags(d,uv,flags)                                        \
                              uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
@@ -61,14 +75,9 @@
  #define FOLDEQ_LOCALE             (1 << 1)
  #define FOLDEQ_S1_ALREADY_FOLDED  (1 << 2)
  #define FOLDEQ_S2_ALREADY_FOLDED  (1 << 3)
+#define FOLDEQ_S1_FOLDS_SANE      (1 << 4)
+#define FOLDEQ_S2_FOLDS_SANE      (1 << 5)
  
-/*
-=for apidoc ibcmp_utf8
-
-This is a synonym for (! foldEQ_utf8())
-
-=cut
-*/
  #define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
                     cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
  
@@ -384,16 +393,16 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
   * code point whose UTF-8 is known to occupy 2 bytes; they are less efficient
   * than the EIGHT_BIT versions on EBCDIC platforms.  We use the bitwise '~'
   * operator instead of "<=" to avoid getting compiler warnings.
- * MAX_PORTABLE_UTF8_TWO_BYTE should be exactly all one bits in the lower few
+ * MAX_UTF8_TWO_BYTE should be exactly all one bits in the lower few
   * places, so the ~ works */
  #define UTF8_TWO_BYTE_HI(c)                                                    \
         (__ASSERT_((sizeof(c) ==  1)                                            \
-                  || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE))     \
-        ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+                  || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE))              \
+        ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)))
  #define UTF8_TWO_BYTE_LO(c)                                                    \
         (__ASSERT_((sizeof(c) ==  1)                                            \
-                  || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE))     \
-        ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
+                  || !(((WIDEST_UTYPE)(c)) & ~MAX_UTF8_TWO_BYTE))              \
+        ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)))
  
  /* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII
   * as it is only in overlongs. */
@@ -422,8 +431,11 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
  #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
  #define IN_UNI_8_BIT \
-           (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS) \
-            && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
+           (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT))                       \
+               || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL                 \
+                   /* -1 below is for :not_characters */                         \
+                   && _is_in_locale_category(FALSE, -1)))                        \
+              && ! IN_BYTES)
  
  
  #define UTF8_ALLOW_EMPTY               0x0001  /* Allow a zero length string */
@@ -489,7 +501,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
   * U+10FFFF: \xF4\x8F\xBF\xBF  \xF9\xA1\xBF\xBF\xBF    max legal Unicode
   * U+110000: \xF4\x90\x80\x80  \xF9\xA2\xA0\xA0\xA0
   * U+110001: \xF4\x90\x80\x81  \xF9\xA2\xA0\xA0\xA1
- */
+ *
+ * BE AWARE that this test doesn't rule out malformed UTF-8 sequences, in
+ * particular overlongs */
  #ifdef EBCDIC /* Both versions assume well-formed UTF8 */
  #   define UTF8_IS_SUPER(s) (NATIVE_UTF8_TO_I8(* (U8*) (s)) >= 0xF9             \
                           && (NATIVE_UTF8_TO_I8(* (U8*) (s)) > 0xF9              \
@@ -579,8 +593,7 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
          (ANYOF_NONBITMAP(node)) && \
          (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
          ((end) > (input) + 1) && \
-        toFOLD((input)[0]) == 's' && \
-        toFOLD((input)[1]) == 's')
+        isALPHA_FOLD_EQ((input)[0], 's'))
  
  #define SHARP_S_SKIP 2
  
@@ -628,7 +641,6 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  #endif
  
  /*
-=head1 Unicode Support
  
  =for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e