utf8.h: Add macro TWO_BYTE_UTF8_TO_UNI()

[perl5.git] / utf8.h
diff --git a/utf8.h b/utf8.h

index 659319e..ef5fecc 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -20,6 +20,16 @@
  #define uvuni_to_utf8(d, uv)           uvuni_to_utf8_flags(d, uv, 0)
  #define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
  
+/*
+=for apidoc ibcmp_utf8
+
+This is a synonym for (! foldEQ_utf8())
+
+=cut
+*/
+#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
+                   cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
+
  #ifdef EBCDIC
  /* The equivalent of these macros but implementing UTF-EBCDIC
     are in the following header file:
@@ -27,7 +37,7 @@
  
  #include "utfebcdic.h"
  
-#else
+#else  /* ! EBCDIC */
  START_EXTERN_C
  
  #ifdef DOINIT
@@ -47,11 +57,9 @@ EXTCONST unsigned char PL_utf8skip[];
  #endif
  
  END_EXTERN_C
-#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
  
  /* Native character to iso-8859-1 */
  #define NATIVE_TO_ASCII(ch)      (ch)
-#define NATIVE8_TO_UNI(ch)        (ch)
  #define ASCII_TO_NATIVE(ch)      (ch)
  /* Transform after encoding */
  #define NATIVE_TO_UTF(ch)        (ch)
@@ -63,7 +71,7 @@ END_EXTERN_C
  #define NATIVE_TO_NEED(enc,ch)   (ch)
  #define ASCII_TO_NEED(enc,ch)    (ch)
  
-/* As there are no translations avoid the function wrapper */
+/* As there are no translations, avoid the function wrapper */
  #define utf8n_to_uvchr utf8n_to_uvuni
  #define uvchr_to_utf8  uvuni_to_utf8
  
@@ -74,22 +82,21 @@ END_EXTERN_C
   Code Points           1st Byte  2nd Byte  3rd Byte  4th Byte
  
     U+0000..U+007F      00..7F
-   U+0080..U+07FF      C2..DF    80..BF
-   U+0800..U+0FFF      E0        A0..BF    80..BF
+   U+0080..U+07FF     * C2..DF    80..BF
+   U+0800..U+0FFF      E0      * A0..BF    80..BF
     U+1000..U+CFFF       E1..EC    80..BF    80..BF
     U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       ******* ill-formed *******
+   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
     U+E000..U+FFFF       EE..EF    80..BF    80..BF
-  U+10000..U+3FFFF     F0        90..BF    80..BF    80..BF
+  U+10000..U+3FFFF     F0      * 90..BF    80..BF    80..BF
    U+40000..U+FFFFF     F1..F3    80..BF    80..BF    80..BF
   U+100000..U+10FFFF    F4        80..8F    80..BF    80..BF
  
-Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
-the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
-The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
-it is technically possible to UTF-8-encode a single code point in different
-ways, but that is explicitly forbidden, and the shortest possible encoding
-should always be used (and that is what Perl does).
+Note the gaps before several of the byte entries above marked by '*'.  These are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does).
  
   */
  
@@ -104,16 +111,18 @@ should always be used (and that is what Perl does).
    00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
  
  As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tell how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
  encoded character.
  
+Perl's extended UTF-8 means we can have start bytes up to FF.
+
  */
  
  
  #define UNI_IS_INVARIANT(c)            (((UV)c) <  0x80)
-#define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
-#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
-#define UTF8_IS_START(c)               (((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
+/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
+ * below arguably ought to be C2 */
+#define UTF8_IS_START(c)               (((U8)c) >= 0xc0)
  #define UTF8_IS_CONTINUATION(c)                (((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
  #define UTF8_IS_CONTINUED(c)           (((U8)c) &  0x80)
  #define UTF8_IS_DOWNGRADEABLE_START(c) (((U8)c & 0xfc) == 0xc0)
@@ -124,10 +133,6 @@ encoded character.
  #define UTF_CONTINUATION_MARK          0x80
  #define UTF_ACCUMULATION_SHIFT         6
  #define UTF_CONTINUATION_MASK          ((U8)0x3f)
-#define UTF8_ACCUMULATE(old, new)      (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
-
-#define UTF8_EIGHT_BIT_HI(c)   ((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c)   (((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
  
  #ifdef HAS_QUAD
  #define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
@@ -147,24 +152,60 @@ encoded character.
                       (uv) < 0x80000000     ? 6 : 7 )
  #endif
  
+#endif /* EBCDIC vs ASCII */
+
+/* Rest of these are attributes of Unicode and perl's internals rather than the
+ * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
+ * this level; the macros that some of these call may have different
+ * definitions in the two encodings) */
+
+#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)     /* a clearer synonym */
+
+#define UTF8_ACCUMULATE(old, new)      (((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
+
+/* Convert a two (not one) byte utf8 character to a unicode code point value.
+ * Needs just one iteration of accumulate.  Should not be used unless it is
+ * known that the two bytes are legal: 1) two-byte start, and 2) continuation.
+ * Note that the result can be larger than 255 if the input character is not
+ * downgradable */
+#define TWO_BYTE_UTF8_TO_UNI(HI, LO) \
+                   UTF8_ACCUMULATE((NATIVE_TO_UTF(HI) & UTF_START_MASK(2)), \
+                                    NATIVE_TO_UTF(LO))
+
+#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+
+#define UTF8_IS_INVARIANT(c)           UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c)         UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+
+#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
+
+/* The macros in the next sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into two bytes; it must be no
+ * greater than 0x3FF to work across both encodings. */
+/* Nocast allows these to be used in the case label of a switch statement */
+#define UTF8_TWO_BYTE_HI_nocast(c)     UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
+#define UTF8_TWO_BYTE_LO_nocast(c)     UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
+
+#define UTF8_TWO_BYTE_HI(c)    ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
+#define UTF8_TWO_BYTE_LO(c)    ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
+
+/* This name is used when the source is a single byte */
+#define UTF8_EIGHT_BIT_HI(c)   UTF8_TWO_BYTE_HI((U8)(c))
+#define UTF8_EIGHT_BIT_LO(c)   UTF8_TWO_BYTE_LO((U8)(c))
+
  /*
   * Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set).  Otherwise we risk loading in the
- * heavy-duty swash_init and swash_fetch routines unnecessarily.
+ * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC
+ * character.  Otherwise we risk loading in the heavy-duty swash_init and
+ * swash_fetch routines unnecessarily.
   */
-#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
                                 ? isIDFIRST(*(p)) \
                                 : isIDFIRST_utf8((const U8*)p))
-#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
                                 ? isALNUM(*(p)) \
                                 : isALNUM_utf8((const U8*)p))
  
-
-#endif /* EBCDIC vs ASCII */
-
-/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
-
  #define isIDFIRST_lazy(p)      isIDFIRST_lazy_if(p,1)
  #define isALNUM_lazy(p)                isALNUM_lazy_if(p,1)
  
@@ -176,30 +217,28 @@ encoded character.
   * as a way to encode non-negative integers in a binary format. */
  #define UTF8_MAXLEN UTF8_MAXBYTES
  
-#define UTF8_MAXLEN_UCLC 3             /* Obsolete, do not use. */
-#define UTF8_MAXLEN_UCLC_MULT 39       /* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD 3             /* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD_MULT 39       /* Obsolete, do not use. */
-
  /* The maximum number of UTF-8 bytes a single Unicode character can
   * uppercase/lowercase/fold into; this number depends on the Unicode
   * version.  An example of maximal expansion is the U+03B0 which
   * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
- * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * tell these things are UnicodeData.txt, CaseFolding.txt, and
   * SpecialCasing.txt. */
  #define UTF8_MAXBYTES_CASE     6
  
  #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
  #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
+#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \
+                       && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
  
  #define UTF8_ALLOW_EMPTY               0x0001
  #define UTF8_ALLOW_CONTINUATION                0x0002
  #define UTF8_ALLOW_NON_CONTINUATION    0x0004
-#define UTF8_ALLOW_FE_FF               0x0008 /* Allow above 0x7fffFFFF */
-#define UTF8_ALLOW_SHORT               0x0010
+#define UTF8_ALLOW_FE_FF               0x0008 /* Allow FE or FF start bytes, \
+                                                 yields above 0x7fffFFFF */
+#define UTF8_ALLOW_SHORT               0x0010 /* expecting more bytes */
  #define UTF8_ALLOW_SURROGATE           0x0020
  #define UTF8_ALLOW_FFFF                        0x0040 /* Allow UNICODE_ILLEGAL */
-#define UTF8_ALLOW_LONG                        0x0080
+#define UTF8_ALLOW_LONG                        0x0080 /* expecting fewer bytes */
  #define UTF8_ALLOW_ANYUV               (UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
                                          UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
  #define UTF8_ALLOW_ANY                 0x00FF
@@ -233,37 +272,28 @@ encoded character.
  #    define UTF8_QUAD_MAX      UINT64_C(0x1000000000)
  #endif
  
-#define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
-
-#define UNICODE_LATIN_SMALL_LETTER_SHARP_S     0x00DF
  #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA     0x03A3
  #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
  #define UNICODE_GREEK_SMALL_LETTER_SIGMA       0x03C3
  
-#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S      0x0059
-
  #define UNI_DISPLAY_ISPRINT    0x0001
  #define UNI_DISPLAY_BACKSLASH  0x0002
  #define UNI_DISPLAY_QQ         (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
  #define UNI_DISPLAY_REGEX      (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
  
-#ifdef EBCDIC
-#   define ANYOF_FOLD_SHARP_S(node, input, end)        \
-       (ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \
-        (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
-        (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
-        ((end) > (input) + 1) && \
-        toLOWER((input)[0]) == 's' && \
-        toLOWER((input)[1]) == 's')
-#else
-#   define ANYOF_FOLD_SHARP_S(node, input, end)        \
-       (ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
-        (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
+#ifndef EBCDIC
+#   define LATIN_SMALL_LETTER_SHARP_S  0x00DF
+#   define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0x00FF
+#   define MICRO_SIGN 0x00B5
+#endif
+
+#define ANYOF_FOLD_SHARP_S(node, input, end)   \
+       (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
+        (ANYOF_FLAGS(node) & ANYOF_NONBITMAP) && \
          (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
          ((end) > (input) + 1) && \
          toLOWER((input)[0]) == 's' && \
          toLOWER((input)[1]) == 's')
-#endif
  #define SHARP_S_SKIP 2
  
  #ifdef EBCDIC