X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a9f50d33e5a32910a09035ea5927706c74be0d64..2376ab1f7929a7a9f33c6ed28ac0f7c3361894c2:/utf8.h

diff --git a/utf8.h b/utf8.h
index 7dcea6f..2357fb0 100644
--- a/utf8.h
+++ b/utf8.h
@@ -43,33 +43,25 @@
 #define uvchr_to_utf8_flags(d,uv,flags)                                        \
                             uvoffuni_to_utf8_flags(d,NATIVE_TO_UNI(uv),flags)
 #define utf8_to_uvchr_buf(s, e, lenp)                                          \
-                     utf8n_to_uvchr(s, (e) - (s), lenp,                        \
+                     utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp,              \
                                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
 
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
-#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
-	             FOLD_FLAGS_FULL, NULL)
-#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
-#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0, NULL)
-#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0, NULL)
+#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
+#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0)
+#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0)
+#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0)
 
 /* Source backward compatibility. */
 #define is_utf8_string_loc(s, len, ep)	is_utf8_string_loclen(s, len, ep, 0)
 
 #define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
 		    foldEQ_utf8_flags(s1, pe1, l1, u1, s2, pe2, l2, u2, 0)
-#define FOLDEQ_UTF8_NOMIX_ASCII (1 << 0)
-#define FOLDEQ_UTF8_LOCALE      (1 << 1)
+#define FOLDEQ_UTF8_NOMIX_ASCII   (1 << 0)
+#define FOLDEQ_LOCALE             (1 << 1)
 #define FOLDEQ_S1_ALREADY_FOLDED  (1 << 2)
 #define FOLDEQ_S2_ALREADY_FOLDED  (1 << 3)
 
-/*
-=for apidoc ibcmp_utf8
-
-This is a synonym for (! foldEQ_utf8())
-
-=cut
-*/
 #define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
 		    cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
 
@@ -233,7 +225,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
     (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS)           \
            - UTF_ACCUMULATION_SHIFT))
 
-#ifdef HAS_QUAD
+#if UVSIZE >= 8
+#  define UTF8_QUAD_MAX UINT64_C(0x1000000000)
+
 /* Input is a true Unicode (not-native) code point */
 #define OFFUNISKIP(uv) ( (uv) < 0x80        ? 1 : \
 		      (uv) < 0x800          ? 2 : \
@@ -269,6 +263,10 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #error UTF8_MAXBYTES must be at least 12
 #endif
 
+/* ^? is defined to be DEL on ASCII systems.  See the definition of toCTRL()
+ * for more */
+#define QUESTION_MARK_CTRL  DEL_NATIVE
+
 #define MAX_UTF8_TWO_BYTE 0x7FF
 
 #define UTF8_MAXBYTES_CASE	UTF8_MAXBYTES
@@ -338,33 +336,57 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
  * UTF-8 encoded string) */
 #define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_UTF8_TO_I8(c))
 
-#define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+/* Like the above, but its name implies a non-UTF8 input, and is implemented
+ * differently (for no particular reason) */
+#define NATIVE_BYTE_IS_INVARIANT(c)	UNI_IS_INVARIANT(NATIVE_TO_LATIN1(c))
+
+/* Like the above, but accepts any UV as input */
+#define UVCHR_IS_INVARIANT(uv)          UNI_IS_INVARIANT(NATIVE_TO_UNI(uv))
 
 #define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
 
-/* The macros in the next sets are used to generate the two utf8 or utfebcdic
- * bytes from an ordinal that is known to fit into two bytes; it must be less
- * than 0x3FF to work across both encodings. */
-/* Nocast allows these to be used in the case label of a switch statement;
- * however this doesn't won't work for ebcdic, and should be avoided.  Use
- * regen/unicode_constants instead */
-#define UTF8_TWO_BYTE_HI_nocast(c)	I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)     \
-                        >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
-#define UTF8_TWO_BYTE_LO_nocast(c)  I8_TO_NATIVE_UTF8((NATIVE_TO_UNI(c)         \
-                                                  & UTF_CONTINUATION_MASK)      \
-                                                | UTF_CONTINUATION_MARK)
-
-#define UTF8_TWO_BYTE_HI(c)	((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
-#define UTF8_TWO_BYTE_LO(c)	((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
-
-/* This name is used when the source is a single byte (input not checked).
- * These expand identically to the TWO_BYTE versions on ASCII platforms, but
- * use to/from LATIN1 instead of UNI, which on EBCDIC eliminates tests */
-#define UTF8_EIGHT_BIT_HI(c)	I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c)          \
-                        >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c)	I8_TO_NATIVE_UTF8((NATIVE_TO_LATIN1(c)          \
-                                                  & UTF_CONTINUATION_MASK)      \
-                                                | UTF_CONTINUATION_MARK)
+/* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into exactly two (not one) bytes;
+ * it must be less than 0x3FF to work across both encodings. */
+
+/* These two are helper macros for the other three sets, and should not be used
+ * directly anywhere else.  'translate_function' is either NATIVE_TO_LATIN1
+ * (which works for code points up to 0xFF) or NATIVE_TO_UNI which works for any
+ * code point */
+#define __BASE_TWO_BYTE_HI(c, translate_function)                               \
+            I8_TO_NATIVE_UTF8((translate_function(c) >> UTF_ACCUMULATION_SHIFT) \
+                              | UTF_START_MARK(2))
+#define __BASE_TWO_BYTE_LO(c, translate_function)                               \
+              I8_TO_NATIVE_UTF8((translate_function(c) & UTF_CONTINUATION_MASK) \
+                                 | UTF_CONTINUATION_MARK)
+
+/* The next two macros should not be used.  They were designed to be usable as
+ * the case label of a switch statement, but this doesn't work for EBCDIC.  Use
+ * regen/unicode_constants.pl instead */
+#define UTF8_TWO_BYTE_HI_nocast(c)  __BASE_TWO_BYTE_HI(c, NATIVE_TO_UNI)
+#define UTF8_TWO_BYTE_LO_nocast(c)  __BASE_TWO_BYTE_LO(c, NATIVE_TO_UNI)
+
+/* The next two macros are used when the source should be a single byte
+ * character; checked for under DEBUGGING */
+#define UTF8_EIGHT_BIT_HI(c) (__ASSERT_(FITS_IN_8_BITS(c))                    \
+                             ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+#define UTF8_EIGHT_BIT_LO(c) (__ASSERT_(FITS_IN_8_BITS(c))                    \
+                             ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
+
+/* These final two macros in the series are used when the source can be any
+ * code point whose UTF-8 is known to occupy 2 bytes; they are less efficient
+ * than the EIGHT_BIT versions on EBCDIC platforms.  We use the logical '~'
+ * operator instead of "<=" to avoid getting compiler warnings.
+ * MAX_PORTABLE_UTF8_TWO_BYTE should be exactly all one bits in the lower few
+ * places, so the ~ works */
+#define UTF8_TWO_BYTE_HI(c)                                                    \
+       (__ASSERT_((sizeof(c) ==  1)                                            \
+                  || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE))     \
+        ((U8) __BASE_TWO_BYTE_HI(c, NATIVE_TO_LATIN1)))
+#define UTF8_TWO_BYTE_LO(c)                                                    \
+       (__ASSERT_((sizeof(c) ==  1)                                            \
+                  || !(((WIDEST_UTYPE)(c)) & ~MAX_PORTABLE_UTF8_TWO_BYTE))     \
+        ((U8) __BASE_TWO_BYTE_LO(c, NATIVE_TO_LATIN1)))
 
 /* This is illegal in any well-formed UTF-8 in both EBCDIC and ASCII
  * as it is only in overlongs. */
@@ -393,8 +415,11 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
 #define IN_UNI_8_BIT \
-	    (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS) \
-	     && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
+	    (((CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT))                       \
+               || (CopHINTS_get(PL_curcop) & HINT_LOCALE_PARTIAL                 \
+                   /* -1 below is for :not_characters */                         \
+                   && _is_in_locale_category(FALSE, -1)))                        \
+              && ! IN_BYTES)
 
 
 #define UTF8_ALLOW_EMPTY		0x0001	/* Allow a zero length string */
@@ -421,7 +446,9 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UTF8_WARN_SUPER		        0x0400	/* points above the legal max */
 
 /* Code points which never were part of the original UTF-8 standard, the first
- * byte of which is a FE or FF on ASCII platforms. */
+ * byte of which is a FE or FF on ASCII platforms. If the first byte is FF, it
+ * will overflow a 32-bit word.  If the first byte is FE, it will overflow a
+ * signed 32-bit word. */
 #define UTF8_DISALLOW_FE_FF		0x0800
 #define UTF8_WARN_FE_FF		        0x1000
 
@@ -516,10 +543,6 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UNICODE_IS_SUPER(c)		((c) > PERL_UNICODE_MAX)
 #define UNICODE_IS_FE_FF(c)		((c) > 0x7FFFFFFF)
 
-#ifdef HAS_QUAD
-#    define UTF8_QUAD_MAX	UINT64_C(0x1000000000)
-#endif
-
 #define LATIN_SMALL_LETTER_SHARP_S      LATIN_SMALL_LETTER_SHARP_S_NATIVE
 #define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS                                  \
                                 LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE
@@ -532,14 +555,15 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA	0x03C2
 #define UNICODE_GREEK_SMALL_LETTER_SIGMA	0x03C3
 #define GREEK_SMALL_LETTER_MU                   0x03BC
-#define GREEK_CAPITAL_LETTER_MU 0x039C	/* Upper and title case of MICRON */
-#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178	/* Also is title case */
-#define LATIN_CAPITAL_LETTER_SHARP_S	0x1E9E
-#define LATIN_SMALL_LETTER_LONG_S   0x017F
+#define GREEK_CAPITAL_LETTER_MU                 0x039C	/* Upper and title case
+                                                           of MICRON */
+#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS   0x0178	/* Also is title case */
+#define LATIN_CAPITAL_LETTER_SHARP_S	        0x1E9E
+#define LATIN_SMALL_LETTER_LONG_S               0x017F
 #define LATIN_SMALL_LIGATURE_LONG_S_T           0xFB05
 #define LATIN_SMALL_LIGATURE_ST                 0xFB06
-#define KELVIN_SIGN                 0x212A
-#define ANGSTROM_SIGN               0x212B
+#define KELVIN_SIGN                             0x212A
+#define ANGSTROM_SIGN                           0x212B
 
 #define UNI_DISPLAY_ISPRINT	0x0001
 #define UNI_DISPLAY_BACKSLASH	0x0002
@@ -553,64 +577,83 @@ Perl's extended UTF-8 means we can have start bytes up to FF.
 	 ((end) > (input) + 1) && \
 	 toFOLD((input)[0]) == 's' && \
 	 toFOLD((input)[1]) == 's')
+
 #define SHARP_S_SKIP 2
 
 /* If you want to exclude surrogates, and beyond legal Unicode, see the blame
  * log for earlier versions which gave details for these */
 
+/* A helper macro for isUTF8_CHAR, so use that one, and not this one.  This is
+ * retained solely for backwards compatibility and may be deprecated and
+ * removed in a future Perl version.
+ *
+ * regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
+ * number of bytes.  So this has to be coordinated with that file */
+#ifdef EBCDIC
+#   define IS_UTF8_CHAR_FAST(n) ((n) <= 3)
+#else
+#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
+#endif
+
 #ifndef EBCDIC
-/* This was generated by regen/regcharclass.pl, and then moved here.  The lines
- * that generated it were then commented out.  This was done solely because it
- * takes on the order of 10 minutes to generate, and is never going to change.
- * The EBCDIC equivalent hasn't been commented out in regcharclass.pl, so it
- * should generate and run the correct stuff */
+/* A helper macro for isUTF8_CHAR, so use that one instead of this.  This was
+ * generated by regen/regcharclass.pl, and then moved here.  The lines that
+ * generated it were then commented out.  This was done solely because it takes
+ * on the order of 10 minutes to generate, and is never going to change, unless
+ * the generated code is improved.
+ *
+ * The EBCDIC versions have been cut to not cover all of legal Unicode, so
+ * don't take too long to generate, and there is a separate one for each code
+ * page, so they are in regcharclass.h instead of here */
 /*
-	UTF8_CHAR: Matches utf8 from 1 to 4 bytes
+	UTF8_CHAR: Matches legal UTF-8 encoded characters from 2 through 4 bytes
 
-	0x0 - 0x1FFFFF
+	0x80 - 0x1FFFFF
 */
 /*** GENERATED CODE ***/
-#define is_UTF8_CHAR_utf8_safe(s,e)                                         \
-( ((e)-(s) > 3) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
-	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
-    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                      \
-	( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( 0xF0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-    : ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
-: ((e)-(s) > 2) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                      \
-	( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                      \
-    : ( 0xE0 == ((U8*)s)[0] ) ?                                             \
-	( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-    : ( ( ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
-: ((e)-(s) > 1) ?                                                           \
-    ( ( ( ((U8*)s)[0] & 0x80 ) == 0x00 ) ? 1                                \
-    : ( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) ? 2 : 0 )\
-: ((e)-(s) > 0) ?                                                           \
-    ( ( ((U8*)s)[0] & 0x80 ) == 0x00 )                                      \
-: 0 )
+#define is_UTF8_CHAR_utf8_no_length_checks(s)                               \
+( ( 0xC2 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xDF ) ?                          \
+    ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ? 2 : 0 )                          \
+: ( 0xE0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( ((U8*)s)[1] & 0xE0 ) == 0xA0 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xE1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xEF ) ?                          \
+    ( ( ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) ? 3 : 0 )\
+: ( 0xF0 == ((U8*)s)[0] ) ?                                                 \
+    ( ( ( ( 0x90 <= ((U8*)s)[1] && ((U8*)s)[1] <= 0xBF ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )\
+: ( ( ( ( 0xF1 <= ((U8*)s)[0] && ((U8*)s)[0] <= 0xF7 ) && ( ( ((U8*)s)[1] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[2] & 0xC0 ) == 0x80 ) ) && ( ( ((U8*)s)[3] & 0xC0 ) == 0x80 ) ) ? 4 : 0 )
 #endif
 
-/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
- * (1) allows UTF-8 encoded UTF-16 surrogates
- * (2) it allows code points past U+10FFFF.
- * The Perl_is_utf8_char() full "slow" code will handle the Perl
- * "extended UTF-8". */
-#define IS_UTF8_CHAR(p, n)      (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == n)
+/*
+=head1 Unicode Support
 
-/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8_safe() macros for up to
- * these number of bytes.  So this has to be coordinated with it */
-#ifdef EBCDIC
-#   define IS_UTF8_CHAR_FAST(n) ((n) <= 5)
-#else
-#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
-#endif
+=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+
+Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
+UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
+C<s>.  Returns 0 if the sequence starting at C<s> through C<e - 1> is not
+well-formed UTF-8
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut
+*/
+
+#define isUTF8_CHAR(s, e)   (UNLIKELY((e) <= (s))                           \
+                             ? 0                                            \
+                             : (UTF8_IS_INVARIANT(*s))                      \
+                               ? 1                                          \
+                               : UNLIKELY(((e) - (s)) < UTF8SKIP(s))        \
+                                 ? 0                                        \
+                                 : LIKELY(IS_UTF8_CHAR_FAST(UTF8SKIP(s)))   \
+                                   ? is_UTF8_CHAR_utf8_no_length_checks(s)  \
+                                   : _is_utf8_char_slow(s, e))
+
+#define is_utf8_char_buf(buf, buf_end) isUTF8_CHAR(buf, buf_end)
+
+/* Do not use; should be deprecated.  Use isUTF8_CHAR() instead; this is
+ * retained solely for backwards compatibility */
+#define IS_UTF8_CHAR(p, n)      (isUTF8_CHAR(p, (p) + (n)) == n)
 
 #endif /* H_UTF8 */