X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/663f364bd429de50c8f5478879b1285d1270c1b3..9f5650a8ec47c75b463d95214aa5d6d9d837290e:/utf8.h

diff --git a/utf8.h b/utf8.h
index c8da31e..8765336 100644
--- a/utf8.h
+++ b/utf8.h
@@ -1,6 +1,6 @@
 /*    utf8.h
  *
- *    Copyright (C) 2000, 2001, 2002, 2005, 2006, 2007, by Larry Wall and others
+ *    Copyright (C) 2000, 2001, 2002, 2005, 2006, 2007, 2009 by Larry Wall and others
  *
  *    You may distribute under the terms of either the GNU General Public
  *    License or the Artistic License, as specified in the README file.
@@ -27,7 +27,7 @@
 
 #include "utfebcdic.h"
 
-#else
+#else	/* ! EBCDIC */
 START_EXTERN_C
 
 #ifdef DOINIT
@@ -47,7 +47,6 @@ EXTCONST unsigned char PL_utf8skip[];
 #endif
 
 END_EXTERN_C
-#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
 
 /* Native character to iso-8859-1 */
 #define NATIVE_TO_ASCII(ch)      (ch)
@@ -62,7 +61,7 @@ END_EXTERN_C
 #define NATIVE_TO_NEED(enc,ch)   (ch)
 #define ASCII_TO_NEED(enc,ch)    (ch)
 
-/* As there are no translations avoid the function wrapper */
+/* As there are no translations, avoid the function wrapper */
 #define utf8n_to_uvchr utf8n_to_uvuni
 #define uvchr_to_utf8  uvuni_to_utf8
 
@@ -73,22 +72,21 @@ END_EXTERN_C
  Code Points		1st Byte  2nd Byte  3rd Byte  4th Byte
 
    U+0000..U+007F	00..7F
-   U+0080..U+07FF	C2..DF    80..BF
-   U+0800..U+0FFF	E0        A0..BF    80..BF
+   U+0080..U+07FF     * C2..DF    80..BF
+   U+0800..U+0FFF	E0      * A0..BF    80..BF
    U+1000..U+CFFF       E1..EC    80..BF    80..BF
    U+D000..U+D7FF       ED        80..9F    80..BF
-   U+D800..U+DFFF       ******* ill-formed *******
+   U+D800..U+DFFF       +++++++ utf16 surrogates, not legal utf8 +++++++
    U+E000..U+FFFF       EE..EF    80..BF    80..BF
-  U+10000..U+3FFFF	F0        90..BF    80..BF    80..BF
+  U+10000..U+3FFFF	F0      * 90..BF    80..BF    80..BF
   U+40000..U+FFFFF	F1..F3    80..BF    80..BF    80..BF
  U+100000..U+10FFFF	F4        80..8F    80..BF    80..BF
 
-Note the A0..BF in U+0800..U+0FFF, the 80..9F in U+D000...U+D7FF,
-the 90..BF in U+10000..U+3FFFF, and the 80...8F in U+100000..U+10FFFF.
-The "gaps" are caused by legal UTF-8 avoiding non-shortest encodings:
-it is technically possible to UTF-8-encode a single code point in different
-ways, but that is explicitly forbidden, and the shortest possible encoding
-should always be used (and that is what Perl does).
+Note the gaps before several of the byte entries above marked by '*'.  These are
+caused by legal UTF-8 avoiding non-shortest encodings: it is technically
+possible to UTF-8-encode a single code point in different ways, but that is
+explicitly forbidden, and the shortest possible encoding should always be used
+(and that is what Perl does).
 
  */
 
@@ -103,30 +101,26 @@ should always be used (and that is what Perl does).
   00000dddccccccbbbbbbaaaaaa     11110ddd  10cccccc  10bbbbbb  10aaaaaa
 
 As you can see, the continuation bytes all begin with C<10>, and the
-leading bits of the start byte tell how many bytes the are in the
+leading bits of the start byte tell how many bytes there are in the
 encoded character.
 
 */
 
 
 #define UNI_IS_INVARIANT(c)		(((UV)c) <  0x80)
-#define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
-#define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_ASCII(c))
+/* Note that C0 and C1 are invalid in legal UTF8, so the lower bound of the
+ * macro below arguably ought to be C2 */
 #define UTF8_IS_START(c)		(((U8)c) >= 0xc0 && (((U8)c) <= 0xfd))
 #define UTF8_IS_CONTINUATION(c)		(((U8)c) >= 0x80 && (((U8)c) <= 0xbf))
 #define UTF8_IS_CONTINUED(c) 		(((U8)c) &  0x80)
 #define UTF8_IS_DOWNGRADEABLE_START(c)	(((U8)c & 0xfc) == 0xc0)
 
-#define UTF_START_MARK(len) ((len >  7) ? 0xFF : (0xFE << (7-len)))
-#define UTF_START_MASK(len) ((len >= 7) ? 0x00 : (0x1F >> (len-2)))
+#define UTF_START_MARK(len) (((len) >  7) ? 0xFF : (0xFE << (7-(len))))
+#define UTF_START_MASK(len) (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
 
 #define UTF_CONTINUATION_MARK		0x80
 #define UTF_ACCUMULATION_SHIFT		6
 #define UTF_CONTINUATION_MASK		((U8)0x3f)
-#define UTF8_ACCUMULATE(old, new)	(((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
-
-#define UTF8_EIGHT_BIT_HI(c)	((((U8)(c))>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
-#define UTF8_EIGHT_BIT_LO(c)	(((((U8)(c)))&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
 
 #ifdef HAS_QUAD
 #define UNISKIP(uv) ( (uv) < 0x80           ? 1 : \
@@ -146,24 +140,51 @@ encoded character.
 		      (uv) < 0x80000000     ? 6 : 7 )
 #endif
 
+#endif /* EBCDIC vs ASCII */
+
+/* Rest of these are attributes of Unicode and perl's internals rather than the
+ * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
+ * this level; the macros that some of these call may have different
+ * definitions in the two encodings) */
+
+#define NATIVE8_TO_UNI(ch)     NATIVE_TO_ASCII(ch)	/* a clearer synonym */
+
+#define UTF8_ACCUMULATE(old, new)	(((old) << UTF_ACCUMULATION_SHIFT) | (((U8)new) & UTF_CONTINUATION_MASK))
+
+#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+
+#define UTF8_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
+#define NATIVE_IS_INVARIANT(c)		UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
+
+#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF    /* constrained by EBCDIC */
+
+/* The macros in the next sets are used to generate the two utf8 or utfebcdic
+ * bytes from an ordinal that is known to fit into two bytes; it must be no
+ * greater than 0x3FF to work across both encodings. */
+/* Nocast allows these to be used in the case label of a switch statement */
+#define UTF8_TWO_BYTE_HI_nocast(c)	UTF_TO_NATIVE(((c)>>UTF_ACCUMULATION_SHIFT)|UTF_START_MARK(2))
+#define UTF8_TWO_BYTE_LO_nocast(c)	UTF_TO_NATIVE(((c)&UTF_CONTINUATION_MASK)|UTF_CONTINUATION_MARK)
+
+#define UTF8_TWO_BYTE_HI(c)	((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
+#define UTF8_TWO_BYTE_LO(c)	((U8) (UTF8_TWO_BYTE_LO_nocast(c)))
+
+/* This name is used when the source is a single byte */
+#define UTF8_EIGHT_BIT_HI(c)	UTF8_TWO_BYTE_HI((U8)(c))
+#define UTF8_EIGHT_BIT_LO(c)	UTF8_TWO_BYTE_LO((U8)(c))
+
 /*
  * Note: we try to be careful never to call the isXXX_utf8() functions
- * unless we're pretty sure we've seen the beginning of a UTF-8 character
- * (that is, the two high bits are set).  Otherwise we risk loading in the
- * heavy-duty swash_init and swash_fetch routines unnecessarily.
+ * unless we're pretty sure we've seen the beginning of a UTF-8 or UTFEBCDIC
+ * character.  Otherwise we risk loading in the heavy-duty swash_init and
+ * swash_fetch routines unnecessarily.
  */
-#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isIDFIRST_lazy_if(p,c) ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
 				? isIDFIRST(*(p)) \
 				: isIDFIRST_utf8((const U8*)p))
-#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || (*((const U8*)p) < 0xc0))) \
+#define isALNUM_lazy_if(p,c)   ((IN_BYTES || (!c || ! UTF8_IS_START(*((const U8*)p)))) \
 				? isALNUM(*(p)) \
 				: isALNUM_utf8((const U8*)p))
 
-
-#endif /* EBCDIC vs ASCII */
-
-/* Rest of these are attributes of Unicode and perl's internals rather than the encoding */
-
 #define isIDFIRST_lazy(p)	isIDFIRST_lazy_if(p,1)
 #define isALNUM_lazy(p)		isALNUM_lazy_if(p,1)
 
@@ -175,30 +196,28 @@ encoded character.
  * as a way to encode non-negative integers in a binary format. */
 #define UTF8_MAXLEN UTF8_MAXBYTES
 
-#define UTF8_MAXLEN_UCLC 3		/* Obsolete, do not use. */
-#define UTF8_MAXLEN_UCLC_MULT 39	/* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD 3		/* Obsolete, do not use. */
-#define UTF8_MAXLEN_FOLD_MULT 39	/* Obsolete, do not use. */
-
 /* The maximum number of UTF-8 bytes a single Unicode character can
  * uppercase/lowercase/fold into; this number depends on the Unicode
  * version.  An example of maximal expansion is the U+03B0 which
  * uppercases to U+03C5 U+0308 U+0301.  The Unicode databases that
- * tell these things are UnicodeDatabase.txt, CaseFolding.txt, and
+ * tell these things are UnicodeData.txt, CaseFolding.txt, and
  * SpecialCasing.txt. */
 #define UTF8_MAXBYTES_CASE	6
 
 #define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
 #define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
+#define IN_UNI_8_BIT ( (CopHINTS_get(PL_curcop) & HINT_UNI_8_BIT) \
+			&& ! IN_LOCALE_RUNTIME && ! IN_BYTES)
 
 #define UTF8_ALLOW_EMPTY		0x0001
 #define UTF8_ALLOW_CONTINUATION		0x0002
 #define UTF8_ALLOW_NON_CONTINUATION	0x0004
-#define UTF8_ALLOW_FE_FF		0x0008 /* Allow above 0x7fffFFFF */
-#define UTF8_ALLOW_SHORT		0x0010
+#define UTF8_ALLOW_FE_FF		0x0008 /* Allow FE or FF start bytes, \
+						  yields above 0x7fffFFFF */
+#define UTF8_ALLOW_SHORT		0x0010 /* expecting more bytes */
 #define UTF8_ALLOW_SURROGATE		0x0020
 #define UTF8_ALLOW_FFFF			0x0040 /* Allow UNICODE_ILLEGAL */
-#define UTF8_ALLOW_LONG			0x0080
+#define UTF8_ALLOW_LONG			0x0080 /* expecting fewer bytes */
 #define UTF8_ALLOW_ANYUV		(UTF8_ALLOW_EMPTY|UTF8_ALLOW_FE_FF|\
 					 UTF8_ALLOW_SURROGATE|UTF8_ALLOW_FFFF)
 #define UTF8_ALLOW_ANY			0x00FF
@@ -213,7 +232,7 @@ encoded character.
 #define UNICODE_ILLEGAL			0xFFFF
 
 /* Though our UTF-8 encoding can go beyond this,
- * let's be conservative and do as Unicode 3.2 says. */
+ * let's be conservative and do as Unicode 5.1 says. */
 #define PERL_UNICODE_MAX	0x10FFFF
 
 #define UNICODE_ALLOW_SURROGATE 0x0001	/* Allow UTF-16 surrogates (EVIL) */
@@ -234,35 +253,28 @@ encoded character.
 
 #define UTF8_IS_ASCII(c) UTF8_IS_INVARIANT(c)
 
-#define UNICODE_LATIN_SMALL_LETTER_SHARP_S	0x00DF
 #define UNICODE_GREEK_CAPITAL_LETTER_SIGMA	0x03A3
 #define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA	0x03C2
 #define UNICODE_GREEK_SMALL_LETTER_SIGMA	0x03C3
 
-#define EBCDIC_LATIN_SMALL_LETTER_SHARP_S	0x0059
-
 #define UNI_DISPLAY_ISPRINT	0x0001
 #define UNI_DISPLAY_BACKSLASH	0x0002
 #define UNI_DISPLAY_QQ		(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 #define UNI_DISPLAY_REGEX	(UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
 
-#ifdef EBCDIC
-#   define ANYOF_FOLD_SHARP_S(node, input, end)	\
-	(ANYOF_BITMAP_TEST(node, EBCDIC_LATIN_SMALL_LETTER_SHARP_S) && \
-	 (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
-	 (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
-	 ((end) > (input) + 1) && \
-	 toLOWER((input)[0]) == 's' && \
-	 toLOWER((input)[1]) == 's')
-#else
-#   define ANYOF_FOLD_SHARP_S(node, input, end)	\
-	(ANYOF_BITMAP_TEST(node, UNICODE_LATIN_SMALL_LETTER_SHARP_S) && \
+#ifndef EBCDIC
+#   define LATIN_SMALL_LETTER_SHARP_S	0x00DF
+#   define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS 0x00FF
+#   define MICRO_SIGN 0x00B5
+#endif
+
+#define ANYOF_FOLD_SHARP_S(node, input, end)	\
+	(ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
 	 (ANYOF_FLAGS(node) & ANYOF_UNICODE) && \
 	 (ANYOF_FLAGS(node) & ANYOF_FOLD) && \
 	 ((end) > (input) + 1) && \
 	 toLOWER((input)[0]) == 's' && \
 	 toLOWER((input)[1]) == 's')
-#endif
 #define SHARP_S_SKIP 2
 
 #ifdef EBCDIC
@@ -340,3 +352,13 @@ encoded character.
 #define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
 
 #endif /* IS_UTF8_CHAR() for UTF-8 */
+
+/*
+ * Local variables:
+ * c-indentation-style: bsd
+ * c-basic-offset: 4
+ * indent-tabs-mode: t
+ * End:
+ *
+ * ex: set ts=8 sts=4 sw=4 noet:
+ */