This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regen/regcharclass.pl: Work on EBCDIC platforms
[perl5.git] / utf8.h
CommitLineData
a0ed51b3
LW
1/* utf8.h
2 *
2eee27d7
SS
3 * Copyright (C) 2000, 2001, 2002, 2005, 2006, 2007, 2009,
4 * 2010, 2011 by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
39e02b42 11/* Use UTF-8 as the default script encoding?
1e54db1a 12 * Turning this on will break scripts having non-UTF-8 binary
39e02b42
JH
13 * data (such as Latin-1) in string literals. */
14#ifdef USE_UTF8_SCRIPTS
15# define USE_UTF8_IN_NAMES (!IN_BYTES)
16#else
17# define USE_UTF8_IN_NAMES (PL_hints & HINT_UTF8)
18#endif
19
051a06d4
KW
20/* For to_utf8_fold_flags, q.v. */
21#define FOLD_FLAGS_LOCALE 0x1
22#define FOLD_FLAGS_FULL 0x2
a0270393 23#define FOLD_FLAGS_NOMIX_ASCII 0x4
051a06d4 24
83199d38
KW
25/* For _core_swash_init(), internal core use only */
26#define _CORE_SWASH_INIT_USER_DEFINED_PROPERTY 0x1
5d3d13d1 27#define _CORE_SWASH_INIT_RETURN_IF_UNDEF 0x2
87367d5f 28#define _CORE_SWASH_INIT_ACCEPT_INVLIST 0x4
83199d38 29
a0270393 30#define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
051a06d4
KW
31#define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, \
32 FOLD_FLAGS_FULL, NULL)
33#define to_utf8_lower(a,b,c) _to_utf8_lower_flags(a,b,c,0, NULL)
34#define to_utf8_upper(a,b,c) _to_utf8_upper_flags(a,b,c,0, NULL)
35#define to_utf8_title(a,b,c) _to_utf8_title_flags(a,b,c,0, NULL)
36bb2ab6 36
fd7cb289
RGS
37/* Source backward compatibility. */
38#define uvuni_to_utf8(d, uv) uvuni_to_utf8_flags(d, uv, 0)
39#define is_utf8_string_loc(s, len, ep) is_utf8_string_loclen(s, len, ep, 0)
40
eda9cac1
KW
41#define foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
42 foldEQ_utf8_flags(s1, pe1, l1, u1, s2, pe2, l2, u2, 0)
a33c29bc 43#define FOLDEQ_UTF8_NOMIX_ASCII (1 << 0)
5e64d0fa 44#define FOLDEQ_UTF8_LOCALE (1 << 1)
18f762c3
KW
45#define FOLDEQ_S1_ALREADY_FOLDED (1 << 2)
46#define FOLDEQ_S2_ALREADY_FOLDED (1 << 3)
a33c29bc 47
e6226b18
KW
48/*
49=for apidoc ibcmp_utf8
50
51This is a synonym for (! foldEQ_utf8())
52
53=cut
54*/
55#define ibcmp_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2) \
56 cBOOL(! foldEQ_utf8(s1, pe1, l1, u1, s2, pe2, l2, u2))
57
1d72bdf6
NIS
58#ifdef EBCDIC
59/* The equivalent of these macros but implementing UTF-EBCDIC
60 are in the following header file:
61 */
62
63#include "utfebcdic.h"
fd7cb289 64
d06134e5 65#else /* ! EBCDIC */
73c4f7a1
GS
66START_EXTERN_C
67
a0ed51b3 68#ifdef DOINIT
6f06b55f 69EXTCONST unsigned char PL_utf8skip[] = {
b2635aa8
KW
70/* 0x00 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
71/* 0x10 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
72/* 0x20 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
73/* 0x30 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
74/* 0x40 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
75/* 0x50 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
76/* 0x60 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
77/* 0x70 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* ascii */
78/* 0x80 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
79/* 0x90 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
80/* 0xA0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
81/* 0xB0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* bogus: continuation byte */
82/* 0xC0 */ 2,2, /* overlong */
83/* 0xC2 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* U+0080 to U+03FF */
84/* 0xD0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* U+0400 to U+07FF */
85/* 0xE0 */ 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, /* U+0800 to U+FFFF */
86/* 0xF0 */ 4,4,4,4,4,4,4,4,5,5,5,5,6,6, /* above BMP to 2**31 - 1 */
87/* 0xFE */ 7,13, /* Perl extended (never was official UTF-8). Up to 72bit
88 allowed (64-bit + reserved). */
a0ed51b3
LW
89};
90#else
6f06b55f 91EXTCONST unsigned char PL_utf8skip[];
a0ed51b3
LW
92#endif
93
73c4f7a1 94END_EXTERN_C
7e2040f0 95
ec34087a
KW
96#include "unicode_constants.h"
97
1d72bdf6
NIS
98/* Native character to iso-8859-1 */
99#define NATIVE_TO_ASCII(ch) (ch)
100#define ASCII_TO_NATIVE(ch) (ch)
101/* Transform after encoding */
102#define NATIVE_TO_UTF(ch) (ch)
428921e5 103#define NATIVE_TO_I8(ch) NATIVE_TO_UTF(ch) /* a clearer synonym */
1d72bdf6 104#define UTF_TO_NATIVE(ch) (ch)
428921e5 105#define I8_TO_NATIVE(ch) UTF_TO_NATIVE(ch)
1d72bdf6
NIS
106/* Transforms in wide UV chars */
107#define UNI_TO_NATIVE(ch) (ch)
108#define NATIVE_TO_UNI(ch) (ch)
109/* Transforms in invariant space */
110#define NATIVE_TO_NEED(enc,ch) (ch)
111#define ASCII_TO_NEED(enc,ch) (ch)
d7578b48 112
d06134e5 113/* As there are no translations, avoid the function wrapper */
1754c1a1 114#define utf8n_to_uvchr utf8n_to_uvuni
6dd9dce9 115#define valid_utf8_to_uvchr valid_utf8_to_uvuni
1754c1a1 116#define uvchr_to_utf8 uvuni_to_utf8
2b9d42f0 117
877d9f0d 118/*
9041c2e3 119
8c007b5a 120 The following table is from Unicode 3.2.
877d9f0d
JH
121
122 Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
123
375122d7 124 U+0000..U+007F 00..7F
e1b711da 125 U+0080..U+07FF * C2..DF 80..BF
37e2e78e 126 U+0800..U+0FFF E0 * A0..BF 80..BF
375122d7 127 U+1000..U+CFFF E1..EC 80..BF 80..BF
e1b711da 128 U+D000..U+D7FF ED 80..9F 80..BF
37e2e78e 129 U+D800..U+DFFF +++++++ utf16 surrogates, not legal utf8 +++++++
375122d7 130 U+E000..U+FFFF EE..EF 80..BF 80..BF
37e2e78e 131 U+10000..U+3FFFF F0 * 90..BF 80..BF 80..BF
877d9f0d
JH
132 U+40000..U+FFFFF F1..F3 80..BF 80..BF 80..BF
133 U+100000..U+10FFFF F4 80..8F 80..BF 80..BF
b2635aa8
KW
134 Below are non-Unicode code points
135 U+110000..U+13FFFF F4 90..BF 80..BF 80..BF
136 U+140000..U+1FFFFF F5..F7 80..BF 80..BF 80..BF
137 U+200000: F8.. * 88..BF 80..BF 80..BF 80..BF
877d9f0d 138
e1b711da 139Note the gaps before several of the byte entries above marked by '*'. These are
37e2e78e
KW
140caused by legal UTF-8 avoiding non-shortest encodings: it is technically
141possible to UTF-8-encode a single code point in different ways, but that is
142explicitly forbidden, and the shortest possible encoding should always be used
143(and that is what Perl does).
8c007b5a 144
877d9f0d
JH
145 */
146
8c007b5a
JH
147/*
148 Another way to look at it, as bits:
149
b2635aa8 150 Code Points 1st Byte 2nd Byte 3rd Byte 4th Byte
8c007b5a 151
b2635aa8
KW
152 0aaa aaaa 0aaa aaaa
153 0000 0bbb bbaa aaaa 110b bbbb 10aa aaaa
154 cccc bbbb bbaa aaaa 1110 cccc 10bb bbbb 10aa aaaa
155 00 000d ddcc cccc bbbb bbaa aaaa 1111 0ddd 10cc cccc 10bb bbbb 10aa aaaa
8c007b5a
JH
156
157As you can see, the continuation bytes all begin with C<10>, and the
e1b711da 158leading bits of the start byte tell how many bytes there are in the
8c007b5a
JH
159encoded character.
160
65ab9279
TC
161Perl's extended UTF-8 means we can have start bytes up to FF.
162
8c007b5a
JH
163*/
164
/* Byte and code point classification predicates.  Each argument is wrapped in
 * parentheses before it is cast, so that an expression argument (for example
 * 'x >> 8') is converted as a whole instead of only its first operand. */

/* True if code point c is below 0x80 */
#define UNI_IS_INVARIANT(c)             (((UV)(c)) < 0x80)

/* True if byte c is 0xC2 or above.  0xC0 and 0xC1 are left out; the
 * PL_utf8skip table marks them as overlong. */
#define UTF8_IS_START(c)                (((U8)(c)) >= 0xc2)

/* True if the top two bits of byte c are 10 */
#define UTF8_IS_CONTINUATION(c)         ((((U8)(c)) & 0xC0) == 0x80)

/* Non-zero if the high bit of byte c is set */
#define UTF8_IS_CONTINUED(c)            (((U8)(c)) & 0x80)

/* Masking with 0xfe allows low bit to be 0 or 1; thus this matches 0xc[23] */
#define UTF8_IS_DOWNGRADEABLE_START(c)  ((((U8)(c)) & 0xfe) == 0xc2)
8850bf83 172
/* The marker bits of the start byte of a len-byte sequence.  For len <= 7
 * this is 0xFE shifted left by (7 - len), which can have bits set above the
 * low byte (UTF8_TWO_BYTE_HI_nocast masks it with 0xFF); for longer
 * sequences it is 0xFF. */
#define UTF_START_MARK(len) (((len) <= 7) ? (0xFE << (7 - (len))) : 0xFF)

/* The complement idea: the bits of a start byte that carry real data, with
 * the initial one bits masked away.  Doesn't work on an invariant byte */
#define UTF_START_MASK(len) (((len) < 7) ? (0x1F >> ((len) - 2)) : 0x00)
1d72bdf6
NIS
178
179#define UTF_CONTINUATION_MARK 0x80
180#define UTF_ACCUMULATION_SHIFT 6
b2635aa8
KW
181
182/* 2**UTF_ACCUMULATION_SHIFT - 1 */
1d72bdf6 183#define UTF_CONTINUATION_MASK ((U8)0x3f)
c512ce4f 184
eb83ed87
KW
185/* This sets the UTF_CONTINUATION_MASK in the upper bits of a word. If a value
186 * is anded with it, and the result is non-zero, then using the original value
187 * in UTF8_ACCUMULATE will overflow, shifting bits off the left */
188#define UTF_ACCUMULATION_OVERFLOW_MASK \
bb88be5f
KW
189 (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS) \
190 - UTF_ACCUMULATION_SHIFT))
eb83ed87 191
/* UNISKIP(uv) yields a byte count for code point uv by comparing it against
 * successive upper limits; the first limit that uv is below decides the
 * answer.  With HAS_QUAD, values at or above UTF8_QUAD_MAX give 13; without
 * it the largest answer is 7.  The argument is evaluated more than once. */
#ifdef HAS_QUAD
#  define UNISKIP(uv) (((uv) < 0x80)           ? 1 :                        \
                      (((uv) < 0x800)          ? 2 :                        \
                      (((uv) < 0x10000)        ? 3 :                        \
                      (((uv) < 0x200000)       ? 4 :                        \
                      (((uv) < 0x4000000)      ? 5 :                        \
                      (((uv) < 0x80000000)     ? 6 :                        \
                      (((uv) < UTF8_QUAD_MAX)  ? 7 : 13)))))))
#else
/* No, I'm not even going to *TRY* putting #ifdef inside a #define */
#  define UNISKIP(uv) (((uv) < 0x80)           ? 1 :                        \
                      (((uv) < 0x800)          ? 2 :                        \
                      (((uv) < 0x10000)        ? 3 :                        \
                      (((uv) < 0x200000)       ? 4 :                        \
                      (((uv) < 0x4000000)      ? 5 :                        \
                      (((uv) < 0x80000000)     ? 6 : 7))))))
#endif
209
d06134e5
KW
210#endif /* EBCDIC vs ASCII */
211
212/* Rest of these are attributes of Unicode and perl's internals rather than the
213 * encoding, or happen to be the same in both ASCII and EBCDIC (at least at
214 * this level; the macros that some of these call may have different
215 * definitions in the two encodings */
216
217#define NATIVE8_TO_UNI(ch) NATIVE_TO_ASCII(ch) /* a clearer synonym */
218
bb88be5f
KW
/* Fold one more byte into a partially decoded value: 'old' is shifted left by
 * UTF_ACCUMULATION_SHIFT and the bits of 'new' selected by
 * UTF_CONTINUATION_MASK are OR-ed in.  'new' is parenthesized before the U8
 * cast so that an expression argument is narrowed as a whole, not just its
 * first operand. */
#define UTF8_ACCUMULATE(old, new) (((old) << UTF_ACCUMULATION_SHIFT)        \
                                   | (((U8)(new)) & UTF_CONTINUATION_MASK))
d06134e5 221
2950f2a7
KW
222/* Convert a two (not one) byte utf8 character to a unicode code point value.
223 * Needs just one iteration of accumulate. Should not be used unless it is
224 * known that the two bytes are legal: 1) two-byte start, and 2) continuation.
225 * Note that the result can be larger than 255 if the input character is not
226 * downgradable */
227#define TWO_BYTE_UTF8_TO_UNI(HI, LO) \
228 UTF8_ACCUMULATE((NATIVE_TO_UTF(HI) & UTF_START_MASK(2)), \
229 NATIVE_TO_UTF(LO))
230
d06134e5
KW
231#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
232
233#define UTF8_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE_TO_UTF(c))
234#define NATIVE_IS_INVARIANT(c) UNI_IS_INVARIANT(NATIVE8_TO_UNI(c))
235
236#define MAX_PORTABLE_UTF8_TWO_BYTE 0x3FF /* constrained by EBCDIC */
237
238/* The macros in the next sets are used to generate the two utf8 or utfebcdic
239 * bytes from an ordinal that is known to fit into two bytes; it must be no
240 * greater than MAX_PORTABLE_UTF8_TWO_BYTE (0x3FF) to work across both encodings. */
241/* Nocast allows these to be used in the case label of a switch statement */
428921e5
KW
/* First byte of the pair: the low byte of the two-byte start mark combined
 * with the bits of c above UTF_ACCUMULATION_SHIFT.  No cast is applied. */
#define UTF8_TWO_BYTE_HI_nocast(c)                                          \
            NATIVE_TO_I8((0xFF & UTF_START_MARK(2))                         \
                         | ((c) >> UTF_ACCUMULATION_SHIFT))

/* Second byte of the pair: the continuation mark combined with the bits of c
 * selected by UTF_CONTINUATION_MASK.  No cast is applied. */
#define UTF8_TWO_BYTE_LO_nocast(c)                                          \
            NATIVE_TO_I8(UTF_CONTINUATION_MARK                              \
                         | ((c) & UTF_CONTINUATION_MASK))

/* The same two values, narrowed to U8 */
#define UTF8_TWO_BYTE_HI(c)     ((U8) (UTF8_TWO_BYTE_HI_nocast(c)))
#define UTF8_TWO_BYTE_LO(c)     ((U8) (UTF8_TWO_BYTE_LO_nocast(c)))

/* This name is used when the source is a single byte */
#define UTF8_EIGHT_BIT_HI(c)    UTF8_TWO_BYTE_HI((U8)(c))
#define UTF8_EIGHT_BIT_LO(c)    UTF8_TWO_BYTE_LO((U8)(c))
253
7e2040f0 254/*
e3036cf4 255 * 'UTF' is whether or not p is encoded in UTF8. The names 'foo_lazy_if' stem
20df05f4
KW
256 * from an earlier version of these macros in which they didn't call the
257 * foo_utf8() macros (i.e. were 'lazy') unless they decided that *p is the
258 * beginning of a utf8 character. Now that foo_utf8() determines that itself,
259 * no need to do it again here
7e2040f0 260 */
e3036cf4
KW
/* When IN_BYTES is true or UTF is false, classify the single byte *p with the
 * non-UTF-8 macro; otherwise hand p to the _utf8 variant.  UTF and p are
 * parenthesized so that expression arguments (e.g. 'a || b', 's + 1') are
 * negated / cast as a whole. */
#define isIDFIRST_lazy_if(p,UTF) ((IN_BYTES || !(UTF))                      \
                                  ? isIDFIRST(*(p))                         \
                                  : isIDFIRST_utf8((const U8*)(p)))
#define isALNUM_lazy_if(p,UTF)   ((IN_BYTES || !(UTF))                      \
                                  ? isALNUM(*(p))                           \
                                  : isALNUM_utf8((const U8*)(p)))
1d72bdf6 267
7e2040f0
GS
268#define isIDFIRST_lazy(p) isIDFIRST_lazy_if(p,1)
269#define isALNUM_lazy(p) isALNUM_lazy_if(p,1)
3bd709b1 270
89ebb4a3
JH
271#define UTF8_MAXBYTES 13
272/* How wide can a single UTF-8 encoded character become in bytes.
273 * NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8
274 * since UTF-8 is an encoding of Unicode and given Unicode's current
275 * upper limit only four bytes is possible. Perl thinks of UTF-8
276 * as a way to encode non-negative integers in a binary format. */
277#define UTF8_MAXLEN UTF8_MAXBYTES
278
89ebb4a3
JH
279/* The maximum number of UTF-8 bytes a single Unicode character can
280 * uppercase/lowercase/fold into; this number depends on the Unicode
281 * version. An example of maximal expansion is the U+03B0 which
282 * uppercases to U+03C5 U+0308 U+0301. The Unicode databases that
d06134e5 283 * tell these things are UnicodeData.txt, CaseFolding.txt, and
88d45ddf
KW
284 * SpecialCasing.txt. The value is 6 for strict Unicode characters, but it has
285 * to be as big as Perl allows for a single character */
286#define UTF8_MAXBYTES_CASE UTF8_MAXBYTES
3bd709b1 287
8cb75cc8
KW
288/* A Unicode character can fold to up to 3 characters */
289#define UTF8_MAX_FOLD_CHAR_EXPAND 3
290
a98fe34d 291#define IN_BYTES (CopHINTS_get(PL_curcop) & HINT_BYTES)
0064a8a9 292#define DO_UTF8(sv) (SvUTF8(sv) && !IN_BYTES)
66cbab2c 293#define IN_UNI_8_BIT \
b36bf33f
KW
294 (CopHINTS_get(PL_curcop) & (HINT_UNI_8_BIT|HINT_LOCALE_NOT_CHARS) \
295 && ! IN_LOCALE_RUNTIME && ! IN_BYTES)
296
1d72bdf6 297
c76687c5
KW
298#define UTF8_ALLOW_EMPTY 0x0001 /* Allow a zero length string */
299
300/* Allow first byte to be a continuation byte */
1d72bdf6 301#define UTF8_ALLOW_CONTINUATION 0x0002
c76687c5
KW
302
303/* Allow second... bytes to be non-continuation bytes */
1d72bdf6 304#define UTF8_ALLOW_NON_CONTINUATION 0x0004
949cf498
KW
305
306/* expecting more bytes than were available in the string */
307#define UTF8_ALLOW_SHORT 0x0008
308
309/* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
310#define UTF8_ALLOW_LONG 0x0010
311
312#define UTF8_DISALLOW_SURROGATE 0x0020 /* Unicode surrogates */
313#define UTF8_WARN_SURROGATE 0x0040
314
315#define UTF8_DISALLOW_NONCHAR 0x0080 /* Unicode non-character */
316#define UTF8_WARN_NONCHAR 0x0100 /* code points */
317
318#define UTF8_DISALLOW_SUPER 0x0200 /* Super-set of Unicode: code */
319#define UTF8_WARN_SUPER 0x0400 /* points above the legal max */
320
321/* Code points which never were part of the original UTF-8 standard, the first
322 * byte of which is a FE or FF on ASCII platforms. */
323#define UTF8_DISALLOW_FE_FF 0x0800
324#define UTF8_WARN_FE_FF 0x1000
325
326#define UTF8_CHECK_ONLY 0x2000
327
328/* For backwards source compatibility. They do nothing, as the default now
329 * includes what they used to mean. The first one's meaning was to allow
330 * just the single non-character 0xFFFF */
331#define UTF8_ALLOW_FFFF 0
332#define UTF8_ALLOW_SURROGATE 0
333
33d9abfb 334#define UTF8_DISALLOW_ILLEGAL_INTERCHANGE (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_NONCHAR|UTF8_DISALLOW_SURROGATE|UTF8_DISALLOW_FE_FF)
949cf498 335#define UTF8_WARN_ILLEGAL_INTERCHANGE \
33d9abfb 336 (UTF8_WARN_SUPER|UTF8_WARN_NONCHAR|UTF8_WARN_SURROGATE|UTF8_WARN_FE_FF)
949cf498
KW
337#define UTF8_ALLOW_ANY \
338 (~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE))
339#define UTF8_ALLOW_ANYUV \
340 (UTF8_ALLOW_EMPTY \
341 & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE))
9f7f3913
TS
342#define UTF8_ALLOW_DEFAULT (ckWARN(WARN_UTF8) ? 0 : \
343 UTF8_ALLOW_ANYUV)
1d72bdf6 344
7131f24d
KW
345/* Surrogates, non-character code points and above-Unicode code points are
346 * problematic in some contexts. This allows code that needs to check for
347 * those to quickly exclude the vast majority of code points it will
348 * encounter */
349#ifdef EBCDIC
350# define UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE UTF_TO_NATIVE(0xF1)
351#else
352# define UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE 0xED
353#endif
354
355/* ASCII EBCDIC I8
356 * U+D7FF: \xED\x9F\xBF \xF1\xB5\xBF\xBF last before surrogates
357 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 1st surrogate
358 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF final surrogate
359 * U+E000: \xEE\x80\x80 \xF1\xB8\xA0\xA0 next after surrogates
360 */
/* UTF8_IS_SURROGATE(s): true if the bytes at s match the surrogate encodings
 * in the table above.  UTF8_IS_REPLACEMENT(s, send): true if the bytes at s
 * encode U+FFFD; <send> points to one beyond the end of the string that
 * starts at <s>, and the length is tested before any byte is read.
 * Both versions assume well-formed UTF8 */
#ifdef EBCDIC
#  define UTF8_IS_SURROGATE(s)  (*(s) == UTF_TO_NATIVE(0xF1)                 \
                                 && (*((s) + 1) == UTF_TO_NATIVE(0xB6)      \
                                     || *((s) + 1) == UTF_TO_NATIVE(0xB7)))
   /* In I8, U+FFFF is \xF1\xBF\xBF\xBF, so U+FFFD, which differs only in the
    * final byte, is \xF1\xBF\xBF\xBD */
#  define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 4                  \
                                        && *(s) == UTF_TO_NATIVE(0xF1)      \
                                        && *((s) + 1) == UTF_TO_NATIVE(0xBF)\
                                        && *((s) + 2) == UTF_TO_NATIVE(0xBF)\
                                        && *((s) + 3) == UTF_TO_NATIVE(0xBD))
#else
#  define UTF8_IS_SURROGATE(s)  (*(s) == 0xED && *((s) + 1) >= 0xA0)
#  define UTF8_IS_REPLACEMENT(s, send) (((send) - (s)) >= 3                  \
                                        && *(s) == 0xEF                     \
                                        && *((s) + 1) == 0xBF               \
                                        && *((s) + 2) == 0xBD)
#endif
378
379/* ASCII EBCDIC I8
380 * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode
381 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0
382 * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1
383 */
/* True if the character starting at s encodes a code point above U+10FFFF.
 * Per the table above, that means the first byte is beyond the one that
 * starts U+10FFFF, or equals it with a second byte at or beyond the one in
 * the encoding of U+110000.
 * Both versions assume well-formed UTF8 */
#ifdef EBCDIC
#  define UTF8_IS_SUPER(s) (NATIVE_TO_I8(*(s)) >= 0xF9                       \
                            && (NATIVE_TO_I8(*(s)) > 0xF9                   \
                                || NATIVE_TO_I8(*((s) + 1)) >= 0xA2))
#else
#  define UTF8_IS_SUPER(s) (*(s) >= 0xF4                                     \
                            && (*(s) > 0xF4 || (*((s) + 1) >= 0x90)))
#endif
391
392/* ASCII EBCDIC I8
393 * U+FDCF: \xEF\xB7\x8F \xF1\xBF\xAE\xAF last before non-char block
394 * U+FDD0: \xEF\xB7\x90 \xF1\xBF\xAE\xB0 first non-char in block
395 * U+FDEF: \xEF\xB7\xAF \xF1\xBF\xAF\xAF last non-char in block
396 * U+FDF0: \xEF\xB7\xB0 \xF1\xBF\xAF\xB0 first after non-char block
397 * U+FFFF: \xEF\xBF\xBF \xF1\xBF\xBF\xBF
398 * U+1FFFF: \xF0\x9F\xBF\xBF \xF3\xBF\xBF\xBF
399 * U+2FFFF: \xF0\xAF\xBF\xBF \xF5\xBF\xBF\xBF
400 * U+3FFFF: \xF0\xBF\xBF\xBF \xF7\xBF\xBF\xBF
401 * U+4FFFF: \xF1\x8F\xBF\xBF \xF8\xA9\xBF\xBF\xBF
402 * U+5FFFF: \xF1\x9F\xBF\xBF \xF8\xAB\xBF\xBF\xBF
403 * U+6FFFF: \xF1\xAF\xBF\xBF \xF8\xAD\xBF\xBF\xBF
404 * U+7FFFF: \xF1\xBF\xBF\xBF \xF8\xAF\xBF\xBF\xBF
405 * U+8FFFF: \xF2\x8F\xBF\xBF \xF8\xB1\xBF\xBF\xBF
406 * U+9FFFF: \xF2\x9F\xBF\xBF \xF8\xB3\xBF\xBF\xBF
407 * U+AFFFF: \xF2\xAF\xBF\xBF \xF8\xB5\xBF\xBF\xBF
408 * U+BFFFF: \xF2\xBF\xBF\xBF \xF8\xB7\xBF\xBF\xBF
409 * U+CFFFF: \xF3\x8F\xBF\xBF \xF8\xB9\xBF\xBF\xBF
410 * U+DFFFF: \xF3\x9F\xBF\xBF \xF8\xBB\xBF\xBF\xBF
411 * U+EFFFF: \xF3\xAF\xBF\xBF \xF8\xBD\xBF\xBF\xBF
412 * U+FFFFF: \xF3\xBF\xBF\xBF \xF8\xBF\xBF\xBF\xBF
413 * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF
414 */
/* True if the character starting at s passes all three tests: its first byte
 * is at least UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE, it is not
 * UTF8_IS_SUPER, and the helper
 * UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC accepts it.  The
 * helper is referenced by the name under which it is actually defined. */
#define UTF8_IS_NONCHAR_(s) (                                               \
    *(s) >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE                    \
    && ! UTF8_IS_SUPER(s)                                                   \
    && UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))

/* Helper for the non-character test.  The first alternative matches the
 * U+FDD0..U+FDEF block (and, on ASCII platforms, U+FFFE and U+FFFF); the
 * second matches longer characters whose trailing bytes have the ...FFFE /
 * ...FFFF pattern shown in the table above.
 * Both versions assume well-formed UTF8 */
#ifdef EBCDIC
   /* UTF8SKIP() takes the pointer itself, not the byte it points to */
#  define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)         \
    ((*(s) == UTF_TO_NATIVE(0xF1)                                           \
      && (*((s) + 1) == UTF_TO_NATIVE(0xBF)                                 \
          && ((*((s) + 2) == UTF_TO_NATIVE(0xAE)                            \
               && *((s) + 3) >= UTF_TO_NATIVE(0xB0))                        \
              || (*((s) + 2) == UTF_TO_NATIVE(0xAF)                         \
                  && *((s) + 3) <= UTF_TO_NATIVE(0xAF)))))                  \
     || (UTF8SKIP(s) > 3                                                    \
         /* (These were all derived by inspection and experimentation with  \
          * an editor)  The next line checks the next to final byte in the  \
          * char */                                                         \
         && *((s) + UTF8SKIP(s) - 2) == UTF_TO_NATIVE(0xBF)                 \
         && *((s) + UTF8SKIP(s) - 3) == UTF_TO_NATIVE(0xBF)                 \
         && (NATIVE_TO_UTF(*((s) + UTF8SKIP(s) - 4)) & 0x81) == 0x81        \
         && (NATIVE_TO_UTF(*((s) + UTF8SKIP(s) - 1)) & 0xBE) == 0xBE))
#else
#  define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s)         \
    ((*(s) == 0xEF                                                          \
      && ((*((s) + 1) == 0xB7 && (*((s) + 2) >= 0x90 && (*((s) + 2) <= 0xAF)))\
          /* Gets U+FFF[EF] */                                              \
          || (*((s) + 1) == 0xBF && ((*((s) + 2) & 0xBE) == 0xBE))))        \
     /* The remaining ones all take four bytes (start byte F0 or above);    \
      * requiring that keeps the test from looking at a fourth byte that    \
      * belongs to whatever follows a three-byte character */               \
     || (*(s) >= 0xF0                                                       \
         && *((s) + 2) == 0xBF                                              \
         && (*((s) + 3) & 0xBE) == 0xBE                                     \
         /* Excludes things like U+10FFE = \xF0\x90\xBF\xBE */              \
         && (*((s) + 1) & 0x8F) == 0x8F))
#endif
446
c867b360
JH
447#define UNICODE_SURROGATE_FIRST 0xD800
448#define UNICODE_SURROGATE_LAST 0xDFFF
449#define UNICODE_REPLACEMENT 0xFFFD
450#define UNICODE_BYTE_ORDER_MARK 0xFEFF
1d72bdf6 451
b851fbc1 452/* Though our UTF-8 encoding can go beyond this,
c76687c5 453 * let's be conservative and do as Unicode says. */
b851fbc1
JH
454#define PERL_UNICODE_MAX 0x10FFFF
455
949cf498
KW
456#define UNICODE_WARN_SURROGATE 0x0001 /* UTF-16 surrogates */
457#define UNICODE_WARN_NONCHAR 0x0002 /* Non-char code points */
458#define UNICODE_WARN_SUPER 0x0004 /* Above 0x10FFFF */
459#define UNICODE_WARN_FE_FF 0x0008 /* Above 0x7FFFFFFF */
460#define UNICODE_DISALLOW_SURROGATE 0x0010
461#define UNICODE_DISALLOW_NONCHAR 0x0020
462#define UNICODE_DISALLOW_SUPER 0x0040
463#define UNICODE_DISALLOW_FE_FF 0x0080
bb88be5f
KW
464#define UNICODE_WARN_ILLEGAL_INTERCHANGE \
465 (UNICODE_WARN_SURROGATE|UNICODE_WARN_NONCHAR|UNICODE_WARN_SUPER)
466#define UNICODE_DISALLOW_ILLEGAL_INTERCHANGE \
467 (UNICODE_DISALLOW_SURROGATE|UNICODE_DISALLOW_NONCHAR|UNICODE_DISALLOW_SUPER)
949cf498
KW
468
469/* For backward source compatibility, as are now the default */
470#define UNICODE_ALLOW_SURROGATE 0
471#define UNICODE_ALLOW_SUPER 0
472#define UNICODE_ALLOW_ANY 0
b851fbc1 473
1d72bdf6
NIS
/* Predicates on a code point c.  Every use of c is parenthesized so that an
 * expression argument is compared or masked as a whole.  c may be evaluated
 * more than once. */
#define UNICODE_IS_SURROGATE(c)         ((c) >= UNICODE_SURROGATE_FIRST &&  \
                                         (c) <= UNICODE_SURROGATE_LAST)
#define UNICODE_IS_REPLACEMENT(c)       ((c) == UNICODE_REPLACEMENT)
#define UNICODE_IS_BYTE_ORDER_MARK(c)   ((c) == UNICODE_BYTE_ORDER_MARK)
#define UNICODE_IS_NONCHAR(c)           (((c) >= 0xFDD0 && (c) <= 0xFDEF)   \
                        /* The other noncharacters end in FFFE or FFFF,     \
                         * which the mask below catches both of, but beyond \
                         * the last official unicode code point, they       \
                         * aren't noncharacters, since those aren't Unicode \
                         * characters at all */                             \
                        || ((((c) & 0xFFFE) == 0xFFFE) && ! UNICODE_IS_SUPER(c)))
#define UNICODE_IS_SUPER(c)             ((c) > PERL_UNICODE_MAX)
#define UNICODE_IS_FE_FF(c)             ((c) > 0x7FFFFFFF)
1d72bdf6 487
f067b878
NA
488#ifdef HAS_QUAD
489# define UTF8_QUAD_MAX UINT64_C(0x1000000000)
490#endif
3bd709b1 491
ec34087a
KW
492#define LATIN_SMALL_LETTER_SHARP_S LATIN_SMALL_LETTER_SHARP_S_NATIVE
493#define LATIN_SMALL_LETTER_Y_WITH_DIAERESIS \
494 LATIN_SMALL_LETTER_Y_WITH_DIAERESIS_NATIVE
495#define MICRO_SIGN MICRO_SIGN_NATIVE
496#define LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE \
497 LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE_NATIVE
498#define LATIN_SMALL_LETTER_A_WITH_RING_ABOVE \
499 LATIN_SMALL_LETTER_A_WITH_RING_ABOVE_NATIVE
09091399
JH
500#define UNICODE_GREEK_CAPITAL_LETTER_SIGMA 0x03A3
501#define UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA 0x03C2
502#define UNICODE_GREEK_SMALL_LETTER_SIGMA 0x03C3
9dcbe121 503#define GREEK_SMALL_LETTER_MU 0x03BC
78a0d3cc
KW
504#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRO SIGN */
505#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */
97298f37 506#define LATIN_CAPITAL_LETTER_SHARP_S 0x1E9E
8d64d87f
KW
507#define LATIN_SMALL_LETTER_LONG_S 0x017F
508#define KELVIN_SIGN 0x212A
509#define ANGSTROM_SIGN 0x212B
09091399 510
9e55ce06 511#define UNI_DISPLAY_ISPRINT 0x0001
c728cb41
JH
512#define UNI_DISPLAY_BACKSLASH 0x0002
513#define UNI_DISPLAY_QQ (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
514#define UNI_DISPLAY_REGEX (UNI_DISPLAY_ISPRINT|UNI_DISPLAY_BACKSLASH)
9e55ce06 515
5cd46e1f
KW
516#define ANYOF_FOLD_SHARP_S(node, input, end) \
517 (ANYOF_BITMAP_TEST(node, LATIN_SMALL_LETTER_SHARP_S) && \
137165a6 518 (ANYOF_NONBITMAP(node)) && \
39065660 519 (ANYOF_FLAGS(node) & ANYOF_LOC_NONBITMAP_FOLD) && \
07b6858f
JH
520 ((end) > (input) + 1) && \
521 toLOWER((input)[0]) == 's' && \
522 toLOWER((input)[1]) == 's')
ebc501f0 523#define SHARP_S_SKIP 2
3b0fc154 524
2f454f11 525#ifndef EBCDIC
a4f7a67c
KW
526/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
527 * log for earlier versions which gave details for these */
/* Fast validity checks for characters of one to four bytes.  p is indexed as
 * an array of bytes and every byte is range-checked in order, start byte
 * first.  The _3a and _4a forms cover the start bytes (E0, F0) whose second
 * byte has a raised lower bound. */
#   define IS_UTF8_CHAR_1(p)                                                \
        (0x7F >= (p)[0])
#   define IS_UTF8_CHAR_2(p)                                                \
        (0xC2 <= (p)[0] && 0xDF >= (p)[0]                                   \
      && 0x80 <= (p)[1] && 0xBF >= (p)[1])
#   define IS_UTF8_CHAR_3a(p)                                               \
        (0xE0 == (p)[0]                                                     \
      && 0xA0 <= (p)[1] && 0xBF >= (p)[1]                                   \
      && 0x80 <= (p)[2] && 0xBF >= (p)[2])
#   define IS_UTF8_CHAR_3b(p)                                               \
        (0xE1 <= (p)[0] && 0xEF >= (p)[0]                                   \
      && 0x80 <= (p)[1] && 0xBF >= (p)[1]                                   \
      && 0x80 <= (p)[2] && 0xBF >= (p)[2])
#   define IS_UTF8_CHAR_4a(p)                                               \
        (0xF0 == (p)[0]                                                     \
      && 0x90 <= (p)[1] && 0xBF >= (p)[1]                                   \
      && 0x80 <= (p)[2] && 0xBF >= (p)[2]                                   \
      && 0x80 <= (p)[3] && 0xBF >= (p)[3])
/* The 0xF7 allows us to go to 0x1fffff (0x200000 would
 * require five bytes).  Not doing any further code points
 * since that is not needed (and that would not be strict
 * UTF-8, anyway).  The "slow path" in Perl_is_utf8_char()
 * will take care of the "extended UTF-8". */
#   define IS_UTF8_CHAR_4b(p)                                               \
        (0xF1 <= (p)[0] && 0xF7 >= (p)[0]                                   \
      && 0x80 <= (p)[1] && 0xBF >= (p)[1]                                   \
      && 0x80 <= (p)[2] && 0xBF >= (p)[2]                                   \
      && 0x80 <= (p)[3] && 0xBF >= (p)[3])

#   define IS_UTF8_CHAR_3(p)                                                \
        (IS_UTF8_CHAR_3a(p) || IS_UTF8_CHAR_3b(p))
#   define IS_UTF8_CHAR_4(p)                                                \
        (IS_UTF8_CHAR_4a(p) || IS_UTF8_CHAR_4b(p))

/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
 * (1) allows UTF-8 encoded UTF-16 surrogates
 * (2) it allows code points past U+10FFFF.
 * The Perl_is_utf8_char() full "slow" code will handle the Perl
 * "extended UTF-8". */
#   define IS_UTF8_CHAR(p, n)                                               \
        ((n) == 1 ? IS_UTF8_CHAR_1(p) :                                     \
        ((n) == 2 ? IS_UTF8_CHAR_2(p) :                                     \
        ((n) == 3 ? IS_UTF8_CHAR_3(p) :                                     \
        ((n) == 4 ? IS_UTF8_CHAR_4(p) : 0))))

#   define IS_UTF8_CHAR_FAST(n) ((n) <= 4)
768c67ee 576
2f454f11
KW
577#else /* EBCDIC */
578
579/* This is an attempt to port IS_UTF8_CHAR to EBCDIC based on eyeballing;
580 * it is untested. If you want to exclude surrogates and above-Unicode, see the
581 * definitions for UTF8_IS_SURROGATE and UTF8_IS_SUPER */
/* Fast validity checks for characters of one to five bytes.  Each byte of p
 * is translated with NATIVE_TO_I8() (NATIVE_TO_ASCII() for the single-byte
 * case) and range-checked; every byte position of the character is tested
 * exactly once, and the start-byte range is tested on (p)[0]. */
#   define IS_UTF8_CHAR_1(p)                                                \
        (NATIVE_TO_ASCII((p)[0]) <= 0x9F)
#   define IS_UTF8_CHAR_2(p)                                                \
        (NATIVE_TO_I8((p)[0]) >= 0xC5 && NATIVE_TO_I8((p)[0]) <= 0xDF &&    \
         NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF)
   /* Start bytes E1..EF, by analogy with the F1..F7 range of _4b */
#   define IS_UTF8_CHAR_3(p)                                                \
        (NATIVE_TO_I8((p)[0]) >= 0xE1 && NATIVE_TO_I8((p)[0]) <= 0xEF &&    \
         NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF)
#   define IS_UTF8_CHAR_4a(p)                                               \
        (NATIVE_TO_I8((p)[0]) == 0xF0 &&                                    \
         NATIVE_TO_I8((p)[1]) >= 0xB0 && NATIVE_TO_I8((p)[1]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
#   define IS_UTF8_CHAR_4b(p)                                               \
        (NATIVE_TO_I8((p)[0]) >= 0xF1 && NATIVE_TO_I8((p)[0]) <= 0xF7 &&    \
         NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF)
#   define IS_UTF8_CHAR_5a(p)                                               \
        (NATIVE_TO_I8((p)[0]) == 0xF8 &&                                    \
         NATIVE_TO_I8((p)[1]) >= 0xA8 && NATIVE_TO_I8((p)[1]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[4]) >= 0xA0 && NATIVE_TO_I8((p)[4]) <= 0xBF)
#   define IS_UTF8_CHAR_5b(p)                                               \
        (NATIVE_TO_I8((p)[0]) >= 0xF9 && NATIVE_TO_I8((p)[0]) <= 0xFB &&    \
         NATIVE_TO_I8((p)[1]) >= 0xA0 && NATIVE_TO_I8((p)[1]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[2]) >= 0xA0 && NATIVE_TO_I8((p)[2]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[3]) >= 0xA0 && NATIVE_TO_I8((p)[3]) <= 0xBF &&    \
         NATIVE_TO_I8((p)[4]) >= 0xA0 && NATIVE_TO_I8((p)[4]) <= 0xBF)

#   define IS_UTF8_CHAR_4(p)                                                \
        (IS_UTF8_CHAR_4a(p) ||                                              \
         IS_UTF8_CHAR_4b(p))
#   define IS_UTF8_CHAR_5(p)                                                \
        (IS_UTF8_CHAR_5a(p) ||                                              \
         IS_UTF8_CHAR_5b(p))
#   define IS_UTF8_CHAR(p, n)                                               \
        ((n) == 1 ? IS_UTF8_CHAR_1(p) :                                     \
         (n) == 2 ? IS_UTF8_CHAR_2(p) :                                     \
         (n) == 3 ? IS_UTF8_CHAR_3(p) :                                     \
         (n) == 4 ? IS_UTF8_CHAR_4(p) :                                     \
         (n) == 5 ? IS_UTF8_CHAR_5(p) : 0)

#   define IS_UTF8_CHAR_FAST(n) ((n) <= 5)
628
77263263 629#endif /* IS_UTF8_CHAR() for UTF-8 */
e9a8c099
MHM
630
631/*
632 * Local variables:
633 * c-indentation-style: bsd
634 * c-basic-offset: 4
14d04a33 635 * indent-tabs-mode: nil
e9a8c099
MHM
636 * End:
637 *
14d04a33 638 * ex: set ts=8 sts=4 sw=4 et:
e9a8c099 639 */