This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Silence win32 compiler warning.
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
147e3846 40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf;
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
147e3846 75 "UTF-16 surrogate U+%04" UVXf, uv); \
8ee1cdcb
KW
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
147e3846 86 "Unicode non-character U+%04" UVXf " is not " \
8ee1cdcb
KW
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
147e3846
KW
167 ? "Code point 0x%" UVXf " is not Unicode, and not portable"
168 : "Code point 0x%" UVXf " is not Unicode, may not be portable",
a5bf80e0
KW
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
37086697
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
12a4bed3
KW
427PERL_STATIC_INLINE bool
428S_does_utf8_overflow(const U8 * const s, const U8 * e)
429{
430 const U8 *x;
431 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
432
c551bb01
KW
433#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
434
435 const STRLEN len = e - s;
436
437#endif
438
12a4bed3
KW
439 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
440 * platform, that is if it represents a code point larger than the highest
441 * representable code point. (For ASCII platforms, we could use memcmp()
442 * because we don't have to convert each byte to I8, but it's very rare
443 * input indeed that would approach overflow, so the loop below will likely
444 * only get executed once.
445 *
446 * 'e' must not be beyond a full character. If it is less than a full
447 * character, the function returns FALSE if there is any input beyond 'e'
448 * that could result in a non-overflowing code point */
449
450 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
b0b342d4
KW
451 assert(s <= e && s + UTF8SKIP(s) >= e);
452
453#if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
454
455 /* On 32 bit ASCII machines, many overlongs that start with FF don't
456 * overflow */
457
c551bb01 458 if (isFF_OVERLONG(s, len)) {
b0b342d4
KW
459 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
460 return memGE(s, max_32_bit_overlong,
c551bb01 461 MIN(len, sizeof(max_32_bit_overlong) - 1));
b0b342d4
KW
462 }
463
464#endif
12a4bed3
KW
465
466 for (x = s; x < e; x++, y++) {
467
468 /* If this byte is larger than the corresponding highest UTF-8 byte, it
469 * overflows */
470 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
471 return TRUE;
472 }
473
474 /* If not the same as this byte, it must be smaller, doesn't overflow */
475 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
476 return FALSE;
477 }
478 }
479
480 /* Got to the end and all bytes are the same. If the input is a whole
481 * character, it doesn't overflow. And if it is a partial character,
482 * there's not enough information to tell, so assume doesn't overflow */
483 return FALSE;
484}
485
486PERL_STATIC_INLINE bool
487S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
488{
489 /* Overlongs can occur whenever the number of continuation bytes
490 * changes. That means whenever the number of leading 1 bits in a start
491 * byte increases from the next lower start byte. That happens for start
492 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
493 * illegal start bytes have already been excluded, so don't need to be
494 * tested here;
495 * ASCII platforms: C0, C1
496 * EBCDIC platforms C0, C1, C2, C3, C4, E0
497 *
498 * At least a second byte is required to determine if other sequences will
499 * be an overlong. */
500
501 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
502 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
503
504 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
505 assert(len > 1 && UTF8_IS_START(*s));
506
507 /* Each platform has overlongs after the start bytes given above (expressed
508 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
509 * the logic is the same, except the E0 overlong has already been excluded
510 * on EBCDIC platforms. The values below were found by manually
511 * inspecting the UTF-8 patterns. See the tables in utf8.h and
512 * utfebcdic.h. */
513
514# ifdef EBCDIC
515# define F0_ABOVE_OVERLONG 0xB0
516# define F8_ABOVE_OVERLONG 0xA8
517# define FC_ABOVE_OVERLONG 0xA4
518# define FE_ABOVE_OVERLONG 0xA2
519# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
520 /* I8(0xfe) is FF */
521# else
522
523 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
524 return TRUE;
525 }
526
527# define F0_ABOVE_OVERLONG 0x90
528# define F8_ABOVE_OVERLONG 0x88
529# define FC_ABOVE_OVERLONG 0x84
530# define FE_ABOVE_OVERLONG 0x82
531# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
532# endif
533
534
535 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
536 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
537 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
538 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
539 {
540 return TRUE;
541 }
542
b0b342d4
KW
543 /* Check for the FF overlong */
544 return isFF_OVERLONG(s, len);
545}
546
547PERL_STATIC_INLINE bool
548S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
549{
550 PERL_ARGS_ASSERT_ISFF_OVERLONG;
12a4bed3
KW
551
552 /* Check for the FF overlong. This happens only if all these bytes match;
553 * what comes after them doesn't matter. See tables in utf8.h,
b0b342d4 554 * utfebcdic.h. */
12a4bed3 555
b0b342d4
KW
556 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
557 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
558 sizeof(FF_OVERLONG_PREFIX) - 1));
12a4bed3
KW
559}
560
561#undef F0_ABOVE_OVERLONG
562#undef F8_ABOVE_OVERLONG
563#undef FC_ABOVE_OVERLONG
564#undef FE_ABOVE_OVERLONG
565#undef FF_OVERLONG_PREFIX
566
35f8c9bd 567STRLEN
edc2c47a 568Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 569{
2b479609 570 STRLEN len;
12a4bed3 571 const U8 *x;
35f8c9bd 572
2b479609
KW
573 /* A helper function that should not be called directly.
574 *
575 * This function returns non-zero if the string beginning at 's' and
576 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
577 * code point; otherwise it returns 0. The examination stops after the
578 * first code point in 's' is validated, not looking at the rest of the
579 * input. If 'e' is such that there are not enough bytes to represent a
580 * complete code point, this function will return non-zero anyway, if the
581 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
582 * excluded by 'flags'.
583 *
584 * A non-zero return gives the number of bytes required to represent the
585 * code point. Be aware that if the input is for a partial character, the
586 * return will be larger than 'e - s'.
587 *
588 * This function assumes that the code point represented is UTF-8 variant.
589 * The caller should have excluded this possibility before calling this
590 * function.
591 *
592 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
593 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
594 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
595 * disallowed by the flags. If the input is only for a partial character,
596 * the function will return non-zero if there is any sequence of
597 * well-formed UTF-8 that, when appended to the input sequence, could
598 * result in an allowed code point; otherwise it returns 0. Non characters
599 * cannot be determined based on partial character input. But many of the
600 * other excluded types can be determined with just the first one or two
601 * bytes.
602 *
603 */
604
605 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
606
607 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
608 |UTF8_DISALLOW_ABOVE_31_BIT)));
609 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 610
2b479609 611 /* A variant char must begin with a start byte */
35f8c9bd
KW
612 if (UNLIKELY(! UTF8_IS_START(*s))) {
613 return 0;
614 }
615
edc2c47a
KW
616 /* Examine a maximum of a single whole code point */
617 if (e - s > UTF8SKIP(s)) {
618 e = s + UTF8SKIP(s);
619 }
620
2b479609
KW
621 len = e - s;
622
623 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
624 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 625
2b479609
KW
626 /* The code below is derived from this table. Keep in mind that legal
627 * continuation bytes range between \x80..\xBF for UTF-8, and
628 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
629 * Hence, we don't have to test the upper edge because if any of those
630 * are encountered, the sequence is malformed, and will fail elsewhere
631 * in this function.
632 * UTF-8 UTF-EBCDIC I8
633 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
634 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
635 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
636 *
637 */
638
639#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
640# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 641# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 642
19794540
KW
643# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
644 /* B6 and B7 */ \
645 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
646#else
647# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
648# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
649# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
650#endif
651
652 if ( (flags & UTF8_DISALLOW_SUPER)
653 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
654 return 0; /* Above Unicode */
655 }
656
657 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
658 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
659 {
660 return 0; /* Above 31 bits */
661 }
662
663 if (len > 1) {
664 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
665
666 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 667 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
668 {
669 return 0; /* Above Unicode */
670 }
671
672 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 673 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
674 {
675 return 0; /* Surrogate */
676 }
677
678 if ( (flags & UTF8_DISALLOW_NONCHAR)
679 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
680 {
681 return 0; /* Noncharacter code point */
682 }
683 }
684 }
685
686 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
687 for (x = s + 1; x < e; x++) {
688 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
689 return 0;
690 }
691 }
692
af13dd8a 693 /* Here is syntactically valid. Next, make sure this isn't the start of an
12a4bed3
KW
694 * overlong. */
695 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
696 return 0;
af13dd8a
KW
697 }
698
12a4bed3
KW
699 /* And finally, that the code point represented fits in a word on this
700 * platform */
701 if (does_utf8_overflow(s, e)) {
702 return 0;
35f8c9bd
KW
703 }
704
2b479609 705 return UTF8SKIP(s);
35f8c9bd
KW
706}
707
7cf8d05d
KW
708STATIC char *
709S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
710{
711 /* Returns a mortalized C string that is a displayable copy of the 'len'
712 * bytes starting at 's', each in a \xXY format. */
713
714 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
715 trailing NUL */
716 const U8 * const e = s + len;
717 char * output;
718 char * d;
719
720 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
721
722 Newx(output, output_len, char);
723 SAVEFREEPV(output);
724
725 d = output;
726 for (; s < e; s++) {
727 const unsigned high_nibble = (*s & 0xF0) >> 4;
728 const unsigned low_nibble = (*s & 0x0F);
729
730 *d++ = '\\';
731 *d++ = 'x';
732
733 if (high_nibble < 10) {
734 *d++ = high_nibble + '0';
735 }
736 else {
737 *d++ = high_nibble - 10 + 'a';
738 }
739
740 if (low_nibble < 10) {
741 *d++ = low_nibble + '0';
742 }
743 else {
744 *d++ = low_nibble - 10 + 'a';
745 }
746 }
747
748 *d = '\0';
749 return output;
750}
751
806547a7 752PERL_STATIC_INLINE char *
7cf8d05d
KW
753S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
754
755 /* How many bytes to print */
3cc6a05e 756 STRLEN print_len,
7cf8d05d
KW
757
758 /* Which one is the non-continuation */
759 const STRLEN non_cont_byte_pos,
760
761 /* How many bytes should there be? */
762 const STRLEN expect_len)
806547a7
KW
763{
764 /* Return the malformation warning text for an unexpected continuation
765 * byte. */
766
7cf8d05d 767 const char * const where = (non_cont_byte_pos == 1)
806547a7 768 ? "immediately"
7cf8d05d
KW
769 : Perl_form(aTHX_ "%d bytes",
770 (int) non_cont_byte_pos);
3cc6a05e 771 unsigned int i;
806547a7
KW
772
773 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
774
7cf8d05d
KW
775 /* We don't need to pass this parameter, but since it has already been
776 * calculated, it's likely faster to pass it; verify under DEBUGGING */
777 assert(expect_len == UTF8SKIP(s));
778
3cc6a05e
KW
779 /* It is possible that utf8n_to_uvchr() was called incorrectly, with a
780 * length that is larger than is actually available in the buffer. If we
781 * print all the bytes based on that length, we will read past the buffer
782 * end. Often, the strings are NUL terminated, so to lower the chances of
783 * this happening, print the malformed bytes only up through any NUL. */
784 for (i = 1; i < print_len; i++) {
785 if (*(s + i) == '\0') {
786 print_len = i + 1; /* +1 gets the NUL printed */
787 break;
788 }
789 }
790
7cf8d05d
KW
791 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
792 " %s after start byte 0x%02x; need %d bytes, got %d)",
793 malformed_text,
794 _byte_dump_string(s, print_len),
795 *(s + non_cont_byte_pos),
796 where,
797 *s,
798 (int) expect_len,
799 (int) non_cont_byte_pos);
806547a7
KW
800}
801
35f8c9bd
KW
802/*
803
de69f3af 804=for apidoc utf8n_to_uvchr
378516de
KW
805
806THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 807Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 808
9041c2e3 809Bottom level UTF-8 decode routine.
de69f3af 810Returns the native code point value of the first character in the string C<s>,
746afd53
KW
811which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
812C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
813the length, in bytes, of that character.
949cf498
KW
814
815The value of C<flags> determines the behavior when C<s> does not point to a
2b5e7bc2
KW
816well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
817causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
818is the next possible position in C<s> that could begin a non-malformed
819character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
820is raised. Some UTF-8 input sequences may contain multiple malformations.
821This function tries to find every possible one in each call, so multiple
822warnings can be raised for each sequence.
949cf498
KW
823
824Various ALLOW flags can be set in C<flags> to allow (and not warn on)
825individual types of malformations, such as the sequence being overlong (that
826is, when there is a shorter sequence that can express the same code point;
827overlong sequences are expressly forbidden in the UTF-8 standard due to
828potential security issues). Another malformation example is the first byte of
829a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
830flags. For allowed 0 length strings, this function returns 0; for allowed
831overlong sequences, the computed code point is returned; for all other allowed
832malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
833determinable reasonable value.
949cf498 834
796b6530 835The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
836flags) malformation is found. If this flag is set, the routine assumes that
837the caller will raise a warning, and this function will silently just set
d088425d
KW
838C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
839
75200dff 840Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 841character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
842in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
843be set to 1. To disambiguate, upon a zero return, see if the first byte of
844C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
f9380377 845error. Or you can use C<L</utf8n_to_uvchr_error>>.
949cf498
KW
846
847Certain code points are considered problematic. These are Unicode surrogates,
746afd53 848Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 849By default these are considered regular code points, but certain situations
ecc1615f
KW
850warrant special handling for them, which can be specified using the C<flags>
851parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
852three classes are treated as malformations and handled as such. The flags
853C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
854C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
855disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
856restricts the allowed inputs to the strict UTF-8 traditionally defined by
857Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
858definition given by
859L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
860The difference between traditional strictness and C9 strictness is that the
861latter does not forbid non-character code points. (They are still discouraged,
862however.) For more discussion see L<perlunicode/Noncharacter code points>.
863
864The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
865C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
866C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
867raised for their respective categories, but otherwise the code points are
868considered valid (not malformations). To get a category to both be treated as
869a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 870(But note that warnings are not raised if lexically disabled nor if
796b6530 871C<UTF8_CHECK_ONLY> is also specified.)
949cf498 872
760c7c2f
KW
873It is now deprecated to have very high code points (above C<IV_MAX> on the
874platforms) and this function will raise a deprecation warning for these (unless
d5944cab 875such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
760c7c2f 876in a 32-bit word.
ab8e6d41
KW
877
878Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
879so using them is more problematic than other above-Unicode code points. Perl
880invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
881likely that non-Perl languages will not be able to read files that contain
aff2be59 882these; nor would Perl understand files
ab8e6d41
KW
883written by something that uses a different extension. For these reasons, there
884is a separate set of flags that can warn and/or disallow these extremely high
885code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
886C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
887are entirely independent from the deprecation warning for code points above
888C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
889code point that needs more than 31 bits to represent. When that happens,
890effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
89132-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
892above-Unicode code points, including these, as malformations; and
893C<UTF8_WARN_SUPER> warns on these.)
894
895On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
896extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
897than on ASCII. Prior to that, code points 2**31 and higher were simply
898unrepresentable, and a different, incompatible method was used to represent
899code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
900and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
901platforms, warning and disallowing 2**31 and higher.
949cf498
KW
902
903All other code points corresponding to Unicode characters, including private
904use and those yet to be assigned, are never considered malformed and never
905warn.
67e989fb 906
37607a96 907=cut
f9380377
KW
908
909Also implemented as a macro in utf8.h
910*/
911
912UV
913Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
914 STRLEN curlen,
915 STRLEN *retlen,
916 const U32 flags)
917{
918 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
919
920 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
921}
922
923/*
924
925=for apidoc utf8n_to_uvchr_error
926
927THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
928Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
929
930This function is for code that needs to know what the precise malformation(s)
931are when an error is found.
932
933It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
934all the others, C<errors>. If this parameter is 0, this function behaves
935identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
936to a C<U32> variable, which this function sets to indicate any errors found.
937Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
938C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
939of these bits will be set if a malformation is found, even if the input
7a65503b 940C<flags> parameter indicates that the given malformation is allowed; those
f9380377
KW
941exceptions are noted:
942
943=over 4
944
945=item C<UTF8_GOT_ABOVE_31_BIT>
946
947The code point represented by the input UTF-8 sequence occupies more than 31
948bits.
949This bit is set only if the input C<flags> parameter contains either the
950C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
951
952=item C<UTF8_GOT_CONTINUATION>
953
954The input sequence was malformed in that the first byte was a a UTF-8
955continuation byte.
956
957=item C<UTF8_GOT_EMPTY>
958
959The input C<curlen> parameter was 0.
960
961=item C<UTF8_GOT_LONG>
962
963The input sequence was malformed in that there is some other sequence that
964evaluates to the same code point, but that sequence is shorter than this one.
965
966=item C<UTF8_GOT_NONCHAR>
967
968The code point represented by the input UTF-8 sequence is for a Unicode
969non-character code point.
970This bit is set only if the input C<flags> parameter contains either the
971C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
972
973=item C<UTF8_GOT_NON_CONTINUATION>
974
975The input sequence was malformed in that a non-continuation type byte was found
976in a position where only a continuation type one should be.
977
978=item C<UTF8_GOT_OVERFLOW>
979
980The input sequence was malformed in that it is for a code point that is not
981representable in the number of bits available in a UV on the current platform.
982
983=item C<UTF8_GOT_SHORT>
984
985The input sequence was malformed in that C<curlen> is smaller than required for
986a complete sequence. In other words, the input is for a partial character
987sequence.
988
989=item C<UTF8_GOT_SUPER>
990
991The input sequence was malformed in that it is for a non-Unicode code point;
992that is, one above the legal Unicode maximum.
993This bit is set only if the input C<flags> parameter contains either the
994C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
995
996=item C<UTF8_GOT_SURROGATE>
997
998The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
999code point.
1000This bit is set only if the input C<flags> parameter contains either the
1001C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1002
1003=back
1004
133551d8
KW
1005To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1006flag to suppress any warnings, and then examine the C<*errors> return.
1007
f9380377 1008=cut
37607a96 1009*/
67e989fb 1010
a0ed51b3 1011UV
f9380377
KW
1012Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1013 STRLEN curlen,
1014 STRLEN *retlen,
1015 const U32 flags,
1016 U32 * errors)
a0ed51b3 1017{
d4c19fe8 1018 const U8 * const s0 = s;
2b5e7bc2
KW
1019 U8 * send = NULL; /* (initialized to silence compilers' wrong
1020 warning) */
1021 U32 possible_problems = 0; /* A bit is set here for each potential problem
1022 found as we go along */
eb83ed87 1023 UV uv = *s;
2b5e7bc2
KW
1024 STRLEN expectlen = 0; /* How long should this sequence be?
1025 (initialized to silence compilers' wrong
1026 warning) */
e308b348 1027 STRLEN avail_len = 0; /* When input is too short, gives what that is */
f9380377
KW
1028 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1029 this gets set and discarded */
a0dbb045 1030
2b5e7bc2
KW
1031 /* The below are used only if there is both an overlong malformation and a
1032 * too short one. Otherwise the first two are set to 's0' and 'send', and
1033 * the third not used at all */
1034 U8 * adjusted_s0 = (U8 *) s0;
5ec712b1
KW
1035 U8 * adjusted_send = NULL; /* (Initialized to silence compilers' wrong
1036 warning) */
2b5e7bc2 1037 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
7918f24d 1038
f9380377
KW
1039 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1040
1041 if (errors) {
1042 *errors = 0;
1043 }
1044 else {
1045 errors = &discard_errors;
1046 }
a0dbb045 1047
eb83ed87
KW
1048 /* The order of malformation tests here is important. We should consume as
1049 * few bytes as possible in order to not skip any valid character. This is
1050 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1051 * http://unicode.org/reports/tr36 for more discussion as to why. For
1052 * example, once we've done a UTF8SKIP, we can tell the expected number of
1053 * bytes, and could fail right off the bat if the input parameters indicate
1054 * that there are too few available. But it could be that just that first
1055 * byte is garbled, and the intended character occupies fewer bytes. If we
1056 * blindly assumed that the first byte is correct, and skipped based on
1057 * that number, we could skip over a valid input character. So instead, we
1058 * always examine the sequence byte-by-byte.
1059 *
1060 * We also should not consume too few bytes, otherwise someone could inject
1061 * things. For example, an input could be deliberately designed to
1062 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 1063 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
1064 * which is actually part of of the overflowing sequence), that could look
1065 * legitimate to the caller, which could discard the initial partial
2b5e7bc2
KW
1066 * sequence and process the rest, inappropriately.
1067 *
1068 * Some possible input sequences are malformed in more than one way. This
1069 * function goes to lengths to try to find all of them. This is necessary
1070 * for correctness, as the inputs may allow one malformation but not
1071 * another, and if we abandon searching for others after finding the
1072 * allowed one, we could allow in something that shouldn't have been.
1073 */
eb83ed87 1074
b5b9af04 1075 if (UNLIKELY(curlen == 0)) {
2b5e7bc2
KW
1076 possible_problems |= UTF8_GOT_EMPTY;
1077 curlen = 0;
1078 uv = 0; /* XXX It could be argued that this should be
1079 UNICODE_REPLACEMENT? */
1080 goto ready_to_handle_errors;
0c443dc2
JH
1081 }
1082
eb83ed87
KW
1083 expectlen = UTF8SKIP(s);
1084
1085 /* A well-formed UTF-8 character, as the vast majority of calls to this
1086 * function will be for, has this expected length. For efficiency, set
1087 * things up here to return it. It will be overriden only in those rare
1088 * cases where a malformation is found */
1089 if (retlen) {
1090 *retlen = expectlen;
1091 }
1092
1093 /* An invariant is trivially well-formed */
1d72bdf6 1094 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 1095 return uv;
a0ed51b3 1096 }
67e989fb 1097
eb83ed87 1098 /* A continuation character can't start a valid sequence */
b5b9af04 1099 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
2b5e7bc2
KW
1100 possible_problems |= UTF8_GOT_CONTINUATION;
1101 curlen = 1;
1102 uv = UNICODE_REPLACEMENT;
1103 goto ready_to_handle_errors;
ba210ebe 1104 }
9041c2e3 1105
dcd27b3c
KW
1106 /* Here is not a continuation byte, nor an invariant. The only thing left
1107 * is a start byte (possibly for an overlong) */
1108
534752c1
KW
1109 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1110 * that indicate the number of bytes in the character's whole UTF-8
1111 * sequence, leaving just the bits that are part of the value. */
1112 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 1113
e308b348
KW
1114 /* Setup the loop end point, making sure to not look past the end of the
1115 * input string, and flag it as too short if the size isn't big enough. */
1116 send = (U8*) s0;
1117 if (UNLIKELY(curlen < expectlen)) {
1118 possible_problems |= UTF8_GOT_SHORT;
1119 avail_len = curlen;
1120 send += curlen;
1121 }
1122 else {
1123 send += expectlen;
1124 }
1125 adjusted_send = send;
1126
eb83ed87 1127 /* Now, loop through the remaining bytes in the character's sequence,
e308b348 1128 * accumulating each into the working value as we go. */
eb83ed87 1129 for (s = s0 + 1; s < send; s++) {
b5b9af04 1130 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
8850bf83 1131 uv = UTF8_ACCUMULATE(uv, *s);
2b5e7bc2
KW
1132 continue;
1133 }
1134
1135 /* Here, found a non-continuation before processing all expected bytes.
1136 * This byte indicates the beginning of a new character, so quit, even
1137 * if allowing this malformation. */
2b5e7bc2 1138 possible_problems |= UTF8_GOT_NON_CONTINUATION;
e308b348 1139 break;
eb83ed87
KW
1140 } /* End of loop through the character's bytes */
1141
1142 /* Save how many bytes were actually in the character */
1143 curlen = s - s0;
1144
e308b348
KW
1145 /* A convenience macro that matches either of the too-short conditions. */
1146# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
2f8f112e 1147
e308b348 1148 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
2b5e7bc2
KW
1149 uv_so_far = uv;
1150 uv = UNICODE_REPLACEMENT;
eb83ed87
KW
1151 }
1152
2b5e7bc2
KW
1153 /* Note that there are two types of too-short malformation. One is when
1154 * there is actual wrong data before the normal termination of the
1155 * sequence. The other is that the sequence wasn't complete before the end
1156 * of the data we are allowed to look at, based on the input 'curlen'.
1157 * This means that we were passed data for a partial character, but it is
1158 * valid as far as we saw. The other is definitely invalid. This
1159 * distinction could be important to a caller, so the two types are kept
1160 * separate. */
1161
1162 /* Check for overflow */
1163 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1164 possible_problems |= UTF8_GOT_OVERFLOW;
1165 uv = UNICODE_REPLACEMENT;
eb83ed87 1166 }
eb83ed87 1167
2b5e7bc2
KW
1168 /* Check for overlong. If no problems so far, 'uv' is the correct code
1169 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1170 * we must look at the UTF-8 byte sequence itself to see if it is for an
1171 * overlong */
1172 if ( ( LIKELY(! possible_problems)
1173 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1174 || ( UNLIKELY( possible_problems)
1175 && ( UNLIKELY(! UTF8_IS_START(*s0))
1176 || ( curlen > 1
1177 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1178 send - s0))))))
2f8f112e 1179 {
2b5e7bc2
KW
1180 possible_problems |= UTF8_GOT_LONG;
1181
2b5e7bc2
KW
1182 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1183 UV min_uv = uv_so_far;
1184 STRLEN i;
1185
1186 /* Here, the input is both overlong and is missing some trailing
1187 * bytes. There is no single code point it could be for, but there
1188 * may be enough information present to determine if what we have
1189 * so far is for an unallowed code point, such as for a surrogate.
1190 * The code below has the intelligence to determine this, but just
1191 * for non-overlong UTF-8 sequences. What we do here is calculate
1192 * the smallest code point the input could represent if there were
1193 * no too short malformation. Then we compute and save the UTF-8
1194 * for that, which is what the code below looks at instead of the
1195 * raw input. It turns out that the smallest such code point is
1196 * all we need. */
1197 for (i = curlen; i < expectlen; i++) {
1198 min_uv = UTF8_ACCUMULATE(min_uv,
1199 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1200 }
1201
1202 Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
1203 SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
1204 to free it ourselves if
1205 warnings are made fatal */
1206 adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1207 }
eb83ed87
KW
1208 }
1209
2b5e7bc2
KW
1210 /* Now check that the input isn't for a problematic code point not allowed
1211 * by the input parameters. */
1212 /* isn't problematic if < this */
1213 if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
1214 || ( UNLIKELY(possible_problems)
1215 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
760c7c2f
KW
1216 && ((flags & ( UTF8_DISALLOW_NONCHAR
1217 |UTF8_DISALLOW_SURROGATE
1218 |UTF8_DISALLOW_SUPER
1219 |UTF8_DISALLOW_ABOVE_31_BIT
1220 |UTF8_WARN_NONCHAR
1221 |UTF8_WARN_SURROGATE
1222 |UTF8_WARN_SUPER
1223 |UTF8_WARN_ABOVE_31_BIT))
2b5e7bc2
KW
1224 /* In case of a malformation, 'uv' is not valid, and has
1225 * been changed to something in the Unicode range.
1226 * Currently we don't output a deprecation message if there
1227 * is already a malformation, so we don't have to special
1228 * case the test immediately below */
760c7c2f
KW
1229 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1230 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1231 {
2b5e7bc2
KW
1232 /* If there were no malformations, or the only malformation is an
1233 * overlong, 'uv' is valid */
1234 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1235 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1236 possible_problems |= UTF8_GOT_SURROGATE;
1237 }
1238 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1239 possible_problems |= UTF8_GOT_SUPER;
1240 }
1241 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1242 possible_problems |= UTF8_GOT_NONCHAR;
1243 }
1244 }
1245 else { /* Otherwise, need to look at the source UTF-8, possibly
1246 adjusted to be non-overlong */
1247
1248 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1249 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
ea5ced44 1250 {
2b5e7bc2
KW
1251 possible_problems |= UTF8_GOT_SUPER;
1252 }
1253 else if (curlen > 1) {
1254 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1255 NATIVE_UTF8_TO_I8(*adjusted_s0),
1256 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
ea5ced44 1257 {
2b5e7bc2 1258 possible_problems |= UTF8_GOT_SUPER;
ea5ced44 1259 }
2b5e7bc2
KW
1260 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1261 NATIVE_UTF8_TO_I8(*adjusted_s0),
1262 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1263 {
1264 possible_problems |= UTF8_GOT_SURROGATE;
ea5ced44
KW
1265 }
1266 }
c0236afe 1267
2b5e7bc2
KW
1268 /* We need a complete well-formed UTF-8 character to discern
1269 * non-characters, so can't look for them here */
1270 }
1271 }
949cf498 1272
2b5e7bc2
KW
1273 ready_to_handle_errors:
1274
1275 /* At this point:
1276 * curlen contains the number of bytes in the sequence that
1277 * this call should advance the input by.
e308b348
KW
1278 * avail_len gives the available number of bytes passed in, but
1279 * only if this is less than the expected number of
1280 * bytes, based on the code point's start byte.
2b5e7bc2
KW
1281 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1282 * is set in it for each potential problem found.
1283 * uv contains the code point the input sequence
1284 * represents; or if there is a problem that prevents
1285 * a well-defined value from being computed, it is
1286 * some subsitute value, typically the REPLACEMENT
1287 * CHARACTER.
1288 * s0 points to the first byte of the character
1289 * send points to just after where that (potentially
1290 * partial) character ends
1291 * adjusted_s0 normally is the same as s0, but in case of an
1292 * overlong for which the UTF-8 matters below, it is
1293 * the first byte of the shortest form representation
1294 * of the input.
1295 * adjusted_send normally is the same as 'send', but if adjusted_s0
1296 * is set to something other than s0, this points one
1297 * beyond its end
1298 */
eb83ed87 1299
2b5e7bc2
KW
1300 if (UNLIKELY(possible_problems)) {
1301 bool disallowed = FALSE;
1302 const U32 orig_problems = possible_problems;
1303
1304 while (possible_problems) { /* Handle each possible problem */
1305 UV pack_warn = 0;
1306 char * message = NULL;
1307
1308 /* Each 'if' clause handles one problem. They are ordered so that
1309 * the first ones' messages will be displayed before the later
1310 * ones; this is kinda in decreasing severity order */
1311 if (possible_problems & UTF8_GOT_OVERFLOW) {
1312
1313 /* Overflow means also got a super and above 31 bits, but we
1314 * handle all three cases here */
1315 possible_problems
1316 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
f9380377
KW
1317 *errors |= UTF8_GOT_OVERFLOW;
1318
1319 /* But the API says we flag all errors found */
1320 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1321 *errors |= UTF8_GOT_SUPER;
1322 }
1323 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1324 *errors |= UTF8_GOT_ABOVE_31_BIT;
1325 }
2b5e7bc2
KW
1326
1327 disallowed = TRUE;
1328
1329 /* The warnings code explicitly says it doesn't handle the case
1330 * of packWARN2 and two categories which have parent-child
1331 * relationship. Even if it works now to raise the warning if
1332 * either is enabled, it wouldn't necessarily do so in the
1333 * future. We output (only) the most dire warning*/
1334 if (! (flags & UTF8_CHECK_ONLY)) {
1335 if (ckWARN_d(WARN_UTF8)) {
1336 pack_warn = packWARN(WARN_UTF8);
1337 }
1338 else if (ckWARN_d(WARN_NON_UNICODE)) {
1339 pack_warn = packWARN(WARN_NON_UNICODE);
1340 }
1341 if (pack_warn) {
1342 message = Perl_form(aTHX_ "%s: %s (overflows)",
1343 malformed_text,
1344 _byte_dump_string(s0, send - s0));
1345 }
1346 }
1347 }
1348 else if (possible_problems & UTF8_GOT_EMPTY) {
1349 possible_problems &= ~UTF8_GOT_EMPTY;
f9380377 1350 *errors |= UTF8_GOT_EMPTY;
2b5e7bc2
KW
1351
1352 if (! (flags & UTF8_ALLOW_EMPTY)) {
1353 disallowed = TRUE;
1354 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1355 pack_warn = packWARN(WARN_UTF8);
1356 message = Perl_form(aTHX_ "%s (empty string)",
1357 malformed_text);
1358 }
1359 }
1360 }
1361 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1362 possible_problems &= ~UTF8_GOT_CONTINUATION;
f9380377 1363 *errors |= UTF8_GOT_CONTINUATION;
2b5e7bc2
KW
1364
1365 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1366 disallowed = TRUE;
1367 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1368 pack_warn = packWARN(WARN_UTF8);
1369 message = Perl_form(aTHX_
1370 "%s: %s (unexpected continuation byte 0x%02x,"
1371 " with no preceding start byte)",
1372 malformed_text,
1373 _byte_dump_string(s0, 1), *s0);
1374 }
1375 }
1376 }
2b5e7bc2
KW
1377 else if (possible_problems & UTF8_GOT_SHORT) {
1378 possible_problems &= ~UTF8_GOT_SHORT;
f9380377 1379 *errors |= UTF8_GOT_SHORT;
2b5e7bc2
KW
1380
1381 if (! (flags & UTF8_ALLOW_SHORT)) {
1382 disallowed = TRUE;
1383 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1384 pack_warn = packWARN(WARN_UTF8);
1385 message = Perl_form(aTHX_
9a6c9c81 1386 "%s: %s (too short; %d byte%s available, need %d)",
2b5e7bc2
KW
1387 malformed_text,
1388 _byte_dump_string(s0, send - s0),
e308b348
KW
1389 (int)avail_len,
1390 avail_len == 1 ? "" : "s",
2b5e7bc2
KW
1391 (int)expectlen);
1392 }
1393 }
ba210ebe 1394
2b5e7bc2 1395 }
e308b348
KW
1396 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1397 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1398 *errors |= UTF8_GOT_NON_CONTINUATION;
1399
1400 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1401 disallowed = TRUE;
1402 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1403 pack_warn = packWARN(WARN_UTF8);
1404 message = Perl_form(aTHX_ "%s",
1405 unexpected_non_continuation_text(s0,
1406 send - s0,
1407 s - s0,
1408 (int) expectlen));
1409 }
1410 }
1411 }
2b5e7bc2
KW
1412 else if (possible_problems & UTF8_GOT_LONG) {
1413 possible_problems &= ~UTF8_GOT_LONG;
f9380377 1414 *errors |= UTF8_GOT_LONG;
2b5e7bc2
KW
1415
1416 if (! (flags & UTF8_ALLOW_LONG)) {
1417 disallowed = TRUE;
1418
1419 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1420 pack_warn = packWARN(WARN_UTF8);
1421
1422 /* These error types cause 'uv' to be something that
1423 * isn't what was intended, so can't use it in the
1424 * message. The other error types either can't
1425 * generate an overlong, or else the 'uv' is valid */
1426 if (orig_problems &
1427 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1428 {
1429 message = Perl_form(aTHX_
1430 "%s: %s (any UTF-8 sequence that starts"
1431 " with \"%s\" is overlong which can and"
1432 " should be represented with a"
1433 " different, shorter sequence)",
1434 malformed_text,
1435 _byte_dump_string(s0, send - s0),
1436 _byte_dump_string(s0, curlen));
1437 }
1438 else {
1439 U8 tmpbuf[UTF8_MAXBYTES+1];
1440 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1441 uv, 0);
1442 message = Perl_form(aTHX_
1443 "%s: %s (overlong; instead use %s to represent"
147e3846 1444 " U+%0*" UVXf ")",
2b5e7bc2
KW
1445 malformed_text,
1446 _byte_dump_string(s0, send - s0),
1447 _byte_dump_string(tmpbuf, e - tmpbuf),
1448 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1449 small code points */
1450 uv);
1451 }
1452 }
1453 }
1454 }
1455 else if (possible_problems & UTF8_GOT_SURROGATE) {
1456 possible_problems &= ~UTF8_GOT_SURROGATE;
1457
f9380377
KW
1458 if (flags & UTF8_WARN_SURROGATE) {
1459 *errors |= UTF8_GOT_SURROGATE;
1460
1461 if ( ! (flags & UTF8_CHECK_ONLY)
1462 && ckWARN_d(WARN_SURROGATE))
1463 {
2b5e7bc2
KW
1464 pack_warn = packWARN(WARN_SURROGATE);
1465
1466 /* These are the only errors that can occur with a
1467 * surrogate when the 'uv' isn't valid */
1468 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1469 message = Perl_form(aTHX_
1470 "UTF-16 surrogate (any UTF-8 sequence that"
1471 " starts with \"%s\" is for a surrogate)",
1472 _byte_dump_string(s0, curlen));
1473 }
1474 else {
1475 message = Perl_form(aTHX_
147e3846 1476 "UTF-16 surrogate U+%04" UVXf, uv);
2b5e7bc2 1477 }
f9380377 1478 }
2b5e7bc2 1479 }
ba210ebe 1480
2b5e7bc2
KW
1481 if (flags & UTF8_DISALLOW_SURROGATE) {
1482 disallowed = TRUE;
f9380377 1483 *errors |= UTF8_GOT_SURROGATE;
2b5e7bc2
KW
1484 }
1485 }
1486 else if (possible_problems & UTF8_GOT_SUPER) {
1487 possible_problems &= ~UTF8_GOT_SUPER;
949cf498 1488
f9380377
KW
1489 if (flags & UTF8_WARN_SUPER) {
1490 *errors |= UTF8_GOT_SUPER;
1491
1492 if ( ! (flags & UTF8_CHECK_ONLY)
1493 && ckWARN_d(WARN_NON_UNICODE))
1494 {
2b5e7bc2
KW
1495 pack_warn = packWARN(WARN_NON_UNICODE);
1496
1497 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1498 message = Perl_form(aTHX_
1499 "Any UTF-8 sequence that starts with"
1500 " \"%s\" is for a non-Unicode code point,"
1501 " may not be portable",
1502 _byte_dump_string(s0, curlen));
1503 }
1504 else {
1505 message = Perl_form(aTHX_
147e3846 1506 "Code point 0x%04" UVXf " is not"
2b5e7bc2
KW
1507 " Unicode, may not be portable",
1508 uv);
1509 }
f9380377 1510 }
2b5e7bc2 1511 }
ba210ebe 1512
2b5e7bc2
KW
1513 /* The maximum code point ever specified by a standard was
1514 * 2**31 - 1. Anything larger than that is a Perl extension
1515 * that very well may not be understood by other applications
1516 * (including earlier perl versions on EBCDIC platforms). We
1517 * test for these after the regular SUPER ones, and before
1518 * possibly bailing out, so that the slightly more dire warning
1519 * will override the regular one. */
1520 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1521 |UTF8_WARN_SUPER
1522 |UTF8_DISALLOW_ABOVE_31_BIT))
1523 && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
1524 && UNLIKELY(is_utf8_cp_above_31_bits(
1525 adjusted_s0,
1526 adjusted_send)))
1527 || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
1528 && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
1529 {
1530 if ( ! (flags & UTF8_CHECK_ONLY)
1531 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
1532 && ckWARN_d(WARN_UTF8))
1533 {
1534 pack_warn = packWARN(WARN_UTF8);
1535
1536 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1537 message = Perl_form(aTHX_
1538 "Any UTF-8 sequence that starts with"
1539 " \"%s\" is for a non-Unicode code"
1540 " point, and is not portable",
1541 _byte_dump_string(s0, curlen));
1542 }
1543 else {
1544 message = Perl_form(aTHX_
147e3846 1545 "Code point 0x%" UVXf " is not Unicode,"
2b5e7bc2
KW
1546 " and not portable",
1547 uv);
1548 }
1549 }
1550
f9380377
KW
1551 if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
1552 *errors |= UTF8_GOT_ABOVE_31_BIT;
1553
1554 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
1555 disallowed = TRUE;
1556 }
2b5e7bc2
KW
1557 }
1558 }
eb83ed87 1559
2b5e7bc2 1560 if (flags & UTF8_DISALLOW_SUPER) {
f9380377 1561 *errors |= UTF8_GOT_SUPER;
2b5e7bc2
KW
1562 disallowed = TRUE;
1563 }
eb83ed87 1564
2b5e7bc2
KW
1565 /* The deprecated warning overrides any non-deprecated one. If
1566 * there are other problems, a deprecation message is not
1567 * really helpful, so don't bother to raise it in that case.
1568 * This also keeps the code from having to handle the case
1569 * where 'uv' is not valid. */
1570 if ( ! (orig_problems
1571 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1572 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1573 && ckWARN_d(WARN_DEPRECATED))
1574 {
1575 message = Perl_form(aTHX_ cp_above_legal_max,
1576 uv, MAX_NON_DEPRECATED_CP);
1577 pack_warn = packWARN(WARN_DEPRECATED);
1578 }
1579 }
1580 else if (possible_problems & UTF8_GOT_NONCHAR) {
1581 possible_problems &= ~UTF8_GOT_NONCHAR;
ba210ebe 1582
f9380377
KW
1583 if (flags & UTF8_WARN_NONCHAR) {
1584 *errors |= UTF8_GOT_NONCHAR;
1585
1586 if ( ! (flags & UTF8_CHECK_ONLY)
1587 && ckWARN_d(WARN_NONCHAR))
1588 {
2b5e7bc2
KW
1589 /* The code above should have guaranteed that we don't
1590 * get here with errors other than overlong */
1591 assert (! (orig_problems
1592 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1593
1594 pack_warn = packWARN(WARN_NONCHAR);
1595 message = Perl_form(aTHX_ "Unicode non-character"
147e3846 1596 " U+%04" UVXf " is not recommended"
2b5e7bc2 1597 " for open interchange", uv);
f9380377 1598 }
2b5e7bc2 1599 }
5b311467 1600
2b5e7bc2
KW
1601 if (flags & UTF8_DISALLOW_NONCHAR) {
1602 disallowed = TRUE;
f9380377 1603 *errors |= UTF8_GOT_NONCHAR;
2b5e7bc2
KW
1604 }
1605 } /* End of looking through the possible flags */
1606
1607 /* Display the message (if any) for the problem being handled in
1608 * this iteration of the loop */
1609 if (message) {
1610 if (PL_op)
1611 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1612 OP_DESC(PL_op));
1613 else
1614 Perl_warner(aTHX_ pack_warn, "%s", message);
1615 }
1616 } /* End of 'while (possible_problems) {' */
a0dbb045 1617
2b5e7bc2
KW
1618 /* Since there was a possible problem, the returned length may need to
1619 * be changed from the one stored at the beginning of this function.
1620 * Instead of trying to figure out if that's needed, just do it. */
1621 if (retlen) {
1622 *retlen = curlen;
1623 }
a0dbb045 1624
2b5e7bc2
KW
1625 if (disallowed) {
1626 if (flags & UTF8_CHECK_ONLY && retlen) {
1627 *retlen = ((STRLEN) -1);
1628 }
1629 return 0;
1630 }
eb83ed87 1631 }
ba210ebe 1632
2b5e7bc2 1633 return UNI_TO_NATIVE(uv);
a0ed51b3
LW
1634}
1635
8e84507e 1636/*
ec5f19d0
KW
1637=for apidoc utf8_to_uvchr_buf
1638
1639Returns the native code point of the first character in the string C<s> which
1640is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1641C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1642
524080c4
KW
1643If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1644enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1645C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1646(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1647C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1648the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1649See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1650returned.
ec5f19d0 1651
760c7c2f
KW
1652Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1653unless those are turned off.
1654
ec5f19d0 1655=cut
52be2536
KW
1656
1657Also implemented as a macro in utf8.h
1658
ec5f19d0
KW
1659*/
1660
1661
1662UV
1663Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1664{
ec5f19d0
KW
1665 assert(s < send);
1666
1667 return utf8n_to_uvchr(s, send - s, retlen,
1668 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1669}
1670
52be2536
KW
1671/* This is marked as deprecated
1672 *
ec5f19d0
KW
1673=for apidoc utf8_to_uvuni_buf
1674
de69f3af
KW
1675Only in very rare circumstances should code need to be dealing in Unicode
1676(as opposed to native) code points. In those few cases, use
1677C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1678
1679Returns the Unicode (not-native) code point of the first character in the
1680string C<s> which
ec5f19d0
KW
1681is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1682C<retlen> will be set to the length, in bytes, of that character.
1683
524080c4
KW
1684If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1685enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1686NULL) to -1. If those warnings are off, the computed value if well-defined (or
1687the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1688is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1689next possible position in C<s> that could begin a non-malformed character.
de69f3af 1690See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1691
760c7c2f
KW
1692Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1693unless those are turned off.
1694
ec5f19d0
KW
1695=cut
1696*/
1697
1698UV
1699Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1700{
1701 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1702
1703 assert(send > s);
1704
5962d97e
KW
1705 /* Call the low level routine, asking for checks */
1706 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1707}
1708
b76347f2 1709/*
87cea99e 1710=for apidoc utf8_length
b76347f2
JH
1711
1712Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1713Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1714up past C<e>, croaks.
b76347f2
JH
1715
1716=cut
1717*/
1718
1719STRLEN
35a4481c 1720Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1721{
1722 STRLEN len = 0;
1723
7918f24d
NC
1724 PERL_ARGS_ASSERT_UTF8_LENGTH;
1725
8850bf83
JH
1726 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1727 * the bitops (especially ~) can create illegal UTF-8.
1728 * In other words: in Perl UTF-8 is not just for Unicode. */
1729
a3b680e6
AL
1730 if (e < s)
1731 goto warn_and_return;
b76347f2 1732 while (s < e) {
4cbf4130 1733 s += UTF8SKIP(s);
8e91ec7f
AV
1734 len++;
1735 }
1736
1737 if (e != s) {
1738 len--;
1739 warn_and_return:
9b387841
NC
1740 if (PL_op)
1741 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1742 "%s in %s", unees, OP_DESC(PL_op));
1743 else
61a12c31 1744 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1745 }
1746
1747 return len;
1748}
1749
b06226ff 1750/*
fed3ba5d
NC
1751=for apidoc bytes_cmp_utf8
1752
a1433954 1753Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1754sequence of characters (stored as UTF-8)
1755in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1756equal, -1 or -2 if the first string is less than the second string, +1 or +2
1757if the first string is greater than the second string.
1758
1759-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1760longer string. -2 or +2 is returned if
1761there was a difference between characters
fed3ba5d
NC
1762within the strings.
1763
1764=cut
1765*/
1766
1767int
1768Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1769{
1770 const U8 *const bend = b + blen;
1771 const U8 *const uend = u + ulen;
1772
1773 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1774
1775 while (b < bend && u < uend) {
1776 U8 c = *u++;
1777 if (!UTF8_IS_INVARIANT(c)) {
1778 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1779 if (u < uend) {
1780 U8 c1 = *u++;
1781 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1782 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1783 } else {
2b5e7bc2 1784 /* diag_listed_as: Malformed UTF-8 character%s */
fed3ba5d 1785 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1786 "%s %s%s",
7cf8d05d 1787 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1788 PL_op ? " in " : "",
1789 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1790 return -2;
1791 }
1792 } else {
1793 if (PL_op)
1794 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1795 "%s in %s", unees, OP_DESC(PL_op));
1796 else
61a12c31 1797 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1798 return -2; /* Really want to return undef :-) */
1799 }
1800 } else {
1801 return -2;
1802 }
1803 }
1804 if (*b != c) {
1805 return *b < c ? -2 : +2;
1806 }
1807 ++b;
1808 }
1809
1810 if (b == bend && u == uend)
1811 return 0;
1812
1813 return b < bend ? +1 : -1;
1814}
1815
1816/*
87cea99e 1817=for apidoc utf8_to_bytes
6940069f 1818
2bbc8d55 1819Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1820Unlike L</bytes_to_utf8>, this over-writes the original string, and
1821updates C<len> to contain the new length.
67e989fb 1822Returns zero on failure, setting C<len> to -1.
6940069f 1823
a1433954 1824If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1825
6940069f
GS
1826=cut
1827*/
1828
1829U8 *
37607a96 1830Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1831{
d4c19fe8
AL
1832 U8 * const save = s;
1833 U8 * const send = s + *len;
6940069f 1834 U8 *d;
246fae53 1835
7918f24d 1836 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1837 PERL_UNUSED_CONTEXT;
7918f24d 1838
1e54db1a 1839 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1840 while (s < send) {
d59937ca
KW
1841 if (! UTF8_IS_INVARIANT(*s)) {
1842 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1843 *len = ((STRLEN) -1);
1844 return 0;
1845 }
1846 s++;
dcad2880 1847 }
d59937ca 1848 s++;
246fae53 1849 }
dcad2880
JH
1850
1851 d = s = save;
6940069f 1852 while (s < send) {
80e0b38f
KW
1853 U8 c = *s++;
1854 if (! UTF8_IS_INVARIANT(c)) {
1855 /* Then it is two-byte encoded */
a62b247b 1856 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1857 s++;
1858 }
1859 *d++ = c;
6940069f
GS
1860 }
1861 *d = '\0';
246fae53 1862 *len = d - save;
6940069f
GS
1863 return save;
1864}
1865
1866/*
87cea99e 1867=for apidoc bytes_from_utf8
f9a63242 1868
2bbc8d55 1869Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1870Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1871the newly-created string, and updates C<len> to contain the new
1872length. Returns the original string if no conversion occurs, C<len>
72d33970 1873is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 18740 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1875in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1876
37607a96
PK
1877=cut
1878*/
f9a63242
JH
1879
1880U8 *
e1ec3a88 1881Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1882{
f9a63242 1883 U8 *d;
e1ec3a88
AL
1884 const U8 *start = s;
1885 const U8 *send;
f9a63242
JH
1886 I32 count = 0;
1887
7918f24d 1888 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1889 PERL_UNUSED_CONTEXT;
f9a63242 1890 if (!*is_utf8)
73d840c0 1891 return (U8 *)start;
f9a63242 1892
1e54db1a 1893 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1894 for (send = s + *len; s < send;) {
d59937ca
KW
1895 if (! UTF8_IS_INVARIANT(*s)) {
1896 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1897 return (U8 *)start;
d59937ca
KW
1898 }
1899 count++;
1900 s++;
db42d148 1901 }
d59937ca 1902 s++;
f9a63242
JH
1903 }
1904
35da51f7 1905 *is_utf8 = FALSE;
f9a63242 1906
212542aa 1907 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1908 s = start; start = d;
f9a63242
JH
1909 while (s < send) {
1910 U8 c = *s++;
1a91c45d 1911 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1912 /* Then it is two-byte encoded */
a62b247b 1913 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1914 s++;
c4d5f83a
NIS
1915 }
1916 *d++ = c;
f9a63242
JH
1917 }
1918 *d = '\0';
1919 *len = d - start;
73d840c0 1920 return (U8 *)start;
f9a63242
JH
1921}
1922
1923/*
87cea99e 1924=for apidoc bytes_to_utf8
6940069f 1925
ff97e5cf
KW
1926Converts a string C<s> of length C<len> bytes from the native encoding into
1927UTF-8.
6662521e 1928Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1929reflect the new length in bytes.
6940069f 1930
75200dff 1931A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1932
1933If you want to convert to UTF-8 from encodings other than
1934the native (Latin1 or EBCDIC),
a1433954 1935see L</sv_recode_to_utf8>().
c9ada85f 1936
497711e7 1937=cut
6940069f
GS
1938*/
1939
c682ebef
FC
1940/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1941 likewise need duplication. */
1942
6940069f 1943U8*
35a4481c 1944Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1945{
35a4481c 1946 const U8 * const send = s + (*len);
6940069f
GS
1947 U8 *d;
1948 U8 *dst;
7918f24d
NC
1949
1950 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1951 PERL_UNUSED_CONTEXT;
6940069f 1952
212542aa 1953 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1954 dst = d;
1955
1956 while (s < send) {
55d09dc8
KW
1957 append_utf8_from_native_byte(*s, &d);
1958 s++;
6940069f
GS
1959 }
1960 *d = '\0';
6662521e 1961 *len = d-dst;
6940069f
GS
1962 return dst;
1963}
1964
a0ed51b3 1965/*
dea0fc0b 1966 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1967 *
1968 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1969 * We optimize for native, for obvious reasons. */
1970
1971U8*
dea0fc0b 1972Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1973{
dea0fc0b
JH
1974 U8* pend;
1975 U8* dstart = d;
1976
7918f24d
NC
1977 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1978
dea0fc0b 1979 if (bytelen & 1)
147e3846 1980 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf, (UV)bytelen);
dea0fc0b
JH
1981
1982 pend = p + bytelen;
1983
a0ed51b3 1984 while (p < pend) {
dea0fc0b
JH
1985 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1986 p += 2;
2d1545e5 1987 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1988 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1989 continue;
1990 }
56d37426
KW
1991 if (uv <= MAX_UTF8_TWO_BYTE) {
1992 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1993 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1994 continue;
1995 }
46956fad
KW
1996#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1997#define LAST_HIGH_SURROGATE 0xDBFF
1998#define FIRST_LOW_SURROGATE 0xDC00
1999#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
2000
2001 /* This assumes that most uses will be in the first Unicode plane, not
2002 * needing surrogates */
2003 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2004 && uv <= UNICODE_SURROGATE_LAST))
2005 {
2006 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2007 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2008 }
2009 else {
01ea242b 2010 UV low = (p[0] << 8) + p[1];
e23c50db
KW
2011 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2012 || UNLIKELY(low > LAST_LOW_SURROGATE))
2013 {
01ea242b 2014 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
2015 }
2016 p += 2;
46956fad
KW
2017 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2018 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 2019 }
a0ed51b3 2020 }
56d37426
KW
2021#ifdef EBCDIC
2022 d = uvoffuni_to_utf8_flags(d, uv, 0);
2023#else
a0ed51b3 2024 if (uv < 0x10000) {
eb160463
GS
2025 *d++ = (U8)(( uv >> 12) | 0xe0);
2026 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2027 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2028 continue;
2029 }
2030 else {
eb160463
GS
2031 *d++ = (U8)(( uv >> 18) | 0xf0);
2032 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2033 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2034 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
2035 continue;
2036 }
56d37426 2037#endif
a0ed51b3 2038 }
dea0fc0b 2039 *newlen = d - dstart;
a0ed51b3
LW
2040 return d;
2041}
2042
2043/* Note: this one is slightly destructive of the source. */
2044
2045U8*
dea0fc0b 2046Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
2047{
2048 U8* s = (U8*)p;
d4c19fe8 2049 U8* const send = s + bytelen;
7918f24d
NC
2050
2051 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2052
e0ea5e2d 2053 if (bytelen & 1)
147e3846 2054 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
e0ea5e2d
NC
2055 (UV)bytelen);
2056
a0ed51b3 2057 while (s < send) {
d4c19fe8 2058 const U8 tmp = s[0];
a0ed51b3
LW
2059 s[0] = s[1];
2060 s[1] = tmp;
2061 s += 2;
2062 }
dea0fc0b 2063 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
2064}
2065
922e8cb4
KW
2066bool
2067Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2068{
2069 U8 tmpbuf[UTF8_MAXBYTES+1];
2070 uvchr_to_utf8(tmpbuf, c);
2071 return _is_utf8_FOO(classnum, tmpbuf);
2072}
2073
f9ae8fb6
JD
2074/* Internal function so we can deprecate the external one, and call
2075 this one from other deprecated functions in this file */
2076
f2645549
KW
2077bool
2078Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 2079{
f2645549 2080 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
2081
2082 if (*p == '_')
2083 return TRUE;
f25ce844 2084 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
2085}
2086
5092f92a 2087bool
eba68aa0
KW
2088Perl__is_uni_perl_idcont(pTHX_ UV c)
2089{
2090 U8 tmpbuf[UTF8_MAXBYTES+1];
2091 uvchr_to_utf8(tmpbuf, c);
2092 return _is_utf8_perl_idcont(tmpbuf);
2093}
2094
2095bool
f91dcd13
KW
2096Perl__is_uni_perl_idstart(pTHX_ UV c)
2097{
2098 U8 tmpbuf[UTF8_MAXBYTES+1];
2099 uvchr_to_utf8(tmpbuf, c);
2100 return _is_utf8_perl_idstart(tmpbuf);
2101}
2102
3a4c58c9
KW
2103UV
2104Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
2105{
2106 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2107 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
2108 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2109 * either "SS" or "Ss". Which one to use is passed into the routine in
2110 * 'S_or_s' to avoid a test */
2111
2112 UV converted = toUPPER_LATIN1_MOD(c);
2113
2114 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2115
2116 assert(S_or_s == 'S' || S_or_s == 's');
2117
6f2d5cbc 2118 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 2119 characters in this range */
3a4c58c9
KW
2120 *p = (U8) converted;
2121 *lenp = 1;
2122 return converted;
2123 }
2124
2125 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2126 * which it maps to one of them, so as to only have to have one check for
2127 * it in the main case */
2128 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2129 switch (c) {
2130 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2131 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2132 break;
2133 case MICRO_SIGN:
2134 converted = GREEK_CAPITAL_LETTER_MU;
2135 break;
79e064b9
KW
2136#if UNICODE_MAJOR_VERSION > 2 \
2137 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2138 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
2139 case LATIN_SMALL_LETTER_SHARP_S:
2140 *(p)++ = 'S';
2141 *p = S_or_s;
2142 *lenp = 2;
2143 return 'S';
79e064b9 2144#endif
3a4c58c9
KW
2145 default:
2146 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 2147 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
2148 }
2149 }
2150
2151 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2152 *p = UTF8_TWO_BYTE_LO(converted);
2153 *lenp = 2;
2154
2155 return converted;
2156}
2157
50bda2c3
KW
2158/* Call the function to convert a UTF-8 encoded character to the specified case.
2159 * Note that there may be more than one character in the result.
2160 * INP is a pointer to the first byte of the input character
2161 * OUTP will be set to the first byte of the string of changed characters. It
2162 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2163 * LENP will be set to the length in bytes of the string of changed characters
2164 *
2165 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
2166#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2167#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2168#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 2169
b9992569
KW
2170/* This additionally has the input parameter 'specials', which if non-zero will
2171 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 2172 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 2173#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 2174
84afefe6
JH
2175UV
2176Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2177{
a1433954
KW
2178 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2179 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2180 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
2181 * the changed version may be longer than the original character.
2182 *
2183 * The ordinal of the first character of the changed version is returned
2184 * (but note, as explained above, that there may be more.) */
2185
7918f24d
NC
2186 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2187
3a4c58c9
KW
2188 if (c < 256) {
2189 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2190 }
2191
0ebc6274 2192 uvchr_to_utf8(p, c);
b9992569 2193 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
2194}
2195
84afefe6
JH
2196UV
2197Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2198{
7918f24d
NC
2199 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2200
3a4c58c9
KW
2201 if (c < 256) {
2202 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2203 }
2204
0ebc6274 2205 uvchr_to_utf8(p, c);
b9992569 2206 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
2207}
2208
afc16117 2209STATIC U8
eaf412bf 2210S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
afc16117
KW
2211{
2212 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 2213 * those, converting the result to UTF-8. Since the result is always just
a1433954 2214 * one character, we allow <p> to be NULL */
afc16117
KW
2215
2216 U8 converted = toLOWER_LATIN1(c);
2217
eaf412bf
KW
2218 PERL_UNUSED_ARG(dummy);
2219
afc16117 2220 if (p != NULL) {
6f2d5cbc 2221 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
2222 *p = converted;
2223 *lenp = 1;
2224 }
2225 else {
430c9760
KW
2226 /* Result is known to always be < 256, so can use the EIGHT_BIT
2227 * macros */
2228 *p = UTF8_EIGHT_BIT_HI(converted);
2229 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
2230 *lenp = 2;
2231 }
2232 }
2233 return converted;
2234}
2235
84afefe6
JH
2236UV
2237Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 2238{
7918f24d
NC
2239 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2240
afc16117 2241 if (c < 256) {
eaf412bf 2242 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
bca00c02
KW
2243 }
2244
afc16117 2245 uvchr_to_utf8(p, c);
b9992569 2246 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
2247}
2248
84afefe6 2249UV
51910141 2250Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 2251{
51910141 2252 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 2253 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 2254 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
2255 *
2256 * Not to be used for locale folds
51910141 2257 */
f673fad4 2258
a1dde8de
KW
2259 UV converted;
2260
2261 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 2262 PERL_UNUSED_CONTEXT;
a1dde8de 2263
1ca267a5
KW
2264 assert (! (flags & FOLD_FLAGS_LOCALE));
2265
659a7c2d 2266 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
2267 converted = GREEK_SMALL_LETTER_MU;
2268 }
9b63e895
KW
2269#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2270 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2271 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
2272 else if ( (flags & FOLD_FLAGS_FULL)
2273 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2274 {
1ca267a5
KW
2275 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2276 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2277 * under those circumstances. */
2278 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2279 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2280 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2281 p, *lenp, U8);
2282 return LATIN_SMALL_LETTER_LONG_S;
2283 }
2284 else {
4f489194
KW
2285 *(p)++ = 's';
2286 *p = 's';
2287 *lenp = 2;
2288 return 's';
1ca267a5 2289 }
a1dde8de 2290 }
9b63e895 2291#endif
a1dde8de
KW
2292 else { /* In this range the fold of all other characters is their lower
2293 case */
2294 converted = toLOWER_LATIN1(c);
2295 }
2296
6f2d5cbc 2297 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
2298 *p = (U8) converted;
2299 *lenp = 1;
2300 }
2301 else {
2302 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2303 *p = UTF8_TWO_BYTE_LO(converted);
2304 *lenp = 2;
2305 }
2306
2307 return converted;
2308}
2309
2310UV
31f05a37 2311Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 2312{
4b593389 2313
a0270393
KW
2314 /* Not currently externally documented, and subject to change
2315 * <flags> bits meanings:
2316 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
2317 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2318 * locale are to be used.
a0270393
KW
2319 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2320 */
4b593389 2321
36bb2ab6 2322 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 2323
780fcc9f
KW
2324 if (flags & FOLD_FLAGS_LOCALE) {
2325 /* Treat a UTF-8 locale as not being in locale at all */
2326 if (IN_UTF8_CTYPE_LOCALE) {
2327 flags &= ~FOLD_FLAGS_LOCALE;
2328 }
2329 else {
2330 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 2331 goto needs_full_generality;
780fcc9f 2332 }
31f05a37
KW
2333 }
2334
a1dde8de 2335 if (c < 256) {
e7b7ac46 2336 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 2337 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
2338 }
2339
2f306ab9 2340 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
2341 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2342 uvchr_to_utf8(p, c);
b9992569 2343 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
2344 }
2345 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
2346 the special flags. */
2347 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
2348
2349 needs_full_generality:
a0270393 2350 uvchr_to_utf8(utf8_c, c);
445bf929 2351 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 2352 }
84afefe6
JH
2353}
2354
26483009 2355PERL_STATIC_INLINE bool
5141f98e 2356S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 2357 const char *const swashname, SV* const invlist)
bde6a22d 2358{
ea317ccb
KW
2359 /* returns a boolean giving whether or not the UTF8-encoded character that
2360 * starts at <p> is in the swash indicated by <swashname>. <swash>
2361 * contains a pointer to where the swash indicated by <swashname>
2362 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
2363 * look at <*swash> and only generate a swash if it is not null. <invlist>
2364 * is NULL or an inversion list that defines the swash. If not null, it
2365 * saves time during initialization of the swash.
ea317ccb
KW
2366 *
2367 * Note that it is assumed that the buffer length of <p> is enough to
2368 * contain all the bytes that comprise the character. Thus, <*p> should
2369 * have been checked before this call for mal-formedness enough to assure
2370 * that. */
2371
7918f24d
NC
2372 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2373
492a624f 2374 /* The API should have included a length for the UTF-8 character in <p>,
28123549 2375 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
2376 * as far as there being enough bytes available in it to accommodate the
2377 * character without reading beyond the end, and pass that number on to the
2378 * validating routine */
6302f837 2379 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
2380 if (ckWARN_d(WARN_UTF8)) {
2381 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 2382 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
2383 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
2384 what the malformation is */
2385 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
2386 }
2387 }
2388 return FALSE;
2389 }
87367d5f
KW
2390 if (!*swash) {
2391 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
2392 *swash = _core_swash_init("utf8",
2393
2394 /* Only use the name if there is no inversion
2395 * list; otherwise will go out to disk */
2396 (invlist) ? "" : swashname,
2397
2398 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 2399 }
28123549 2400
bde6a22d
NC
2401 return swash_fetch(*swash, p, TRUE) != 0;
2402}
2403
2404bool
922e8cb4
KW
2405Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
2406{
922e8cb4
KW
2407 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2408
2409 assert(classnum < _FIRST_NON_SWASH_CC);
2410
f25ce844
KW
2411 return is_utf8_common(p,
2412 &PL_utf8_swash_ptrs[classnum],
2413 swash_property_names[classnum],
2414 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
2415}
2416
2417bool
f2645549 2418Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 2419{
f2645549 2420 SV* invlist = NULL;
7918f24d 2421
f2645549 2422 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2423
f2645549
KW
2424 if (! PL_utf8_perl_idstart) {
2425 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2426 }
60071a22 2427 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2428}
2429
2430bool
f2645549 2431Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2432{
f2645549 2433 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2434
2435 if (*p == '_')
2436 return TRUE;
f25ce844 2437 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2438}
2439
2440bool
eba68aa0
KW
2441Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2442{
b24b43f7 2443 SV* invlist = NULL;
eba68aa0
KW
2444
2445 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2446
b24b43f7
KW
2447 if (! PL_utf8_perl_idcont) {
2448 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2449 }
60071a22 2450 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2451}
2452
eba68aa0 2453bool
f2645549 2454Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2455{
f2645549 2456 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2457
f25ce844 2458 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2459}
2460
2461bool
f2645549 2462Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2463{
f2645549 2464 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2465
f25ce844 2466 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2467}
2468
2469bool
7dbf68d2
KW
2470Perl__is_utf8_mark(pTHX_ const U8 *p)
2471{
7dbf68d2
KW
2472 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2473
f25ce844 2474 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2475}
2476
6b5c0936 2477/*
87cea99e 2478=for apidoc to_utf8_case
6b5c0936 2479
9da1e7cb
KW
2480Instead use the appropriate one of L</toUPPER_utf8>,
2481L</toTITLE_utf8>,
2482L</toLOWER_utf8>,
2483or L</toFOLD_utf8>.
2484
6fae5207 2485C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2486the character that is being converted. This routine assumes that the character
2487at C<p> is well-formed.
6b5c0936 2488
6fae5207
KW
2489C<ustrp> is a pointer to the character buffer to put the
2490conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2491of the result.
2492
6fae5207 2493C<swashp> is a pointer to the swash to use.
6b5c0936 2494
a1433954 2495Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2496and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2497but not always, a multicharacter mapping), is tried first.
6b5c0936 2498
4a8240a3
KW
2499C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2500any special mappings; C<""> means to use the special mappings. Values other
2501than these two are treated as the name of the hash containing the special
2502mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2503
796b6530
KW
2504C<normal> is a string like C<"ToLower"> which means the swash
2505C<%utf8::ToLower>.
0134edef 2506
760c7c2f
KW
2507Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2508unless those are turned off.
2509
0134edef 2510=cut */
6b5c0936 2511
2104c8d9 2512UV
9a957fbc
AL
2513Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2514 SV **swashp, const char *normal, const char *special)
a0ed51b3 2515{
b9992569
KW
2516 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2517
2518 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2519}
2520
2521 /* change namve uv1 to 'from' */
6a4a25f4 2522STATIC UV
b9992569
KW
2523S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2524 SV **swashp, const char *normal, const char *special)
2525{
0134edef 2526 STRLEN len = 0;
7918f24d 2527
b9992569 2528 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2529
36eaa811
KW
2530 /* For code points that don't change case, we already know that the output
2531 * of this function is the unchanged input, so we can skip doing look-ups
2532 * for them. Unfortunately the case-changing code points are scattered
2533 * around. But there are some long consecutive ranges where there are no
2534 * case changing code points. By adding tests, we can eliminate the lookup
2535 * for all the ones in such ranges. This is currently done here only for
2536 * just a few cases where the scripts are in common use in modern commerce
2537 * (and scripts adjacent to those which can be included without additional
2538 * tests). */
2539
2540 if (uv1 >= 0x0590) {
2541 /* This keeps from needing further processing the code points most
2542 * likely to be used in the following non-cased scripts: Hebrew,
2543 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2544 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2545 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2546 if (uv1 < 0x10A0) {
2547 goto cases_to_self;
2548 }
2549
2550 /* The following largish code point ranges also don't have case
2551 * changes, but khw didn't think they warranted extra tests to speed
2552 * them up (which would slightly slow down everything else above them):
2553 * 1100..139F Hangul Jamo, Ethiopic
2554 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2555 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2556 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2557 * Combining Diacritical Marks Extended, Balinese,
2558 * Sundanese, Batak, Lepcha, Ol Chiki
2559 * 2000..206F General Punctuation
2560 */
2561
2562 if (uv1 >= 0x2D30) {
2563
2564 /* This keeps the from needing further processing the code points
2565 * most likely to be used in the following non-cased major scripts:
2566 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2567 *
2568 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2569 * event that Unicode eventually allocates the unused block as of
2570 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2571 * that the test suite will start having failures to alert you
2572 * should that happen) */
2573 if (uv1 < 0xA640) {
2574 goto cases_to_self;
2575 }
2576
2577 if (uv1 >= 0xAC00) {
2578 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2579 if (ckWARN_d(WARN_SURROGATE)) {
2580 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2581 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 2582 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04" UVXf, desc, uv1);
5af9bc97
KW
2583 }
2584 goto cases_to_self;
2585 }
36eaa811
KW
2586
2587 /* AC00..FAFF Catches Hangul syllables and private use, plus
2588 * some others */
2589 if (uv1 < 0xFB00) {
2590 goto cases_to_self;
2591
2592 }
2593
5af9bc97
KW
2594 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2595 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2596 && ckWARN_d(WARN_DEPRECATED))
2597 {
2598 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2599 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2600 }
2601 if (ckWARN_d(WARN_NON_UNICODE)) {
2602 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2603 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
147e3846 2604 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04" UVXf, desc, uv1);
5af9bc97
KW
2605 }
2606 goto cases_to_self;
2607 }
3bfc1e70
KW
2608#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2609 if (UNLIKELY(uv1
2610 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2611 {
2612
2613 /* As of this writing, this means we avoid swash creation
2614 * for anything beyond low Plane 1 */
2615 goto cases_to_self;
2616 }
2617#endif
36eaa811
KW
2618 }
2619 }
9ae3ac1a 2620
36eaa811
KW
2621 /* Note that non-characters are perfectly legal, so no warning should
2622 * be given. There are so few of them, that it isn't worth the extra
2623 * tests to avoid swash creation */
9ae3ac1a
KW
2624 }
2625
0134edef 2626 if (!*swashp) /* load on-demand */
5ab9d2ef 2627 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2628
a6f87d8c 2629 if (special) {
0134edef 2630 /* It might be "special" (sometimes, but not always,
2a37f04d 2631 * a multicharacter mapping) */
4a8240a3 2632 HV *hv = NULL;
b08cf34e
JH
2633 SV **svp;
2634
4a8240a3
KW
2635 /* If passed in the specials name, use that; otherwise use any
2636 * given in the swash */
2637 if (*special != '\0') {
2638 hv = get_hv(special, 0);
2639 }
2640 else {
2641 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2642 if (svp) {
2643 hv = MUTABLE_HV(SvRV(*svp));
2644 }
2645 }
2646
176fe009 2647 if (hv
5f560d8a 2648 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2649 && (*svp))
2650 {
cfd0369c 2651 const char *s;
47654450 2652
cfd0369c 2653 s = SvPV_const(*svp, len);
47654450 2654 if (len == 1)
f4cd282c 2655 /* EIGHTBIT */
c80e42f3 2656 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2657 else {
d2dcd0fb 2658 Copy(s, ustrp, len, U8);
29e98929 2659 }
983ffd37 2660 }
0134edef
JH
2661 }
2662
2663 if (!len && *swashp) {
4a4088c4 2664 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2665
0134edef
JH
2666 if (uv2) {
2667 /* It was "normal" (a single character mapping). */
f4cd282c 2668 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2669 }
2670 }
1feea2c7 2671
cbe07460
KW
2672 if (len) {
2673 if (lenp) {
2674 *lenp = len;
2675 }
2676 return valid_utf8_to_uvchr(ustrp, 0);
2677 }
2678
2679 /* Here, there was no mapping defined, which means that the code point maps
2680 * to itself. Return the inputs */
e24dfe9c 2681 cases_to_self:
bfdf22ec 2682 len = UTF8SKIP(p);
ca9fab46
KW
2683 if (p != ustrp) { /* Don't copy onto itself */
2684 Copy(p, ustrp, len, U8);
2685 }
0134edef 2686
2a37f04d
JH
2687 if (lenp)
2688 *lenp = len;
2689
f4cd282c 2690 return uv1;
cbe07460 2691
a0ed51b3
LW
2692}
2693
051a06d4 2694STATIC UV
357aadde 2695S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2696{
4a4088c4 2697 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2698 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2699 * result contains a character that crosses the 255/256 boundary, disallow
2700 * the change, and return the original code point. See L<perlfunc/lc> for
2701 * why;
051a06d4 2702 *
a1433954
KW
2703 * p points to the original string whose case was changed; assumed
2704 * by this routine to be well-formed
051a06d4
KW
2705 * result the code point of the first character in the changed-case string
2706 * ustrp points to the changed-case string (<result> represents its first char)
2707 * lenp points to the length of <ustrp> */
2708
2709 UV original; /* To store the first code point of <p> */
2710
2711 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2712
a4f12ed7 2713 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2714
2715 /* We know immediately if the first character in the string crosses the
2716 * boundary, so can skip */
2717 if (result > 255) {
2718
2719 /* Look at every character in the result; if any cross the
2720 * boundary, the whole thing is disallowed */
2721 U8* s = ustrp + UTF8SKIP(ustrp);
2722 U8* e = ustrp + *lenp;
2723 while (s < e) {
a4f12ed7 2724 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2725 goto bad_crossing;
2726 }
2727 s += UTF8SKIP(s);
2728 }
2729
613abc6d
KW
2730 /* Here, no characters crossed, result is ok as-is, but we warn. */
2731 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2732 return result;
2733 }
2734
7b52d656 2735 bad_crossing:
051a06d4
KW
2736
2737 /* Failed, have to return the original */
4b88fb76 2738 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2739
2740 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2741 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
147e3846
KW
2742 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8 locale; "
2743 "resolved to \"\\x{%" UVXf "}\".",
357aadde 2744 OP_DESC(PL_op),
ab0b796c
KW
2745 original,
2746 original);
051a06d4
KW
2747 Copy(p, ustrp, *lenp, char);
2748 return original;
2749}
2750
eaf412bf
KW
2751/* The process for changing the case is essentially the same for the four case
2752 * change types, except there are complications for folding. Otherwise the
2753 * difference is only which case to change to. To make sure that they all do
2754 * the same thing, the bodies of the functions are extracted out into the
2755 * following two macros. The functions are written with the same variable
2756 * names, and these are known and used inside these macros. It would be
2757 * better, of course, to have inline functions to do it, but since different
2758 * macros are called, depending on which case is being changed to, this is not
2759 * feasible in C (to khw's knowledge). Two macros are created so that the fold
2760 * function can start with the common start macro, then finish with its special
2761 * handling; while the other three cases can just use the common end macro.
2762 *
2763 * The algorithm is to use the proper (passed in) macro or function to change
2764 * the case for code points that are below 256. The macro is used if using
2765 * locale rules for the case change; the function if not. If the code point is
2766 * above 255, it is computed from the input UTF-8, and another macro is called
2767 * to do the conversion. If necessary, the output is converted to UTF-8. If
2768 * using a locale, we have to check that the change did not cross the 255/256
2769 * boundary, see check_locale_boundary_crossing() for further details.
2770 *
2771 * The macros are split with the correct case change for the below-256 case
2772 * stored into 'result', and in the middle of an else clause for the above-255
2773 * case. At that point in the 'else', 'result' is not the final result, but is
2774 * the input code point calculated from the UTF-8. The fold code needs to
2775 * realize all this and take it from there.
2776 *
2777 * If you read the two macros as sequential, it's easier to understand what's
2778 * going on. */
2779#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
2780 L1_func_extra_param) \
2781 if (flags & (locale_flags)) { \
2782 /* Treat a UTF-8 locale as not being in locale at all */ \
2783 if (IN_UTF8_CTYPE_LOCALE) { \
2784 flags &= ~(locale_flags); \
2785 } \
2786 else { \
2787 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
2788 } \
2789 } \
2790 \
2791 if (UTF8_IS_INVARIANT(*p)) { \
2792 if (flags & (locale_flags)) { \
2793 result = LC_L1_change_macro(*p); \
2794 } \
2795 else { \
2796 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
2797 } \
2798 } \
2799 else if UTF8_IS_DOWNGRADEABLE_START(*p) { \
2800 if (flags & (locale_flags)) { \
2801 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
2802 *(p+1))); \
2803 } \
2804 else { \
2805 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
2806 ustrp, lenp, L1_func_extra_param); \
2807 } \
2808 } \
2809 else { /* malformed UTF-8 */ \
2810 result = valid_utf8_to_uvchr(p, NULL); \
2811
2812#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
2813 result = change_macro(result, p, ustrp, lenp); \
2814 \
2815 if (flags & (locale_flags)) { \
2816 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
2817 } \
2818 return result; \
2819 } \
2820 \
2821 /* Here, used locale rules. Convert back to UTF-8 */ \
2822 if (UTF8_IS_INVARIANT(result)) { \
2823 *ustrp = (U8) result; \
2824 *lenp = 1; \
2825 } \
2826 else { \
2827 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
2828 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
2829 *lenp = 2; \
2830 } \
2831 \
2832 return result;
2833
d3e79532 2834/*
87cea99e 2835=for apidoc to_utf8_upper
d3e79532 2836
1f607577 2837Instead use L</toUPPER_utf8>.
a1433954 2838
d3e79532
JH
2839=cut */
2840
051a06d4 2841/* Not currently externally documented, and subject to change:
31f05a37
KW
2842 * <flags> is set iff iff the rules from the current underlying locale are to
2843 * be used. */
051a06d4 2844
2104c8d9 2845UV
31f05a37 2846Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2847{
051a06d4
KW
2848 UV result;
2849
2850 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2851
eaf412bf
KW
2852 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
2853 /* 2nd char of uc(U+DF) is 'S' */
2854 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
2855 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
983ffd37 2856}
a0ed51b3 2857
d3e79532 2858/*
87cea99e 2859=for apidoc to_utf8_title
d3e79532 2860
1f607577 2861Instead use L</toTITLE_utf8>.
a1433954 2862
d3e79532
JH
2863=cut */
2864
051a06d4 2865/* Not currently externally documented, and subject to change:
31f05a37
KW
2866 * <flags> is set iff the rules from the current underlying locale are to be
2867 * used. Since titlecase is not defined in POSIX, for other than a
2868 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2869 */
051a06d4 2870
983ffd37 2871UV
31f05a37 2872Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2873{
051a06d4
KW
2874 UV result;
2875
2876 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2877
eaf412bf
KW
2878 /* 2nd char of ucfirst(U+DF) is 's' */
2879 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
2880 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
a0ed51b3
LW
2881}
2882
d3e79532 2883/*
87cea99e 2884=for apidoc to_utf8_lower
d3e79532 2885
1f607577 2886Instead use L</toLOWER_utf8>.
a1433954 2887
d3e79532
JH
2888=cut */
2889
051a06d4 2890/* Not currently externally documented, and subject to change:
31f05a37
KW
2891 * <flags> is set iff iff the rules from the current underlying locale are to
2892 * be used.
2893 */
051a06d4 2894
2104c8d9 2895UV
31f05a37 2896Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2897{
051a06d4
KW
2898 UV result;
2899
051a06d4 2900 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2901
eaf412bf
KW
2902 CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
2903 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
b4e400f9
JH
2904}
2905
d3e79532 2906/*
87cea99e 2907=for apidoc to_utf8_fold
d3e79532 2908
1f607577 2909Instead use L</toFOLD_utf8>.
a1433954 2910
d3e79532
JH
2911=cut */
2912
051a06d4
KW
2913/* Not currently externally documented, and subject to change,
2914 * in <flags>
31f05a37
KW
2915 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2916 * locale are to be used.
051a06d4
KW
2917 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2918 * otherwise simple folds
a0270393
KW
2919 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2920 * prohibited
445bf929 2921 */
36bb2ab6 2922
b4e400f9 2923UV
445bf929 2924Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2925{
051a06d4
KW
2926 UV result;
2927
36bb2ab6 2928 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2929
a0270393
KW
2930 /* These are mutually exclusive */
2931 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2932
50ba90ff
KW
2933 assert(p != ustrp); /* Otherwise overwrites */
2934
eaf412bf
KW
2935 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
2936 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
31f05a37 2937
eaf412bf 2938 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2939
1ca267a5
KW
2940 if (flags & FOLD_FLAGS_LOCALE) {
2941
76f2ffcd 2942# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
2943 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
2944
2945# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2946# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
2947
2948 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 2949
538e84ed
KW
2950 /* Special case these two characters, as what normally gets
2951 * returned under locale doesn't work */
76f2ffcd
KW
2952 if (UTF8SKIP(p) == cap_sharp_s_len
2953 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 2954 {
ab0b796c
KW
2955 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2956 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2957 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2958 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
2959 goto return_long_s;
2960 }
0766489e
KW
2961 else
2962#endif
2963 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 2964 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 2965 {
ab0b796c
KW
2966 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2967 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2968 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
2969 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
2970 goto return_ligature_st;
2971 }
74894415
KW
2972
2973#if UNICODE_MAJOR_VERSION == 3 \
2974 && UNICODE_DOT_VERSION == 0 \
2975 && UNICODE_DOT_DOT_VERSION == 1
2976# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
2977
2978 /* And special case this on this Unicode version only, for the same
2979 * reaons the other two are special cased. They would cross the
2980 * 255/256 boundary which is forbidden under /l, and so the code
2981 * wouldn't catch that they are equivalent (which they are only in
2982 * this release) */
2983 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
2984 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
2985 {
2986 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2987 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2988 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
2989 "resolved to \"\\x{0131}\".");
2990 goto return_dotless_i;
2991 }
2992#endif
2993
357aadde 2994 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2995 }
a0270393
KW
2996 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2997 return result;
2998 }
2999 else {
4a4088c4 3000 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
3001 * character above the ASCII range, and the result should not
3002 * contain an ASCII character. */
a0270393
KW
3003
3004 UV original; /* To store the first code point of <p> */
3005
3006 /* Look at every character in the result; if any cross the
3007 * boundary, the whole thing is disallowed */
3008 U8* s = ustrp;
3009 U8* e = ustrp + *lenp;
3010 while (s < e) {
3011 if (isASCII(*s)) {
3012 /* Crossed, have to return the original */
3013 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 3014
9fc2026f 3015 /* But in these instances, there is an alternative we can
1ca267a5 3016 * return that is valid */
0766489e
KW
3017 if (original == LATIN_SMALL_LETTER_SHARP_S
3018#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3019 || original == LATIN_CAPITAL_LETTER_SHARP_S
3020#endif
3021 ) {
1ca267a5
KW
3022 goto return_long_s;
3023 }
9fc2026f
KW
3024 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3025 goto return_ligature_st;
3026 }
74894415
KW
3027#if UNICODE_MAJOR_VERSION == 3 \
3028 && UNICODE_DOT_VERSION == 0 \
3029 && UNICODE_DOT_DOT_VERSION == 1
3030
3031 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3032 goto return_dotless_i;
3033 }
3034#endif
a0270393
KW
3035 Copy(p, ustrp, *lenp, char);
3036 return original;
3037 }
3038 s += UTF8SKIP(s);
3039 }
051a06d4 3040
a0270393
KW
3041 /* Here, no characters crossed, result is ok as-is */
3042 return result;
3043 }
051a06d4
KW
3044 }
3045
4a4088c4 3046 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
3047 if (UTF8_IS_INVARIANT(result)) {
3048 *ustrp = (U8) result;
3049 *lenp = 1;
3050 }
3051 else {
62cb07ea
KW
3052 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3053 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
3054 *lenp = 2;
3055 }
3056
051a06d4 3057 return result;
1ca267a5
KW
3058
3059 return_long_s:
3060 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3061 * folds to a string of two of these characters. By returning this
3062 * instead, then, e.g.,
3063 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3064 * works. */
3065
3066 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3067 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3068 ustrp, *lenp, U8);
3069 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
3070
3071 return_ligature_st:
3072 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3073 * have the other one fold to it */
3074
3075 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3076 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3077 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
3078
3079#if UNICODE_MAJOR_VERSION == 3 \
3080 && UNICODE_DOT_VERSION == 0 \
3081 && UNICODE_DOT_DOT_VERSION == 1
3082
3083 return_dotless_i:
3084 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3085 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3086 return LATIN_SMALL_LETTER_DOTLESS_I;
3087
3088#endif
3089
a0ed51b3
LW
3090}
3091
711a919c 3092/* Note:
f90a9a02 3093 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
3094 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3095 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3096 */
c4a5db0c 3097
a0ed51b3 3098SV*
7fc63493 3099Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 3100{
c4a5db0c
KW
3101 PERL_ARGS_ASSERT_SWASH_INIT;
3102
3103 /* Returns a copy of a swash initiated by the called function. This is the
3104 * public interface, and returning a copy prevents others from doing
3105 * mischief on the original */
3106
5d3d13d1 3107 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
3108}
3109
3110SV*
5d3d13d1 3111Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 3112{
2c1f00b9
YO
3113
3114 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3115 * use the following define */
3116
3117#define CORE_SWASH_INIT_RETURN(x) \
3118 PL_curpm= old_PL_curpm; \
3119 return x
3120
c4a5db0c 3121 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
3122 * by calling utf8_heavy.pl in the general case. The returned value may be
3123 * the swash's inversion list instead if the input parameters allow it.
3124 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
3125 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3126 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
3127 *
3128 * This interface should only be used by functions that won't destroy or
3129 * adversely change the swash, as doing so affects all other uses of the
3130 * swash in the program; the general public should use 'Perl_swash_init'
3131 * instead.
3132 *
3133 * pkg is the name of the package that <name> should be in.
3134 * name is the name of the swash to find. Typically it is a Unicode
3135 * property name, including user-defined ones
3136 * listsv is a string to initialize the swash with. It must be of the form
3137 * documented as the subroutine return value in
3138 * L<perlunicode/User-Defined Character Properties>
3139 * minbits is the number of bits required to represent each data element.
3140 * It is '1' for binary properties.
3141 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 3142 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
3143 * flags_p if non-NULL is the address of various input and output flag bits
3144 * to the routine, as follows: ('I' means is input to the routine;
3145 * 'O' means output from the routine. Only flags marked O are
3146 * meaningful on return.)
3147 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3148 * came from a user-defined property. (I O)
5d3d13d1
KW
3149 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3150 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
3151 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3152 * return of an inversion list instead of a swash hash if this routine
3153 * thinks that would result in faster execution of swash_fetch() later
3154 * on. (I)
9a53f6cf
KW
3155 *
3156 * Thus there are three possible inputs to find the swash: <name>,
3157 * <listsv>, and <invlist>. At least one must be specified. The result
3158 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
3159 * actions can intersect, etc. what <name> gives. To avoid going out to
3160 * disk at all, <invlist> should specify completely what the swash should
3161 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
3162 *
3163 * <invlist> is only valid for binary properties */
c4a5db0c 3164
2c1f00b9
YO
3165 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3166
c4a5db0c 3167 SV* retval = &PL_sv_undef;
83199d38 3168 HV* swash_hv = NULL;
87367d5f
KW
3169 const int invlist_swash_boundary =
3170 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3171 ? 512 /* Based on some benchmarking, but not extensive, see commit
3172 message */
3173 : -1; /* Never return just an inversion list */
9a53f6cf
KW
3174
3175 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3176 assert(! invlist || minbits == 1);
3177
2c1f00b9
YO
3178 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
3179 that triggered the swash init and the swash init perl logic itself.
3180 See perl #122747 */
3181
9a53f6cf
KW
3182 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3183 * so */
3184 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
3185 dSP;
3186 const size_t pkg_len = strlen(pkg);
3187 const size_t name_len = strlen(name);
3188 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3189 SV* errsv_save;
3190 GV *method;
3191
3192 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3193
3194 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 3195 ENTER;
69794297 3196 SAVEHINTS();
2782061f 3197 save_re_context();
650f067c
JL
3198 /* We might get here via a subroutine signature which uses a utf8
3199 * parameter name, at which point PL_subname will have been set
3200 * but not yet used. */
3201 save_item(PL_subname);
69794297
KW
3202 if (PL_parser && PL_parser->error_count)
3203 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3204 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 3205 if (!method) { /* demand load UTF-8 */
69794297 3206 ENTER;
db2c6cb3
FC
3207 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3208 GvSV(PL_errgv) = NULL;
1a419e6b 3209#ifndef NO_TAINT_SUPPORT
69794297
KW
3210 /* It is assumed that callers of this routine are not passing in
3211 * any user derived data. */
2782061f
DM
3212 /* Need to do this after save_re_context() as it will set
3213 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3214 * in Perl_magic_get). Even line to create errsv_save can turn on
3215 * PL_tainted. */
284167a5
S
3216 SAVEBOOL(TAINT_get);
3217 TAINT_NOT;
3218#endif
69794297
KW
3219 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3220 NULL);
eed484f9 3221 {
db2c6cb3
FC
3222 /* Not ERRSV, as there is no need to vivify a scalar we are
3223 about to discard. */
3224 SV * const errsv = GvSV(PL_errgv);
3225 if (!SvTRUE(errsv)) {
3226 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3227 SvREFCNT_dec(errsv);
3228 }
eed484f9 3229 }
69794297
KW
3230 LEAVE;
3231 }
3232 SPAGAIN;
3233 PUSHMARK(SP);
3234 EXTEND(SP,5);
3235 mPUSHp(pkg, pkg_len);
3236 mPUSHp(name, name_len);
3237 PUSHs(listsv);
3238 mPUSHi(minbits);
3239 mPUSHi(none);
3240 PUTBACK;
db2c6cb3
FC
3241 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3242 GvSV(PL_errgv) = NULL;
69794297
KW
3243 /* If we already have a pointer to the method, no need to use
3244 * call_method() to repeat the lookup. */
c41800a8
KW
3245 if (method
3246 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
3247 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3248 {
3249 retval = *PL_stack_sp--;
3250 SvREFCNT_inc(retval);
3251 }
eed484f9 3252 {
db2c6cb3
FC
3253 /* Not ERRSV. See above. */
3254 SV * const errsv = GvSV(PL_errgv);
3255 if (!SvTRUE(errsv)) {
3256 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3257 SvREFCNT_dec(errsv);
3258 }
eed484f9 3259 }
ce3b816e 3260 LEAVE;
69794297
KW
3261 POPSTACK;
3262 if (IN_PERL_COMPILETIME) {
3263 CopHINTS_set(PL_curcop, PL_hints);
3264 }
3265 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 3266 if (SvPOK(retval)) {
69794297
KW
3267
3268 /* If caller wants to handle missing properties, let them */
5d3d13d1 3269 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 3270 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
3271 }
3272 Perl_croak(aTHX_
147e3846 3273 "Can't find Unicode property definition \"%" SVf "\"",
69794297 3274 SVfARG(retval));
a25b5927 3275 NOT_REACHED; /* NOTREACHED */
d95e4a00 3276 }
69794297 3277 }
9a53f6cf 3278 } /* End of calling the module to find the swash */
36eb48b4 3279
83199d38
KW
3280 /* If this operation fetched a swash, and we will need it later, get it */
3281 if (retval != &PL_sv_undef
3282 && (minbits == 1 || (flags_p
3283 && ! (*flags_p
3284 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3285 {
3286 swash_hv = MUTABLE_HV(SvRV(retval));
3287
3288 /* If we don't already know that there is a user-defined component to
3289 * this swash, and the user has indicated they wish to know if there is
3290 * one (by passing <flags_p>), find out */
3291 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3292 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3293 if (user_defined && SvUV(*user_defined)) {
3294 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3295 }
3296 }
3297 }
3298
36eb48b4
KW
3299 /* Make sure there is an inversion list for binary properties */
3300 if (minbits == 1) {
3301 SV** swash_invlistsvp = NULL;
3302 SV* swash_invlist = NULL;
9a53f6cf 3303 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
3304 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3305 an unclaimed reference count */
36eb48b4 3306
9a53f6cf 3307 /* If this operation fetched a swash, get its already existing
83199d38 3308 * inversion list, or create one for it */
36eb48b4 3309
83199d38 3310 if (swash_hv) {
5c9f4bd2 3311 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
3312 if (swash_invlistsvp) {
3313 swash_invlist = *swash_invlistsvp;
3314 invlist_in_swash_is_valid = TRUE;
3315 }
3316 else {
36eb48b4 3317 swash_invlist = _swash_to_invlist(retval);
02c85471 3318 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
3319 }
3320 }
3321
3322 /* If an inversion list was passed in, have to include it */
3323 if (invlist) {
3324
3325 /* Any fetched swash will by now have an inversion list in it;
3326 * otherwise <swash_invlist> will be NULL, indicating that we
3327 * didn't fetch a swash */
3328 if (swash_invlist) {
3329
3330 /* Add the passed-in inversion list, which invalidates the one
3331 * already stored in the swash */
3332 invlist_in_swash_is_valid = FALSE;
eee4c920 3333 SvREADONLY_off(swash_invlist); /* Turned on again below */
9a53f6cf
KW
3334 _invlist_union(invlist, swash_invlist, &swash_invlist);
3335 }
3336 else {
3337
87367d5f
KW
3338 /* Here, there is no swash already. Set up a minimal one, if
3339 * we are going to return a swash */
3340 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3341 swash_hv = newHV();
4aca0fe6 3342 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3343 }
9a53f6cf
KW
3344 swash_invlist = invlist;
3345 }
9a53f6cf
KW
3346 }
3347
3348 /* Here, we have computed the union of all the passed-in data. It may
3349 * be that there was an inversion list in the swash which didn't get
538e84ed 3350 * touched; otherwise save the computed one */
87367d5f
KW
3351 if (! invlist_in_swash_is_valid
3352 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3353 {
5c9f4bd2 3354 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3355 {
3356 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3357 }
cc34d8c5
FC
3358 /* We just stole a reference count. */
3359 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3360 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3361 }
87367d5f 3362
ee3222e3 3363 /* The result is immutable. Forbid attempts to change it. */
dbfdbd26
KW
3364 SvREADONLY_on(swash_invlist);
3365
c41800a8 3366 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3367 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3368 SvREFCNT_dec(retval);
02c85471
FC
3369 if (!swash_invlist_unclaimed)
3370 SvREFCNT_inc_simple_void_NN(swash_invlist);
3371 retval = newRV_noinc(swash_invlist);
87367d5f 3372 }
36eb48b4
KW
3373 }
3374
2c1f00b9
YO
3375 CORE_SWASH_INIT_RETURN(retval);
3376#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3377}
3378
035d37be
JH
3379
3380/* This API is wrong for special case conversions since we may need to
3381 * return several Unicode characters for a single Unicode character
3382 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3383 * the lower-level routine, and it is similarly broken for returning
38684baa 3384 * multiple values. --jhi
b9992569 3385 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3386/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
TS
3387
3388/* Note:
3389 * Returns the value of property/mapping C<swash> for the first character
3390 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3391 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3392 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3393 *
3394 * A "swash" is a hash which contains initially the keys/values set up by
3395 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3396 * property for all possible code points. Things are stored in a compact form
3397 * (see utf8_heavy.pl) so that calculation is required to find the actual
3398 * property value for a given code point. As code points are looked up, new
3399 * key/value pairs are added to the hash, so that the calculation doesn't have
3400 * to ever be re-done. Further, each calculation is done, not just for the
3401 * desired one, but for a whole block of code points adjacent to that one.
3402 * For binary properties on ASCII machines, the block is usually for 64 code
3403 * points, starting with a code point evenly divisible by 64. Thus if the
3404 * property value for code point 257 is requested, the code goes out and
3405 * calculates the property values for all 64 code points between 256 and 319,
3406 * and stores these as a single 64-bit long bit vector, called a "swatch",
3407 * under the key for code point 256. The key is the UTF-8 encoding for code
3408 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3409 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3410 * for code point 258 is then requested, this code realizes that it would be
3411 * stored under the key for 256, and would find that value and extract the
3412 * relevant bit, offset from 256.
3413 *
3414 * Non-binary properties are stored in as many bits as necessary to represent
3415 * their values (32 currently, though the code is more general than that), not
fc273927 3416 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3417 * vector that encompasses the property values for all code points whose UTF-8
3418 * representations are represented by the key. That is, for all code points
3419 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3420 * bytes of that.
680c470c 3421 */
a0ed51b3 3422UV
680c470c 3423Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3424{
ef8f7699 3425 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3426 U32 klen;
3427 U32 off;
9b56a019 3428 STRLEN slen = 0;
7d85a32c 3429 STRLEN needents;
cfd0369c 3430 const U8 *tmps = NULL;
979f2922 3431 SV *swatch;
08fb1ac5 3432 const U8 c = *ptr;
3568d838 3433
7918f24d
NC
3434 PERL_ARGS_ASSERT_SWASH_FETCH;
3435
87367d5f
KW
3436 /* If it really isn't a hash, it isn't really swash; must be an inversion
3437 * list */
3438 if (SvTYPE(hv) != SVt_PVHV) {
3439 return _invlist_contains_cp((SV*)hv,
3440 (do_utf8)
3441 ? valid_utf8_to_uvchr(ptr, NULL)
3442 : c);
3443 }
3444
08fb1ac5
KW
3445 /* We store the values in a "swatch" which is a vec() value in a swash
3446 * hash. Code points 0-255 are a single vec() stored with key length
3447 * (klen) 0. All other code points have a UTF-8 representation
3448 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3449 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3450 * length for them is the length of the encoded char - 1. ptr[klen] is the
3451 * final byte in the sequence representing the character */
3452 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3453 klen = 0;
3454 needents = 256;
3455 off = c;
3568d838 3456 }
08fb1ac5
KW
3457 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3458 klen = 0;
3459 needents = 256;
a62b247b 3460 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
TS
3461 }
3462 else {
08fb1ac5
KW
3463 klen = UTF8SKIP(ptr) - 1;
3464
3465 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3466 * the vec is the final byte in the sequence. (In EBCDIC this is
3467 * converted to I8 to get consecutive values.) To help you visualize
3468 * all this:
3469 * Straight 1047 After final byte
3470 * UTF-8 UTF-EBCDIC I8 transform
3471 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3472 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3473 * ...
3474 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3475 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3476 * ...
3477 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3478 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3479 * ...
3480 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3481 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3482 * ...
3483 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3484 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3485 *
3486 * (There are no discontinuities in the elided (...) entries.)
3487 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3488 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3489 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3490 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3491 * index into the vec() swatch (after subtracting 0x80, which we
3492 * actually do with an '&').
3493 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3494 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3495 * dicontinuities which go away by transforming it into I8, and we
3496 * effectively subtract 0xA0 to get the index. */
979f2922 3497 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3498 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3499 }
7d85a32c 3500
a0ed51b3 3501 /*
4a4088c4 3502 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3503 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3504 * it's nothing to sniff at.) Pity we usually come through at least
3505 * two function calls to get here...
3506 *
3507 * NB: this code assumes that swatches are never modified, once generated!
3508 */
3509
3568d838 3510 if (hv == PL_last_swash_hv &&
a0ed51b3 3511 klen == PL_last_swash_klen &&
27da23d5 3512 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3513 {
3514 tmps = PL_last_swash_tmps;
3515 slen = PL_last_swash_slen;
3516 }
3517 else {
3518 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3519 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3520
b0e3252e 3521 /* If not cached, generate it via swatch_get */
979f2922 3522 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3523 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3524 {
3525 if (klen) {
3526 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3527 swatch = swatch_get(swash,
3528 code_point & ~((UV)needents - 1),
3529 needents);
3530 }
3531 else { /* For the first 256 code points, the swatch has a key of
3532 length 0 */
3533 swatch = swatch_get(swash, 0, needents);
3534 }
979f2922 3535
923e4eb5 3536 if (IN_PERL_COMPILETIME)
623e6609 3537 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3538
979f2922 3539 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3540
979f2922
TS
3541 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3542 || (slen << 3) < needents)
5637ef5b 3543 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
147e3846 3544 "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
5637ef5b 3545 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3546 }
3547
3548 PL_last_swash_hv = hv;
16d8f38a 3549 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3550 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3551 /* FIXME change interpvar.h? */
3552 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3553 PL_last_swash_slen = slen;
3554 if (klen)
3555 Copy(ptr, PL_last_swash_key, klen, U8);
3556 }
3557
9faf8d75 3558 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3559 case 1:
e7aca353 3560 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3561 case 8:
e7aca353 3562 return ((UV) tmps[off]);
a0ed51b3
LW
3563 case 16:
3564 off <<= 1;
e7aca353
JH
3565 return
3566 ((UV) tmps[off ] << 8) +
3567 ((UV) tmps[off + 1]);
a0ed51b3
LW
3568 case 32:
3569 off <<= 2;
e7aca353
JH
3570 return
3571 ((UV) tmps[off ] << 24) +
3572 ((UV) tmps[off + 1] << 16) +
3573 ((UV) tmps[off + 2] << 8) +
3574 ((UV) tmps[off + 3]);
a0ed51b3 3575 }
5637ef5b 3576 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
147e3846 3577 "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
670f1322 3578 NORETURN_FUNCTION_END;
a0ed51b3 3579}
2b9d42f0 3580
319009ee
KW
3581/* Read a single line of the main body of the swash input text. These are of
3582 * the form:
3583 * 0053 0056 0073
3584 * where each number is hex. The first two numbers form the minimum and
3585 * maximum of a range, and the third is the value associated with the range.
3586 * Not all swashes should have a third number
3587 *
3588 * On input: l points to the beginning of the line to be examined; it points
3589 * to somewhere in the string of the whole input text, and is
3590 * terminated by a \n or the null string terminator.
3591 * lend points to the null terminator of that string
3592 * wants_value is non-zero if the swash expects a third number
3593 * typestr is the name of the swash's mapping, like 'ToLower'
3594 * On output: *min, *max, and *val are set to the values read from the line.
3595 * returns a pointer just beyond the line examined. If there was no
3596 * valid min number on the line, returns lend+1
3597 */
3598
3599STATIC U8*
3600S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3601 const bool wants_value, const U8* const typestr)
3602{
3603 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3604 STRLEN numlen; /* Length of the number */
02470786
KW
3605 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3606 | PERL_SCAN_DISALLOW_PREFIX
3607 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3608
3609 /* nl points to the next \n in the scan */
3610 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3611
95543e92
KW
3612 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3613
319009ee
KW
3614 /* Get the first number on the line: the range minimum */
3615 numlen = lend - l;
3616 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3617 *max = *min; /* So can never return without setting max */
319009ee
KW
3618 if (numlen) /* If found a hex number, position past it */
3619 l += numlen;
3620 else if (nl) { /* Else, go handle next line, if any */
3621 return nl + 1; /* 1 is length of "\n" */
3622 }
3623 else { /* Else, no next line */
3624 return lend + 1; /* to LIST's end at which \n is not found */
3625 }
3626
3627 /* The max range value follows, separated by a BLANK */
3628 if (isBLANK(*l)) {
3629 ++l;
02470786
KW
3630 flags = PERL_SCAN_SILENT_ILLDIGIT
3631 | PERL_SCAN_DISALLOW_PREFIX
3632 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3633 numlen = lend - l;
3634 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3635 if (numlen)
3636 l += numlen;
3637 else /* If no value here, it is a single element range */
3638 *max = *min;
3639
3640 /* Non-binary tables have a third entry: what the first element of the
24303724 3641 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3642 if (wants_value) {
3643 if (isBLANK(*l)) {
3644 ++l;
f2a7d0fc
KW
3645 flags = PERL_SCAN_SILENT_ILLDIGIT
3646 | PERL_SCAN_DISALLOW_PREFIX
3647 | PERL_SCAN_SILENT_NON_PORTABLE;
3648 numlen = lend - l;
3649 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3650 if (numlen)
3651 l += numlen;
3652 else
3653 *val = 0;
319009ee
KW
3654 }
3655 else {
3656 *val = 0;
3657 if (typeto) {
dcbac5bb 3658 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3659 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3660 typestr, l);
3661 }
3662 }
3663 }
3664 else
3665 *val = 0; /* bits == 1, then any val should be ignored */
3666 }
3667 else { /* Nothing following range min, should be single element with no
3668 mapping expected */
319009ee
KW
3669 if (wants_value) {
3670 *val = 0;
3671 if (typeto) {
dcbac5bb 3672 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3673 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3674 }
3675 }
3676 else
3677 *val = 0; /* bits == 1, then val should be ignored */
3678 }
3679
3680 /* Position to next line if any, or EOF */
3681 if (nl)
3682 l = nl + 1;
3683 else
3684 l = lend;
3685
3686 return l;
3687}
3688
979f2922
TS
3689/* Note:
3690 * Returns a swatch (a bit vector string) for a code point sequence
3691 * that starts from the value C<start> and comprises the number C<span>.
3692 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3693 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3694 */
3695STATIC SV*
b0e3252e 3696S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
TS
3697{
3698 SV *swatch;
77f9f126 3699 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3700 STRLEN lcur, xcur, scur;
ef8f7699 3701 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3702 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3703
88d45d28
KW
3704 SV** listsvp = NULL; /* The string containing the main body of the table */
3705 SV** extssvp = NULL;
3706 SV** invert_it_svp = NULL;
3707 U8* typestr = NULL;
786861f5
KW
3708 STRLEN bits;
3709 STRLEN octets; /* if bits == 1, then octets == 0 */
3710 UV none;
3711 UV end = start + span;
972dd592 3712
36eb48b4 3713 if (invlistsvp == NULL) {
786861f5
KW
3714 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3715 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3716 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3717 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3718 listsvp = hv_fetchs(hv, "LIST", FALSE);
3719 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3720
3721 bits = SvUV(*bitssvp);
3722 none = SvUV(*nonesvp);
3723 typestr = (U8*)SvPV_nolen(*typesvp);
3724 }
36eb48b4
KW
3725 else {
3726 bits = 1;
3727 none = 0;
3728 }
786861f5 3729 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3730
b0e3252e 3731 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3732
979f2922 3733 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
147e3846 3734 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
660a4616 3735 (UV)bits);
979f2922
TS
3736 }
3737
84ea5ef6
KW
3738 /* If overflowed, use the max possible */
3739 if (end < start) {
3740 end = UV_MAX;
3741 span = end - start;
3742 }
3743
979f2922 3744 /* create and initialize $swatch */
979f2922 3745 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3746 swatch = newSV(scur);
3747 SvPOK_on(swatch);
979f2922
TS
3748 s = (U8*)SvPVX(swatch);
3749 if (octets && none) {
0bd48802 3750 const U8* const e = s + scur;
979f2922
TS
3751 while (s < e) {
3752 if (bits == 8)
3753 *s++ = (U8)(none & 0xff);
3754 else if (bits == 16) {
3755 *s++ = (U8)((none >> 8) & 0xff);
3756 *s++ = (U8)( none & 0xff);
3757 }
3758 else if (bits == 32) {
3759 *s++ = (U8)((none >> 24) & 0xff);
3760 *s++ = (U8)((none >> 16) & 0xff);
3761 *s++ = (U8)((none >> 8) & 0xff);
3762 *s++ = (U8)( none & 0xff);
3763 }
3764 }
3765 *s = '\0';
3766 }
3767 else {
3768 (void)memzero((U8*)s, scur + 1);
3769 }
3770 SvCUR_set(swatch, scur);
3771 s = (U8*)SvPVX(swatch);
3772
36eb48b4
KW
3773 if (invlistsvp) { /* If has an inversion list set up use that */
3774 _invlist_populate_swatch(*invlistsvp, start, end, s);
3775 return swatch;
3776 }
3777
3778 /* read $swash->{LIST} */
979f2922
TS
3779 l = (U8*)SvPV(*listsvp, lcur);
3780 lend = l + lcur;
3781 while (l < lend) {
8ed25d53 3782 UV min, max, val, upper;
95543e92
KW
3783 l = swash_scan_list_line(l, lend, &min, &max, &val,
3784 cBOOL(octets), typestr);
319009ee 3785 if (l > lend) {
979f2922
TS
3786 break;
3787 }
3788
972dd592 3789 /* If looking for something beyond this range, go try the next one */
979f2922
TS
3790 if (max < start)
3791 continue;
3792
8ed25d53
KW
3793 /* <end> is generally 1 beyond where we want to set things, but at the
3794 * platform's infinity, where we can't go any higher, we want to
3795 * include the code point at <end> */
3796 upper = (max < end)
3797 ? max
3798 : (max != UV_MAX || end != UV_MAX)
3799 ? end - 1
3800 : end;
3801
979f2922 3802 if (octets) {
35da51f7 3803 UV key;
979f2922
TS
3804 if (min < start) {
3805 if (!none || val < none) {
3806 val += start - min;
3807 }
3808 min = start;
3809 }
8ed25d53 3810 for (key = min; key <= upper; key++) {
979f2922 3811 STRLEN offset;
979f2922
TS
3812 /* offset must be non-negative (start <= min <= key < end) */
3813 offset = octets * (key - start);
3814 if (bits == 8)
3815 s[offset] = (U8)(val & 0xff);
3816 else if (bits == 16) {
3817 s[offset ] = (U8)((val >> 8) & 0xff);
3818 s[offset + 1] = (U8)( val & 0xff);
3819 }
3820 else if (bits == 32) {
3821 s[offset ] = (U8)((val >> 24) & 0xff);
3822 s[offset + 1] = (U8)((val >> 16) & 0xff);
3823 s[offset + 2] = (U8)((val >> 8) & 0xff);
3824 s[offset + 3] = (U8)( val & 0xff);
3825 }
3826
3827 if (!none || val < none)
3828 ++val;
3829 }
3830 }
711a919c 3831 else { /* bits == 1, then val should be ignored */
35da51f7 3832 UV key;
979f2922
TS
3833 if (min < start)
3834 min = start;
6cb05c12 3835
8ed25d53 3836 for (key = min; key <= upper; key++) {
0bd48802 3837 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
3838 s[offset >> 3] |= 1 << (offset & 7);
3839 }
3840 }
3841 } /* while */
979f2922 3842
9479a769 3843 /* Invert if the data says it should be. Assumes that bits == 1 */
77f9f126 3844 if (invert_it_svp && SvUV(*invert_it_svp)) {
0bda3001
KW
3845
3846 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3847 * be 0, and their inversion should also be 0, as we don't succeed any
3848 * Unicode property matches for non-Unicode code points */
3849 if (start <= PERL_UNICODE_MAX) {
3850
3851 /* The code below assumes that we never cross the
3852 * Unicode/above-Unicode boundary in a range, as otherwise we would
3853 * have to figure out where to stop flipping the bits. Since this
3854 * boundary is divisible by a large power of 2, and swatches comes
3855 * in small powers of 2, this should be a valid assumption */
3856 assert(start + span - 1 <= PERL_UNICODE_MAX);
3857
507a8485
KW
3858 send = s + scur;
3859 while (s < send) {
3860 *s = ~(*s);
3861 s++;
3862 }
0bda3001 3863 }
77f9f126
KW
3864 }
3865
d73c39c5
KW
3866 /* read $swash->{EXTRAS}
3867 * This code also copied to swash_to_invlist() below */
979f2922
TS
3868 x = (U8*)SvPV(*extssvp, xcur);
3869 xend = x + xcur;
3870 while (x < xend) {
3871 STRLEN namelen;
3872 U8 *namestr;
3873 SV** othersvp;
3874 HV* otherhv;
3875 STRLEN otherbits;
3876 SV **otherbitssvp, *other;
711a919c 3877 U8 *s, *o, *nl;
979f2922
TS
3878 STRLEN slen, olen;
3879
35da51f7 3880 const U8 opc = *x++;
979f2922
TS
3881 if (opc == '\n')
3882 continue;
3883
3884 nl = (U8*)memchr(x, '\n', xend - x);
3885
3886 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3887 if (nl) {
3888 x = nl + 1; /* 1 is length of "\n" */
3889 continue;
3890 }
3891 else {
3892 x = xend; /* to EXTRAS' end at which \n is not found */
3893 break;
3894 }
3895 }
3896
3897 namestr = x;
3898 if (nl) {
3899 namelen = nl - namestr;
3900 x = nl + 1;
3901 }
3902 else {
3903 namelen = xend - namestr;
3904 x = xend;
3905 }
3906
3907 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 3908 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 3909 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
3910 otherbits = (STRLEN)SvUV(*otherbitssvp);
3911 if (bits < otherbits)
5637ef5b 3912 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
147e3846 3913 "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
979f2922
TS
3914
3915 /* The "other" swatch must be destroyed after. */
b0e3252e 3916 other = swatch_get(*othersvp, start, span);
979f2922
TS
3917 o = (U8*)SvPV(other, olen);
3918
3919 if (!olen)
b0e3252e 3920 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
979f2922
TS
3921
3922 s = (U8*)SvPV(swatch, slen);
3923 if (bits == 1 && otherbits == 1) {
3924 if (slen != olen)
5637ef5b 3925 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
147e3846 3926 "mismatch, slen=%" UVuf ", olen=%" UVuf,
5637ef5b 3927 (UV)slen, (UV)olen);
979f2922
TS
3928
3929 switch (opc) {
3930 case '+':
3931 while (slen--)
3932 *s++ |= *o++;
3933 break;
3934 case '!':
3935 while (slen--)
3936 *s++ |= ~*o++;
3937 break;
3938 case '-':
3939 while (slen--)
3940 *s++ &= ~*o++;
3941 break;
3942 case '&':
3943 while (slen--)
3944 *s++ &= *o++;
3945 break;
3946 default:
3947 break;
3948 }
3949 }
711a919c 3950 else {
979f2922
TS
3951 STRLEN otheroctets = otherbits >> 3;
3952 STRLEN offset = 0;
35da51f7 3953 U8* const send = s + slen;
979f2922
TS
3954
3955 while (s < send) {
3956 UV otherval = 0;
3957
3958 if (otherbits == 1) {
3959 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3960 ++offset;
3961 }
3962 else {
3963 STRLEN vlen = otheroctets;
3964 otherval = *o++;
3965 while (--vlen) {
3966 otherval <<= 8;
3967 otherval |= *o++;
3968 }
3969 }
3970
711a919c 3971 if (opc == '+' && otherval)
6f207bd3 3972 NOOP; /* replace with otherval */
979f2922
TS
3973 else if (opc == '!' && !otherval)
3974 otherval = 1;
3975 else if (opc == '-' && otherval)
3976 otherval = 0;
3977 else if (opc == '&' && !otherval)
3978 otherval = 0;
3979 else {
711a919c 3980 s += octets; /* no replacement */
979f2922
TS
3981 continue;
3982 }
3983
3984 if (bits == 8)
3985 *s++ = (U8)( otherval & 0xff);
3986 else if (bits == 16) {
3987 *s++ = (U8)((otherval >> 8) & 0xff);
3988 *s++ = (U8)( otherval & 0xff);
3989 }
3990 else if (bits == 32) {
3991 *s++ = (U8)((otherval >> 24) & 0xff);
3992 *s++ = (U8)((otherval >> 16) & 0xff);
3993 *s++ = (U8)((otherval >> 8) & 0xff);
3994 *s++ = (U8)( otherval & 0xff);
3995 }
3996 }
3997 }
3998 sv_free(other); /* through with it! */
3999 } /* while */
4000 return swatch;
4001}
4002
064c021d 4003HV*
4c2e1131 4004Perl__swash_inversion_hash(pTHX_ SV* const swash)
064c021d
KW
4005{
4006
79a2a0e8 4007 /* Subject to change or removal. For use only in regcomp.c and regexec.c
5662e334
KW
4008 * Can't be used on a property that is subject to user override, as it
4009 * relies on the value of SPECIALS in the swash which would be set by
4010 * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
4011 * for overridden properties
064c021d
KW
4012 *
4013 * Returns a hash which is the inversion and closure of a swash mapping.
4014 * For example, consider the input lines:
4015 * 004B 006B
4016 * 004C 006C
4017 * 212A 006B
4018 *
4a4088c4 4019 * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
064c021d 4020 * 006C. The value for each key is an array. For 006C, the array would
4a4088c4
KW
4021 * have two elements, the UTF-8 for itself, and for 004C. For 006B, there
4022 * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
064c021d 4023 *
538e84ed
KW
4024 * Note that there are no elements in the hash for 004B, 004C, 212A. The
4025 * keys are only code points that are folded-to, so it isn't a full closure.
4026 *
064c021d
KW
4027 * Essentially, for any code point, it gives all the code points that map to
4028 * it, or the list of 'froms' for that point.
4029 *
5662e334
KW
4030 * Currently it ignores any additions or deletions from other swashes,
4031 * looking at just the main body of the swash, and if there are SPECIALS
4032 * in the swash, at that hash
4033 *
4034 * The specials hash can be extra code points, and most likely consists of
4035 * maps from single code points to multiple ones (each expressed as a string
4a4088c4 4036 * of UTF-8 characters). This function currently returns only 1-1 mappings.
5662e334
KW
4037 * However consider this possible input in the specials hash:
4038 * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
4039 * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
4040 *
4041 * Both FB05 and FB06 map to the same multi-char sequence, which we don't
4042 * currently handle. But it also means that FB05 and FB06 are equivalent in
4043 * a 1-1 mapping which we should handle, and this relationship may not be in
4044 * the main table. Therefore this function examines all the multi-char
74894415
KW
4045 * sequences and adds the 1-1 mappings that come out of that.
4046 *
4047 * XXX This function was originally intended to be multipurpose, but its
4048 * only use is quite likely to remain for constructing the inversion of
4049 * the CaseFolding (//i) property. If it were more general purpose for
4050 * regex patterns, it would have to do the FB05/FB06 game for simple folds,
4051 * because certain folds are prohibited under /iaa and /il. As an example,
4052 * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
4053 * equivalent under /i. But under /iaa and /il, the folds to 'i' are
4054 * prohibited, so we would not figure out that they fold to each other.
4055 * Code could be written to automatically figure this out, similar to the
4056 * code that does this for multi-character folds, but this is the only case
4057 * where something like this is ever likely to happen, as all the single
7ee537e6 4058 * char folds to the 0-255 range are now quite settled. Instead there is a
74894415
KW
4059 * little special code that is compiled only for this Unicode version. This
4060 * is smaller and didn't require much coding time to do. But this makes
4061 * this routine strongly tied to being used just for CaseFolding. If ever
4062 * it should be generalized, this would have to be fixed */
064c021d
KW
4063
4064 U8 *l, *lend;
4065 STRLEN lcur;
4066 HV *const hv = MUTABLE_HV(SvRV(swash));
4067
923b6d4e
KW
4068 /* The string containing the main body of the table. This will have its
4069 * assertion fail if the swash has been converted to its inversion list */
064c021d
KW
4070 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
4071
4072 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
4073 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
4074 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
4075 /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
4076 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
4077 const STRLEN bits = SvUV(*bitssvp);
4078 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
4079 const UV none = SvUV(*nonesvp);
5662e334 4080 SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
064c021d
KW
4081
4082 HV* ret = newHV();
4083
4084 PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
4085
4086 /* Must have at least 8 bits to get the mappings */
4087 if (bits != 8 && bits != 16 && bits != 32) {
147e3846 4088 Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %" UVuf,
064c021d
KW
4089 (UV)bits);
4090 }
4091
5662e334
KW
4092 if (specials_p) { /* It might be "special" (sometimes, but not always, a
4093 mapping to more than one character */
4094
4095 /* Construct an inverse mapping hash for the specials */
4096 HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
4097 HV * specials_inverse = newHV();
4098 char *char_from; /* the lhs of the map */
4099 I32 from_len; /* its byte length */
4100 char *char_to; /* the rhs of the map */
4101 I32 to_len; /* its byte length */
4102 SV *sv_to; /* and in a sv */
4103 AV* from_list; /* list of things that map to each 'to' */
4104
4105 hv_iterinit(specials_hv);
4106
4a4088c4
KW
4107 /* The keys are the characters (in UTF-8) that map to the corresponding
4108 * UTF-8 string value. Iterate through the list creating the inverse
5662e334
KW
4109 * list. */
4110 while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
4111 SV** listp;
4112 if (! SvPOK(sv_to)) {
5637ef5b
NC
4113 Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
4114 "unexpectedly is not a string, flags=%lu",
4115 (unsigned long)SvFLAGS(sv_to));
5662e334 4116 }
147e3846 4117 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
5662e334
KW
4118
4119 /* Each key in the inverse list is a mapped-to value, and the key's
4a4088c4 4120 * hash value is a list of the strings (each in UTF-8) that map to
5662e334
KW
4121 * it. Those strings are all one character long */
4122 if ((listp = hv_fetch(specials_inverse,
4123 SvPVX(sv_to),
4124 SvCUR(sv_to), 0)))
4125 {
4126 from_list = (AV*) *listp;
4127 }
4128 else { /* No entry yet for it: create one */
4129 from_list = newAV();
4130 if (! hv_store(specials_inverse,
4131 SvPVX(sv_to),
4132 SvCUR(sv_to),
4133 (SV*) from_list, 0))
4134 {
4135 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4136 }
4137 }
4138
4139 /* Here have the list associated with this 'to' (perhaps newly
4140 * created and empty). Just add to it. Note that we ASSUME that
4141 * the input is guaranteed to not have duplications, so we don't
4142 * check for that. Duplications just slow down execution time. */
4143 av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
4144 }
4145
4146 /* Here, 'specials_inverse' contains the inverse mapping. Go through
4147 * it looking for cases like the FB05/FB06 examples above. There would
4148 * be an entry in the hash like
4149 * 'st' => [ FB05, FB06 ]
4150 * In this example we will create two lists that get stored in the
4151 * returned hash, 'ret':
4152 * FB05 => [ FB05, FB06 ]
4153 * FB06 => [ FB05, FB06 ]
4154 *
4155 * Note that there is nothing to do if the array only has one element.
4156 * (In the normal 1-1 case handled below, we don't have to worry about
4157 * two lists, as everything gets tied to the single list that is
4158 * generated for the single character 'to'. But here, we are omitting
4159 * that list, ('st' in the example), so must have multiple lists.) */
4160 while ((from_list = (AV *) hv_iternextsv(specials_inverse,
4161 &char_to, &to_len)))
4162 {
7e13d567 4163 if (av_tindex_nomg(from_list) > 0) {
c70927a6 4164 SSize_t i;
5662e334
KW
4165
4166 /* We iterate over all combinations of i,j to place each code
4167 * point on each list */
7e13d567 4168 for (i = 0; i <= av_tindex_nomg(from_list); i++) {
c70927a6 4169 SSize_t j;
5662e334
KW
4170 AV* i_list = newAV();
4171 SV** entryp = av_fetch(from_list, i, FALSE);
4172 if (entryp == NULL) {
4173 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4174 }
4175 if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
4176 Perl_croak(aTHX_ "panic: unexpected entry for %s", SvPVX(*entryp));
4177 }
4178 if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
4179 (SV*) i_list, FALSE))
4180 {
4181 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4182 }
4183
538e84ed 4184 /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
7e13d567 4185 for (j = 0; j <= av_tindex_nomg(from_list); j++) {
5662e334
KW
4186 entryp = av_fetch(from_list, j, FALSE);
4187 if (entryp == NULL) {
4188 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4189 }
4190
4191 /* When i==j this adds itself to the list */
4b88fb76
KW
4192 av_push(i_list, newSVuv(utf8_to_uvchr_buf(
4193 (U8*) SvPVX(*entryp),
4194 (U8*) SvPVX(*entryp) + SvCUR(*entryp),
4195 0)));
147e3846 4196 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
5662e334
KW
4197 }
4198 }
4199 }
4200 }
4201 SvREFCNT_dec(specials_inverse); /* done with it */
4202 } /* End of specials */
4203
064c021d 4204 /* read $swash->{LIST} */
74894415
KW
4205
4206#if UNICODE_MAJOR_VERSION == 3 \
4207 && UNICODE_DOT_VERSION == 0 \
4208 && UNICODE_DOT_DOT_VERSION == 1
4209
4210 /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
4211 * rule so that things work under /iaa and /il */
4212
4213 SV * mod_listsv = sv_mortalcopy(*listsvp);
4214 sv_catpv(mod_listsv, "130\t130\t131\n");
4215 l = (U8*)SvPV(mod_listsv, lcur);
4216
4217#else
4218
064c021d 4219 l = (U8*)SvPV(*listsvp, lcur);
74894415
KW
4220
4221#endif
4222
064c021d
KW
4223 lend = l + lcur;
4224
4225 /* Go through each input line */
4226 while (l < lend) {
4227 UV min, max, val;
4228 UV inverse;
95543e92
KW
4229 l = swash_scan_list_line(l, lend, &min, &max, &val,
4230 cBOOL(octets), typestr);
064c021d
KW
4231 if (l > lend) {
4232 break;
4233 }
4234
4235 /* Each element in the range is to be inverted */
4236 for (inverse = min; inverse <= max; inverse++) {
4237 AV* list;
064c021d
KW
4238 SV** listp;
4239 IV i;
4240 bool found_key = FALSE;
5662e334 4241 bool found_inverse = FALSE;
064c021d
KW
4242
4243 /* The key is the inverse mapping */
4244 char key[UTF8_MAXBYTES+1];
c80e42f3 4245 char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
064c021d
KW
4246 STRLEN key_len = key_end - key;
4247
064c021d
KW
4248 /* Get the list for the map */
4249 if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
4250 list = (AV*) *listp;
4251 }
4252 else { /* No entry yet for it: create one */
4253 list = newAV();
4254 if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
4255 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
4256 }
4257 }
4258
5662e334
KW
4259 /* Look through list to see if this inverse mapping already is
4260 * listed, or if there is a mapping to itself already */
7e13d567 4261 for (i = 0; i <= av_tindex_nomg(list); i++) {
064c021d
KW
4262 SV** entryp = av_fetch(list, i, FALSE);
4263 SV* entry;
414db8a1 4264 UV uv;
064c021d
KW
4265 if (entryp == NULL) {
4266 Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
4267 }
4268 entry = *entryp;
414db8a1 4269 uv = SvUV(entry);
147e3846 4270 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/
414db8a1 4271 if (uv == val) {
064c021d 4272 found_key = TRUE;
5662e334 4273 }
414db8a1 4274 if (uv == inverse) {
5662e334
KW
4275 found_inverse = TRUE;
4276 }
4277
4278 /* No need to continue searching if found everything we are
4279 * looking for */
4280 if (found_key && found_inverse) {
064c021d
KW
4281 break;
4282 }
4283 }
56ca34ca
KW
4284
4285 /* Make sure there is a mapping to itself on the list */
064c021d 4286 if (! found_key) {
d397ff6a 4287 av_push(list, newSVuv(val));
147e3846 4288 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/
064c021d
KW
4289 }
4290
4291
4292 /* Simply add the value to the list */
5662e334
KW
4293 if (! found_inverse) {
4294 av_push(list, newSVuv(inverse));
147e3846 4295 /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/
5662e334 4296 }
064c021d 4297
b0e3252e 4298 /* swatch_get() increments the value of val for each element in the
064c021d
KW
4299 * range. That makes more compact tables possible. You can
4300 * express the capitalization, for example, of all consecutive
4301 * letters with a single line: 0061\t007A\t0041 This maps 0061 to
4302 * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
bd3f2f94 4303 * and it's not documented; it appears to be used only in
b0e3252e 4304 * implementing tr//; I copied the semantics from swatch_get(), just
bd3f2f94 4305 * in case */
064c021d
KW
4306 if (!none || val < none) {
4307 ++val;
4308 }
4309 }
4310 }
4311
4312 return ret;
4313}
4314
a25abddc 4315SV*
d764b54e
KW
4316Perl__swash_to_invlist(pTHX_ SV* const swash)
4317{
4318
ed92f1b3
KW
4319 /* Subject to change or removal. For use only in one place in regcomp.c.
4320 * Ownership is given to one reference count in the returned SV* */
d764b54e
KW
4321
4322 U8 *l, *lend;
4323 char *loc;
4324 STRLEN lcur;
4325 HV *const hv = MUTABLE_HV(SvRV(swash));
4326 UV elements = 0; /* Number of elements in the inversion list */
b443038a 4327 U8 empty[] = "";
923b6d4e
KW
4328 SV** listsvp;
4329 SV** typesvp;
4330 SV** bitssvp;
4331 SV** extssvp;
4332 SV** invert_it_svp;
d764b54e 4333
923b6d4e
KW
4334 U8* typestr;
4335 STRLEN bits;
4336 STRLEN octets; /* if bits == 1, then octets == 0 */
d73c39c5
KW
4337 U8 *x, *xend;
4338 STRLEN xcur;
d764b54e 4339
a25abddc 4340 SV* invlist;
d764b54e 4341
b81740c0
KW
4342 PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
4343
923b6d4e
KW
4344 /* If not a hash, it must be the swash's inversion list instead */
4345 if (SvTYPE(hv) != SVt_PVHV) {
ed92f1b3 4346 return SvREFCNT_inc_simple_NN((SV*) hv);
923b6d4e
KW
4347 }
4348
4349 /* The string containing the main body of the table */
4350 listsvp = hv_fetchs(hv, "LIST", FALSE);
4351 typesvp = hv_fetchs(hv, "TYPE", FALSE);
4352 bitssvp = hv_fetchs(hv, "BITS", FALSE);
4353 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
4354 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
4355
4356 typestr = (U8*)SvPV_nolen(*typesvp);
4357 bits = SvUV(*bitssvp);
4358 octets = bits >> 3; /* if bits == 1, then octets == 0 */
4359
d764b54e 4360 /* read $swash->{LIST} */
b443038a
KW
4361 if (SvPOK(*listsvp)) {
4362 l = (U8*)SvPV(*listsvp, lcur);
4363 }
4364 else {
4365 /* LIST legitimately doesn't contain a string during compilation phases
4366 * of Perl itself, before the Unicode tables are generated. In this
4367 * case, just fake things up by creating an empty list */
4368 l = empty;
4369 lcur = 0;
4370 }
d764b54e
KW
4371 loc = (char *) l;
4372 lend = l + lcur;
4373
31aa6e0b 4374 if (*l == 'V') { /* Inversion list format */
99944878 4375 const char *after_atou = (char *) lend;
31aa6e0b
KW
4376 UV element0;
4377 UV* other_elements_ptr;
4378
4379 /* The first number is a count of the rest */
4380 l++;
22ff3130
HS
4381 if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
4382 Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
4383 }
eb092534
KW
4384 if (elements == 0) {
4385 invlist = _new_invlist(0);
4386 }
4387 else {
99944878
JH
4388 while (isSPACE(*l)) l++;
4389 l = (U8 *) after_atou;
1f9f7d4c
KW
4390
4391 /* Get the 0th element, which is needed to setup the inversion list */
99944878 4392 while (isSPACE(*l)) l++;
22ff3130
HS
4393 if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
4394 Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
4395 }
99944878 4396 l = (U8 *) after_atou;
1f9f7d4c
KW
4397 invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
4398 elements--;
4399
4400 /* Then just populate the rest of the input */
4401 while (elements-- > 0) {
4402 if (l > lend) {
147e3846 4403 Perl_croak(aTHX_ "panic: Expecting %" UVuf " more elements than available", elements);
1f9f7d4c 4404 }
99944878 4405 while (isSPACE(*l)) l++;
22ff3130
HS
4406 if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
4407 Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
4408 }
99944878 4409 l = (U8 *) after_atou;
1f9f7d4c 4410 }
eb092534 4411 }
31aa6e0b
KW
4412 }
4413 else {
4414
1784d2f9
KW
4415 /* Scan the input to count the number of lines to preallocate array
4416 * size based on worst possible case, which is each line in the input
4417 * creates 2 elements in the inversion list: 1) the beginning of a
4418 * range in the list; 2) the beginning of a range not in the list. */
4419 while ((loc = (strchr(loc, '\n'))) != NULL) {
4420 elements += 2;
4421 loc++;
4422 }
d764b54e 4423
1784d2f9
KW
4424 /* If the ending is somehow corrupt and isn't a new line, add another
4425 * element for the final range that isn't in the inversion list */
4426 if (! (*lend == '\n'
4427 || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
4428 {
4429 elements++;
4430 }
d764b54e 4431
1784d2f9 4432 invlist = _new_invlist(elements);
d764b54e 4433
1784d2f9
KW
4434 /* Now go through the input again, adding each range to the list */
4435 while (l < lend) {
4436 UV start, end;
4437 UV val; /* Not used by this function */
d764b54e 4438
95543e92
KW
4439 l = swash_scan_list_line(l, lend, &start, &end, &val,
4440 cBOOL(octets), typestr);
d764b54e 4441
1784d2f9
KW
4442 if (l > lend) {
4443 break;
4444 }
4445
4446 invlist = _add_range_to_invlist(invlist, start, end);
4447 }
31aa6e0b 4448 }
d764b54e 4449
77f9f126
KW
4450 /* Invert if the data says it should be */
4451 if (invert_it_svp && SvUV(*invert_it_svp)) {
25151030 4452 _invlist_invert(invlist);
77f9f126
KW
4453 }
4454
b0e3252e 4455 /* This code is copied from swatch_get()
d73c39c5
KW
4456 * read $swash->{EXTRAS} */
4457 x = (U8*)SvPV(*extssvp, xcur);
4458 xend = x + xcur;
4459 while (x < xend) {
4460 STRLEN namelen;
4461 U8 *namestr;
4462 SV** othersvp;
4463 HV* otherhv;
4464 STRLEN otherbits;
4465 SV **otherbitssvp, *other;
4466 U8 *nl;
4467
4468 const U8 opc = *x++;
4469 if (opc == '\n')
4470 continue;
4471
4472 nl = (U8*)memchr(x, '\n', xend - x);
4473
4474 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
4475 if (nl) {
4476 x = nl + 1; /* 1 is length of "\n" */
4477 continue;
4478 }
4479 else {
4480 x = xend; /* to EXTRAS' end at which \n is not found */
4481 break;
4482 }
4483 }
4484
4485 namestr = x;
4486 if (nl) {
4487 namelen = nl - namestr;
4488 x = nl + 1;
4489 }
4490 else {
4491 namelen = xend - namestr;
4492 x = xend;
4493 }
4494
4495 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
4496 otherhv = MUTABLE_HV(SvRV(*othersvp));
4497 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
4498 otherbits = (STRLEN)SvUV(*otherbitssvp);
4499
4500 if (bits != otherbits || bits != 1) {
5637ef5b 4501 Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
147e3846 4502 "properties, bits=%" UVuf ", otherbits=%" UVuf,
5637ef5b 4503 (UV)bits, (UV)otherbits);
d73c39c5
KW
4504 }
4505
4506 /* The "other" swatch must be destroyed after. */
4507 other = _swash_to_invlist((SV *)*othersvp);
4508
b0e3252e 4509 /* End of code copied from swatch_get() */
d73c39c5
KW
4510 switch (opc) {
4511 case '+':
4512 _invlist_union(invlist, other, &invlist);
4513 break;
4514 case '!':
6c46377d 4515 _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
d73c39c5
KW
4516 break;
4517 case '-':
4518 _invlist_subtract(invlist, other, &invlist);
4519 break;
4520 case '&':
4521 _invlist_intersection(invlist, other, &invlist);
4522 break;
4523 default:
4524 break;
4525 }
4526 sv_free(other); /* through with it! */
4527 }
4528
dbfdbd26 4529 SvREADONLY_on(invlist);
d764b54e
KW
4530 return invlist;
4531}
4532
3fdfee00
KW
4533SV*
4534Perl__get_swash_invlist(pTHX_ SV* const swash)
4535{
872dd7e0 4536 SV** ptr;
3fdfee00
KW
4537
4538 PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
4539
87367d5f 4540 if (! SvROK(swash)) {
872dd7e0
KW
4541 return NULL;
4542 }
4543
87367d5f
KW
4544 /* If it really isn't a hash, it isn't really swash; must be an inversion
4545 * list */
4546 if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
4547 return SvRV(swash);
4548 }
872dd7e0 4549
87367d5f 4550 ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
3fdfee00
KW
4551 if (! ptr) {
4552 return NULL;
4553 }
4554
4555 return *ptr;
4556}
4557
0876b9a0 4558bool
5aaab254 4559Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
0876b9a0
KW
4560{
4561 /* May change: warns if surrogates, non-character code points, or
af2af982
KW
4562 * non-Unicode code points are in s which has length len bytes. Returns
4563 * TRUE if none found; FALSE otherwise. The only other validity check is
760c7c2f
KW
4564 * to make sure that this won't exceed the string's length.
4565 *
4566 * Code points above the platform's C<IV_MAX> will raise a deprecation
4567 * warning, unless those are turned off. */
0876b9a0
KW
4568
4569 const U8* const e = s + len;
4570 bool ok = TRUE;
4571
4572 PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
4573
4574 while (s < e) {
4575 if (UTF8SKIP(s) > len) {
4576 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
4577 "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
4578 return FALSE;
4579 }
ac6f1fbe 4580 if (UNLIKELY(isUTF8_POSSIBLY_PROBLEMATIC(*s))) {
f2bf18cc 4581 if (UNLIKELY(UTF8_IS_SUPER(s, e))) {
760c7c2f
KW
4582 if ( ckWARN_d(WARN_NON_UNICODE)
4583 || ( ckWARN_d(WARN_DEPRECATED)
83dc0f42
KW
4584#ifndef UV_IS_QUAD
4585 && UNLIKELY(is_utf8_cp_above_31_bits(s, e))
4586#else /* Below is 64-bit words */
760c7c2f
KW
4587 /* 2**63 and up meet these conditions provided we have
4588 * a 64-bit word. */
4589# ifdef EBCDIC
83dc0f42
KW
4590 && *s == 0xFE
4591 && NATIVE_UTF8_TO_I8(s[1]) >= 0xA8
760c7c2f 4592# else
83dc0f42
KW
4593 && *s == 0xFF
4594 /* s[1] being above 0x80 overflows */
760c7c2f
KW
4595 && s[2] >= 0x88
4596# endif
760c7c2f
KW
4597#endif
4598 )) {
15ca5930 4599 /* A side effect of this function will be to warn */
2db24202 4600 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_SUPER);
7ee537e6
KW
4601 ok = FALSE;
4602 }
0876b9a0 4603 }
f2bf18cc 4604 else if (UNLIKELY(UTF8_IS_SURROGATE(s, e))) {
8457b38f 4605 if (ckWARN_d(WARN_SURROGATE)) {
15ca5930
KW
4606 /* This has a different warning than the one the called
4607 * function would output, so can't just call it, unlike we
4608 * do for the non-chars and above-unicodes */
2db24202 4609 UV uv = utf8_to_uvchr_buf(s, e, NULL);
8457b38f 4610 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
147e3846 4611 "Unicode surrogate U+%04" UVXf " is illegal in UTF-8", uv);
8457b38f
KW
4612 ok = FALSE;
4613 }
0876b9a0 4614 }
f2bf18cc 4615 else if (UNLIKELY(UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
15ca5930 4616 /* A side effect of this function will be to warn */
2db24202 4617 (void) utf8n_to_uvchr(s, e - s, NULL, UTF8_WARN_NONCHAR);
0876b9a0
KW
4618 ok = FALSE;
4619 }
4620 }
4621 s += UTF8SKIP(s);
4622 }
4623
4624 return ok;
4625}
4626
0f830e0b 4627/*
87cea99e 4628=for apidoc pv_uni_display
d2cc3551 4629
a1433954
KW
4630Build to the scalar C<dsv> a displayable version of the string C<spv>,
4631length C<len>, the displayable version being at most C<pvlim> bytes long
796b6530 4632(if longer, the rest is truncated and C<"..."> will be appended).
0a2ef054 4633
796b6530
KW
4634The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
4635C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
4636to display the C<\\[nrfta\\]> as the backslashed versions (like C<"\n">)
4637(C<UNI_DISPLAY_BACKSLASH> is preferred over C<UNI_DISPLAY_ISPRINT> for C<"\\">).
4638C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
4639C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
0a2ef054 4640
a1433954 4641The pointer to the PV of the C<dsv> is returned.
d2cc3551 4642
119bc988
KW
4643See also L</sv_uni_display>.
4644
d2cc3551 4645=cut */
e6b2e755 4646char *
e1ec3a88 4647Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
4648{
4649 int truncated = 0;
e1ec3a88 4650 const char *s, *e;
e6b2e755 4651
7918f24d
NC
4652 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
4653
9e2aa2e7 4654 SvPVCLEAR(dsv);
7fddd944 4655 SvUTF8_off(dsv);
e1ec3a88 4656 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 4657 UV u;
a49f32c6
NC
4658 /* This serves double duty as a flag and a character to print after
4659 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
4660 */
4661 char ok = 0;
c728cb41 4662
e6b2e755
JH
4663 if (pvlim && SvCUR(dsv) >= pvlim) {
4664 truncated++;
4665 break;
4666 }
4b88fb76 4667 u = utf8_to_uvchr_buf((U8*)s, (U8*)e, 0);
c728cb41 4668 if (u < 256) {
a3b680e6 4669 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 4670 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 4671 switch (c) {
c728cb41 4672 case '\n':
a49f32c6 4673 ok = 'n'; break;
c728cb41 4674 case '\r':
a49f32c6 4675 ok = 'r'; break;
c728cb41 4676 case '\t':
a49f32c6 4677 ok = 't'; break;
c728cb41 4678 case '\f':
a49f32c6 4679 ok = 'f'; break;
c728cb41 4680 case '\a':
a49f32c6 4681 ok = 'a'; break;
c728cb41 4682 case '\\':
a49f32c6 4683 ok = '\\'; break;
c728cb41
JH
4684 default: break;
4685 }
a49f32c6 4686 if (ok) {
88c9ea1e 4687 const char string = ok;
76f68e9b 4688 sv_catpvs(dsv, "\\");
5e7aa789 4689 sv_catpvn(dsv, &string, 1);
a49f32c6 4690 }
c728cb41 4691 }
00e86452 4692 /* isPRINT() is the locale-blind version. */
a49f32c6 4693 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
88c9ea1e 4694 const char string = c;
5e7aa789 4695 sv_catpvn(dsv, &string, 1);
a49f32c6 4696 ok = 1;
0a2ef054 4697 }
c728cb41
JH
4698 }
4699 if (!ok)
147e3846 4700 Perl_sv_catpvf(aTHX_ dsv, "\\x{%" UVxf "}", u);
e6b2e755
JH
4701 }
4702 if (truncated)
396482e1 4703 sv_catpvs(dsv, "...");
48ef279e 4704
e6b2e755
JH
4705 return SvPVX(dsv);
4706}
2b9d42f0 4707
d2cc3551 4708/*
87cea99e 4709=for apidoc sv_uni_display
d2cc3551 4710
a1433954
KW
4711Build to the scalar C<dsv> a displayable version of the scalar C<sv>,
4712the displayable version being at most C<pvlim> bytes long
d2cc3551 4713(if longer, the rest is truncated and "..." will be appended).
0a2ef054 4714
a1433954 4715The C<flags> argument is as in L</pv_uni_display>().
0a2ef054 4716
a1433954 4717The pointer to the PV of the C<dsv> is returned.
d2cc3551 4718
d4c19fe8
AL
4719=cut
4720*/
e6b2e755
JH
4721char *
4722Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
4723{
8cdde9f8
NC
4724 const char * const ptr =
4725 isREGEXP(ssv) ? RX_WRAPPED((REGEXP*)ssv) : SvPVX_const(ssv);
4726
7918f24d
NC
4727 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
4728
8cdde9f8 4729 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)ptr,
cfd0369c 4730 SvCUR(ssv), pvlim, flags);
701a277b
JH
4731}
4732
d2cc3551 4733/*
e6226b18 4734=for apidoc foldEQ_utf8
d2cc3551 4735
a1433954 4736Returns true if the leading portions of the strings C<s1> and C<s2> (either or both
e6226b18 4737of which may be in UTF-8) are the same case-insensitively; false otherwise.
d51c1b21 4738How far into the strings to compare is determined by other input parameters.
8b35872c 4739
a1433954
KW
4740If C<u1> is true, the string C<s1> is assumed to be in UTF-8-encoded Unicode;
4741otherwise it is assumed to be in native 8-bit encoding. Correspondingly for C<u2>
4742with respect to C<s2>.
8b35872c 4743
a1433954
KW
4744If the byte length C<l1> is non-zero, it says how far into C<s1> to check for fold
4745equality. In other words, C<s1>+C<l1> will be used as a goal to reach. The
8b35872c 4746scan will not be considered to be a match unless the goal is reached, and
a1433954
KW
4747scanning won't continue past that goal. Correspondingly for C<l2> with respect to
4748C<s2>.
4749
796b6530 4750If C<pe1> is non-C<NULL> and the pointer it points to is not C<NULL>, that pointer is
03bb5c85
KW
4751considered an end pointer to the position 1 byte past the maximum point
4752in C<s1> beyond which scanning will not continue under any circumstances.
4753(This routine assumes that UTF-8 encoded input strings are not malformed;
4754malformed input can cause it to read past C<pe1>).
4755This means that if both C<l1> and C<pe1> are specified, and C<pe1>
a1433954
KW
4756is less than C<s1>+C<l1>, the match will never be successful because it can
4757never
d51c1b21 4758get as far as its goal (and in fact is asserted against). Correspondingly for
a1433954 4759C<pe2> with respect to C<s2>.
8b35872c 4760
a1433954
KW
4761At least one of C<s1> and C<s2> must have a goal (at least one of C<l1> and
4762C<l2> must be non-zero), and if both do, both have to be
8b35872c
KW
4763reached for a successful match. Also, if the fold of a character is multiple
4764characters, all of them must be matched (see tr21 reference below for
4765'folding').
4766
796b6530 4767Upon a successful match, if C<pe1> is non-C<NULL>,
a1433954
KW
4768it will be set to point to the beginning of the I<next> character of C<s1>
4769beyond what was matched. Correspondingly for C<pe2> and C<s2>.
d2cc3551
JH
4770
4771For case-insensitiveness, the "casefolding" of Unicode is used
4772instead of upper/lowercasing both the characters, see
a1433954 4773L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
d2cc3551
JH
4774
4775=cut */
a33c29bc
KW
4776
4777/* A flags parameter has been added which may change, and hence isn't
4778 * externally documented. Currently it is:
4779 * 0 for as-documented above
4780 * FOLDEQ_UTF8_NOMIX_ASCII meaning that if a non-ASCII character folds to an
4781 ASCII one, to not match
31f05a37
KW
4782 * FOLDEQ_LOCALE is set iff the rules from the current underlying
4783 * locale are to be used.
4784 * FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
aa8ebe62
KW
4785 * routine. This allows that step to be skipped.
4786 * Currently, this requires s1 to be encoded as UTF-8
4787 * (u1 must be true), which is asserted for.
d635b710
KW
4788 * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
4789 * cross certain boundaries. Hence, the caller should
4790 * let this function do the folding instead of
4791 * pre-folding. This code contains an assertion to
4792 * that effect. However, if the caller knows what
4793 * it's doing, it can pass this flag to indicate that,
4794 * and the assertion is skipped.
31f05a37 4795 * FOLDEQ_S2_ALREADY_FOLDED Similarly.
d635b710 4796 * FOLDEQ_S2_FOLDS_SANE
a33c29bc 4797 */
701a277b 4798I32
5aaab254 4799Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
332ddc25 4800{
eb578fdb
KW
4801 const U8 *p1 = (const U8*)s1; /* Point to current char */
4802 const U8 *p2 = (const U8*)s2;
4803 const U8 *g1 = NULL; /* goal for s1 */
4804 const U8 *g2 = NULL;
4805 const U8 *e1 = NULL; /* Don't scan s1 past this */
4806 U8 *f1 = NULL; /* Point to current folded */
4807 const U8 *e2 = NULL;
4808 U8 *f2 = NULL;
48ef279e 4809 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
8b35872c
KW
4810 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
4811 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
1d39b2cd 4812 U8 flags_for_folder = FOLD_FLAGS_FULL;
8b35872c 4813
eda9cac1 4814 PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
8b35872c 4815
cea315b6 4816 assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
d635b710
KW
4817 && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
4818 && !(flags & FOLDEQ_S1_FOLDS_SANE))
4819 || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
4820 && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
b08f1bd5
KW
4821 /* The algorithm is to trial the folds without regard to the flags on
4822 * the first line of the above assert(), and then see if the result
4823 * violates them. This means that the inputs can't be pre-folded to a
4824 * violating result, hence the assert. This could be changed, with the
4825 * addition of extra tests here for the already-folded case, which would
4826 * slow it down. That cost is more than any possible gain for when these
4827 * flags are specified, as the flags indicate /il or /iaa matching which
4828 * is less common than /iu, and I (khw) also believe that real-world /il
4829 * and /iaa matches are most likely to involve code points 0-255, and this
4830 * function only under rare conditions gets called for 0-255. */
18f762c3 4831
1d39b2cd
KW
4832 if (flags & FOLDEQ_LOCALE) {
4833 if (IN_UTF8_CTYPE_LOCALE) {
4834 flags &= ~FOLDEQ_LOCALE;
4835 }
4836 else {
4837 flags_for_folder |= FOLD_FLAGS_LOCALE;
4838 }
31f05a37
KW
4839 }
4840
8b35872c 4841 if (pe1) {
48ef279e 4842 e1 = *(U8**)pe1;
8b35872c
KW
4843 }
4844
4845 if (l1) {
48ef279e 4846 g1 = (const U8*)s1 + l1;
8b35872c
KW
4847 }
4848
4849 if (pe2) {
48ef279e 4850 e2 = *(U8**)pe2;
8b35872c
KW
4851 }
4852
4853 if (l2) {
48ef279e 4854 g2 = (const U8*)s2 + l2;
8b35872c
KW
4855 }
4856
4857 /* Must have at least one goal */
4858 assert(g1 || g2);
4859
4860 if (g1) {
4861
48ef279e
KW
4862 /* Will never match if goal is out-of-bounds */
4863 assert(! e1 || e1 >= g1);
8b35872c 4864
48ef279e
KW
4865 /* Here, there isn't an end pointer, or it is beyond the goal. We
4866 * only go as far as the goal */
4867 e1 = g1;
8b35872c 4868 }
313b38e5
NC
4869 else {
4870 assert(e1); /* Must have an end for looking at s1 */
4871 }
8b35872c
KW
4872
4873 /* Same for goal for s2 */
4874 if (g2) {
48ef279e
KW
4875 assert(! e2 || e2 >= g2);
4876 e2 = g2;
8b35872c 4877 }
313b38e5
NC
4878 else {
4879 assert(e2);
4880 }
8b35872c 4881
18f762c3
KW
4882 /* If both operands are already folded, we could just do a memEQ on the
4883 * whole strings at once, but it would be better if the caller realized
4884 * this and didn't even call us */
4885
8b35872c
KW
4886 /* Look through both strings, a character at a time */
4887 while (p1 < e1 && p2 < e2) {
4888
d51c1b21 4889 /* If at the beginning of a new character in s1, get its fold to use
1d39b2cd 4890 * and the length of the fold. */
48ef279e 4891 if (n1 == 0) {
18f762c3
KW
4892 if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
4893 f1 = (U8 *) p1;
aa8ebe62 4894 assert(u1);
18f762c3 4895 n1 = UTF8SKIP(f1);
18f762c3
KW
4896 }
4897 else {
1d39b2cd
KW
4898 if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
4899
4900 /* We have to forbid mixing ASCII with non-ASCII if the
4901 * flags so indicate. And, we can short circuit having to
4902 * call the general functions for this common ASCII case,
4903 * all of whose non-locale folds are also ASCII, and hence
4904 * UTF-8 invariants, so the UTF8ness of the strings is not
4905 * relevant. */
4906 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
4907 return 0;
4908 }
4909 n1 = 1;
4910 *foldbuf1 = toFOLD(*p1);
4911 }
4912 else if (u1) {
4913 _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
4914 }
4a4088c4 4915 else { /* Not UTF-8, get UTF-8 fold */
1d39b2cd
KW
4916 _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
4917 }
4918 f1 = foldbuf1;
4919 }
48ef279e 4920 }
8b35872c 4921
48ef279e 4922 if (n2 == 0) { /* Same for s2 */
18f762c3
KW
4923 if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
4924 f2 = (U8 *) p2;
aa8ebe62 4925 assert(u2);
18f762c3
KW
4926 n2 = UTF8SKIP(f2);
4927 }
4928 else {
1d39b2cd
KW
4929 if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
4930 if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
4931 return 0;
4932 }
4933 n2 = 1;
4934 *foldbuf2 = toFOLD(*p2);
4935 }
4936 else if (u2) {
4937 _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
4938 }
4939 else {
4940 _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
4941 }
4942 f2 = foldbuf2;
18f762c3 4943 }
48ef279e 4944 }
8b35872c 4945
5001101e 4946 /* Here f1 and f2 point to the beginning of the strings to compare.
227968da 4947 * These strings are the folds of the next character from each input
4a4088c4 4948 * string, stored in UTF-8. */
5e64d0fa 4949
48ef279e
KW
4950 /* While there is more to look for in both folds, see if they
4951 * continue to match */
4952 while (n1 && n2) {
4953 U8 fold_length = UTF8SKIP(f1);
4954 if (fold_length != UTF8SKIP(f2)
4955 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
4956 function call for single
a6d5f321 4957 byte */
48ef279e
KW
4958 || memNE((char*)f1, (char*)f2, fold_length))
4959 {
e6226b18 4960 return 0; /* mismatch */
48ef279e
KW
4961 }
4962
4963 /* Here, they matched, advance past them */
4964 n1 -= fold_length;
4965 f1 += fold_length;
4966 n2 -= fold_length;
4967 f2 += fold_length;
4968 }
8b35872c 4969
48ef279e
KW
4970 /* When reach the end of any fold, advance the input past it */
4971 if (n1 == 0) {
4972 p1 += u1 ? UTF8SKIP(p1) : 1;
4973 }
4974 if (n2 == 0) {
4975 p2 += u2 ? UTF8SKIP(p2) : 1;
4976 }
8b35872c
KW
4977 } /* End of loop through both strings */
4978
4979 /* A match is defined by each scan that specified an explicit length
4980 * reaching its final goal, and the other not having matched a partial
4981 * character (which can happen when the fold of a character is more than one
4982 * character). */
4983 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
e6226b18 4984 return 0;
8b35872c
KW
4985 }
4986
4987 /* Successful match. Set output pointers */
4988 if (pe1) {
48ef279e 4989 *pe1 = (char*)p1;
8b35872c
KW
4990 }
4991 if (pe2) {
48ef279e 4992 *pe2 = (char*)p2;
8b35872c 4993 }
e6226b18 4994 return 1;
e6b2e755 4995}
701a277b 4996
f2645549 4997/* XXX The next two functions should likely be moved to mathoms.c once all
37e7596b
KW
4998 * occurrences of them are removed from the core; some cpan-upstream modules
4999 * still use them */
5000
5001U8 *
5002Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
5003{
5004 PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
5005
5006 return Perl_uvoffuni_to_utf8_flags(aTHX_ d, uv, 0);
5007}
5008
e505af10
KW
5009/*
5010=for apidoc utf8n_to_uvuni
5011
5012Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
5013
5014This function was useful for code that wanted to handle both EBCDIC and
5015ASCII platforms with Unicode properties, but starting in Perl v5.20, the
5016distinctions between the platforms have mostly been made invisible to most
5017code, so this function is quite unlikely to be what you want. If you do need
5018this precise functionality, use instead
5019C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
5020or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
5021
5022=cut
5023*/
5024
37e7596b
KW
5025UV
5026Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
5027{
5028 PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
5029
5030 return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
5031}
5032
5033/*
5034=for apidoc uvuni_to_utf8_flags
5035
5036Instead you almost certainly want to use L</uvchr_to_utf8> or
efa9cd84 5037L</uvchr_to_utf8_flags>.
37e7596b
KW
5038
5039This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
5040which itself, while not deprecated, should be used only in isolated
5041circumstances. These functions were useful for code that wanted to handle
5042both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
5043v5.20, the distinctions between the platforms have mostly been made invisible
5044to most code, so this function is quite unlikely to be what you want.
5045
5046=cut
5047*/
5048
5049U8 *
5050Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
5051{
5052 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
5053
5054 return uvoffuni_to_utf8_flags(d, uv, flags);
5055}
5056
5057/*
14d04a33 5058 * ex: set ts=8 sts=4 sw=4 et:
37442d52 5059 */