This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Rename a couple of macros for clarity
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
1129b882 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
4ac71550
TC
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
cdad3b53 16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
a0ed51b3
LW
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
4ac71550 20 * as is the custom in the West, if you wish to be answered?'
cdad3b53 21 * --Gandalf, addressing Théoden's door wardens
4ac71550
TC
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
4ac71550
TC
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
a0ed51b3
LW
29 */
30
31#include "EXTERN.h"
864dbfa3 32#define PERL_IN_UTF8_C
a0ed51b3 33#include "perl.h"
b992490d 34#include "invlist_inline.h"
a0ed51b3 35
806547a7 36static const char malformed_text[] = "Malformed UTF-8 character";
27da23d5 37static const char unees[] =
806547a7 38 "Malformed UTF-8 character (unexpected end of string)";
760c7c2f 39static const char cp_above_legal_max[] =
2d212e86 40 "Use of code point 0x%"UVXf" is deprecated; the permissible max is 0x%"UVXf"";
760c7c2f 41
114d9c4d 42#define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
901b21bf 43
48ef279e 44/*
ccfc67b7 45=head1 Unicode Support
7fefc6c1 46These are various utility functions for manipulating UTF8-encoded
72d33970 47strings. For the uninitiated, this is a method of representing arbitrary
61296642 48Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
49characters in the ASCII range are unmodified, and a zero byte never appears
50within non-zero characters.
166f8a29 51
eaf7a4d2
CS
52=cut
53*/
54
55/*
378516de 56=for apidoc uvoffuni_to_utf8_flags
eebe1485 57
a27992cc 58THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af
KW
59Instead, B<Almost all code should use L</uvchr_to_utf8> or
60L</uvchr_to_utf8_flags>>.
a27992cc 61
de69f3af
KW
62This function is like them, but the input is a strict Unicode
63(as opposed to native) code point. Only in very rare circumstances should code
64not be using the native code point.
949cf498 65
efa9cd84 66For details, see the description for L</uvchr_to_utf8_flags>.
949cf498 67
eebe1485
SC
68=cut
69*/
70
8ee1cdcb
KW
71#define HANDLE_UNICODE_SURROGATE(uv, flags) \
72 STMT_START { \
73 if (flags & UNICODE_WARN_SURROGATE) { \
74 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
75 "UTF-16 surrogate U+%04"UVXf, uv); \
76 } \
77 if (flags & UNICODE_DISALLOW_SURROGATE) { \
78 return NULL; \
79 } \
80 } STMT_END;
81
82#define HANDLE_UNICODE_NONCHAR(uv, flags) \
83 STMT_START { \
84 if (flags & UNICODE_WARN_NONCHAR) { \
85 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
86 "Unicode non-character U+%04"UVXf" is not " \
87 "recommended for open interchange", uv); \
88 } \
89 if (flags & UNICODE_DISALLOW_NONCHAR) { \
90 return NULL; \
91 } \
92 } STMT_END;
93
ba6ed43c
KW
94/* Use shorter names internally in this file */
95#define SHIFT UTF_ACCUMULATION_SHIFT
96#undef MARK
97#define MARK UTF_CONTINUATION_MARK
98#define MASK UTF_CONTINUATION_MASK
99
dfe13c55 100U8 *
378516de 101Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 102{
378516de 103 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
7918f24d 104
2d1545e5 105 if (OFFUNI_IS_INVARIANT(uv)) {
4c8cd605 106 *d++ = LATIN1_TO_NATIVE(uv);
d9432125
KW
107 return d;
108 }
facc1dc2 109
3ea68d71 110 if (uv <= MAX_UTF8_TWO_BYTE) {
facc1dc2
KW
111 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
112 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
3ea68d71
KW
113 return d;
114 }
d9432125 115
ba6ed43c
KW
116 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
117 * below, the 16 is for start bytes E0-EF (which are all the possible ones
118 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
119 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
120 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
121 * 0x800-0xFFFF on ASCII */
122 if (uv < (16 * (1U << (2 * SHIFT)))) {
123 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
124 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
125 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
126
127#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
128 aren't tested here */
129 /* The most likely code points in this range are below the surrogates.
130 * Do an extra test to quickly exclude those. */
131 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
132 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
133 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
134 {
8ee1cdcb
KW
135 HANDLE_UNICODE_NONCHAR(uv, flags);
136 }
137 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
138 HANDLE_UNICODE_SURROGATE(uv, flags);
760c7c2f 139 }
ba6ed43c
KW
140 }
141#endif
142 return d;
143 }
144
145 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
146 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
147 * happen starting with 4-byte characters on ASCII platforms. We unify the
148 * code for these with EBCDIC, even though some of them require 5-bytes on
149 * those, because khw believes the code saving is worth the very slight
150 * performance hit on these high EBCDIC code points. */
151
152 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
a5bf80e0
KW
153 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
154 && ckWARN_d(WARN_DEPRECATED))
155 {
156 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
157 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
158 }
159 if ( (flags & UNICODE_WARN_SUPER)
160 || ( UNICODE_IS_ABOVE_31_BIT(uv)
161 && (flags & UNICODE_WARN_ABOVE_31_BIT)))
162 {
163 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
164
165 /* Choose the more dire applicable warning */
166 (UNICODE_IS_ABOVE_31_BIT(uv))
167 ? "Code point 0x%"UVXf" is not Unicode, and not portable"
168 : "Code point 0x%"UVXf" is not Unicode, may not be portable",
169 uv);
170 }
171 if (flags & UNICODE_DISALLOW_SUPER
172 || ( UNICODE_IS_ABOVE_31_BIT(uv)
173 && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
174 {
175 return NULL;
176 }
177 }
ba6ed43c
KW
178 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
179 HANDLE_UNICODE_NONCHAR(uv, flags);
507b9800 180 }
d9432125 181
ba6ed43c
KW
182 /* Test for and handle 4-byte result. In the test immediately below, the
183 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
184 * characters). The 3 is for 3 continuation bytes; these each contribute
185 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
186 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
187 * 0x1_0000-0x1F_FFFF on ASCII */
188 if (uv < (8 * (1U << (3 * SHIFT)))) {
189 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
190 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
191 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
192 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
193
194#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
195 characters. The end-plane non-characters for EBCDIC were
196 handled just above */
197 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
198 HANDLE_UNICODE_NONCHAR(uv, flags);
d528804a 199 }
ba6ed43c
KW
200 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
201 HANDLE_UNICODE_SURROGATE(uv, flags);
202 }
203#endif
204
205 return d;
206 }
207
208 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
209 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
210 * format. The unrolled version above turns out to not save all that much
211 * time, and at these high code points (well above the legal Unicode range
212 * on ASCII platforms, and well above anything in common use in EBCDIC),
213 * khw believes that less code outweighs slight performance gains. */
214
d9432125 215 {
5aaebcb3 216 STRLEN len = OFFUNISKIP(uv);
1d72bdf6
NIS
217 U8 *p = d+len-1;
218 while (p > d) {
4c8cd605 219 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
220 uv >>= UTF_ACCUMULATION_SHIFT;
221 }
4c8cd605 222 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
223 return d+len;
224 }
a0ed51b3 225}
a5bf80e0 226
646ca15d 227/*
07693fe6
KW
228=for apidoc uvchr_to_utf8
229
bcb1a2d4 230Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 231of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
232C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
233the byte after the end of the new character. In other words,
07693fe6
KW
234
235 d = uvchr_to_utf8(d, uv);
236
237is the recommended wide native character-aware way of saying
238
239 *(d++) = uv;
240
760c7c2f
KW
241This function accepts any UV as input, but very high code points (above
242C<IV_MAX> on the platform) will raise a deprecation warning. This is
243typically 0x7FFF_FFFF in a 32-bit word.
244
245It is possible to forbid or warn on non-Unicode code points, or those that may
246be problematic by using L</uvchr_to_utf8_flags>.
de69f3af 247
07693fe6
KW
248=cut
249*/
250
de69f3af
KW
251/* This is also a macro */
252PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
253
07693fe6
KW
254U8 *
255Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
256{
de69f3af 257 return uvchr_to_utf8(d, uv);
07693fe6
KW
258}
259
de69f3af
KW
260/*
261=for apidoc uvchr_to_utf8_flags
262
263Adds the UTF-8 representation of the native code point C<uv> to the end
f2fc1b45 264of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
c749c9fd
KW
265C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
266the byte after the end of the new character. In other words,
de69f3af
KW
267
268 d = uvchr_to_utf8_flags(d, uv, flags);
269
270or, in most cases,
271
272 d = uvchr_to_utf8_flags(d, uv, 0);
273
274This is the Unicode-aware way of saying
275
276 *(d++) = uv;
277
760c7c2f
KW
278If C<flags> is 0, this function accepts any UV as input, but very high code
279points (above C<IV_MAX> for the platform) will raise a deprecation warning.
280This is typically 0x7FFF_FFFF in a 32-bit word.
281
282Specifying C<flags> can further restrict what is allowed and not warned on, as
283follows:
de69f3af 284
796b6530 285If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
7ee537e6
KW
286the function will raise a warning, provided UTF8 warnings are enabled. If
287instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
288NULL. If both flags are set, the function will both warn and return NULL.
de69f3af 289
760c7c2f
KW
290Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
291affect how the function handles a Unicode non-character.
93e6dbd6 292
760c7c2f
KW
293And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
294affect the handling of code points that are above the Unicode maximum of
2950x10FFFF. Languages other than Perl may not be able to accept files that
296contain these.
93e6dbd6
KW
297
298The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
299the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
ecc1615f
KW
300three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
301allowed inputs to the strict UTF-8 traditionally defined by Unicode.
302Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
303C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
304above-Unicode and surrogate flags, but not the non-character ones, as
305defined in
306L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
307See L<perlunicode/Noncharacter code points>.
93e6dbd6 308
ab8e6d41
KW
309Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
310so using them is more problematic than other above-Unicode code points. Perl
311invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
312likely that non-Perl languages will not be able to read files that contain
313these that written by the perl interpreter; nor would Perl understand files
314written by something that uses a different extension. For these reasons, there
315is a separate set of flags that can warn and/or disallow these extremely high
316code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
317C<UNICODE_WARN_ABOVE_31_BIT> and C<UNICODE_DISALLOW_ABOVE_31_BIT> flags. These
318are entirely independent from the deprecation warning for code points above
319C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
320code point that needs more than 31 bits to represent. When that happens,
321effectively the C<UNICODE_DISALLOW_ABOVE_31_BIT> flag will always be set on
32232-bit machines. (Of course C<UNICODE_DISALLOW_SUPER> will treat all
ab8e6d41
KW
323above-Unicode code points, including these, as malformations; and
324C<UNICODE_WARN_SUPER> warns on these.)
325
326On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
327extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
328than on ASCII. Prior to that, code points 2**31 and higher were simply
329unrepresentable, and a different, incompatible method was used to represent
330code points between 2**30 and 2**31 - 1. The flags C<UNICODE_WARN_ABOVE_31_BIT>
331and C<UNICODE_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
332platforms, warning and disallowing 2**31 and higher.
de69f3af 333
de69f3af
KW
334=cut
335*/
336
337/* This is also a macro */
338PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
339
07693fe6
KW
340U8 *
341Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
342{
de69f3af 343 return uvchr_to_utf8_flags(d, uv, flags);
07693fe6
KW
344}
345
83dc0f42
KW
346PERL_STATIC_INLINE bool
347S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
348{
349 /* Returns TRUE if the first code point represented by the Perl-extended-
350 * UTF-8-encoded string starting at 's', and looking no further than 'e -
351 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
352 *
353 * The function handles the case where the input bytes do not include all
354 * the ones necessary to represent a full character. That is, they may be
355 * the intial bytes of the representation of a code point, but possibly
356 * the final ones necessary for the complete representation may be beyond
357 * 'e - 1'.
358 *
359 * The function assumes that the sequence is well-formed UTF-8 as far as it
360 * goes, and is for a UTF-8 variant code point. If the sequence is
361 * incomplete, the function returns FALSE if there is any well-formed
362 * UTF-8 byte sequence that can complete it in such a way that a code point
363 * < 2**31 is produced; otherwise it returns TRUE.
364 *
365 * Getting this exactly right is slightly tricky, and has to be done in
366 * several places in this file, so is centralized here. It is based on the
367 * following table:
368 *
369 * U+7FFFFFFF (2 ** 31 - 1)
370 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
371 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
372 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
373 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
374 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
375 * U+80000000 (2 ** 31):
376 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
377 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
378 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
379 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
380 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
381 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
382 */
383
384#ifdef EBCDIC
385
b343c774
KW
386 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
387 const U8 * const prefix = (U8 *) "\x41\x41\x41\x41\x41\x41\x42";
83dc0f42
KW
388 const STRLEN prefix_len = sizeof(prefix) - 1;
389 const STRLEN len = e - s;
f880f78a 390 const STRLEN cmp_len = MIN(prefix_len, len - 1);
83dc0f42
KW
391
392#else
393
394 PERL_UNUSED_ARG(e);
395
396#endif
397
398 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
399
400 assert(! UTF8_IS_INVARIANT(*s));
401
402#ifndef EBCDIC
403
404 /* Technically, a start byte of FE can be for a code point that fits into
405 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
406 * malformation. */
407 return (*s >= 0xFE);
408
409#else
410
411 /* On the EBCDIC code pages we handle, only 0xFE can mean a 32-bit or
412 * larger code point (0xFF is an invariant). For 0xFE, we need at least 2
413 * bytes, and maybe up through 8 bytes, to be sure if the value is above 31
414 * bits. */
415 if (*s != 0xFE || len == 1) {
416 return FALSE;
417 }
418
419 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
420 * \x41 and \x42. */
421 return cBOOL(memGT(s + 1, prefix, cmp_len));
422
423#endif
424
425}
426
35f8c9bd 427STRLEN
edc2c47a 428Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
35f8c9bd 429{
2b479609 430 STRLEN len;
35f8c9bd
KW
431 const U8 *x, *y;
432
2b479609
KW
433 /* A helper function that should not be called directly.
434 *
435 * This function returns non-zero if the string beginning at 's' and
436 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
437 * code point; otherwise it returns 0. The examination stops after the
438 * first code point in 's' is validated, not looking at the rest of the
439 * input. If 'e' is such that there are not enough bytes to represent a
440 * complete code point, this function will return non-zero anyway, if the
441 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
442 * excluded by 'flags'.
443 *
444 * A non-zero return gives the number of bytes required to represent the
445 * code point. Be aware that if the input is for a partial character, the
446 * return will be larger than 'e - s'.
447 *
448 * This function assumes that the code point represented is UTF-8 variant.
449 * The caller should have excluded this possibility before calling this
450 * function.
451 *
452 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
453 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
454 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
455 * disallowed by the flags. If the input is only for a partial character,
456 * the function will return non-zero if there is any sequence of
457 * well-formed UTF-8 that, when appended to the input sequence, could
458 * result in an allowed code point; otherwise it returns 0. Non characters
459 * cannot be determined based on partial character input. But many of the
460 * other excluded types can be determined with just the first one or two
461 * bytes.
462 *
463 */
464
465 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
466
467 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
468 |UTF8_DISALLOW_ABOVE_31_BIT)));
469 assert(! UTF8_IS_INVARIANT(*s));
35f8c9bd 470
2b479609 471 /* A variant char must begin with a start byte */
35f8c9bd
KW
472 if (UNLIKELY(! UTF8_IS_START(*s))) {
473 return 0;
474 }
475
edc2c47a
KW
476 /* Examine a maximum of a single whole code point */
477 if (e - s > UTF8SKIP(s)) {
478 e = s + UTF8SKIP(s);
479 }
480
2b479609
KW
481 len = e - s;
482
483 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
484 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
35f8c9bd 485
2b479609
KW
486 /* The code below is derived from this table. Keep in mind that legal
487 * continuation bytes range between \x80..\xBF for UTF-8, and
488 * \xA0..\xBF for I8. Anything above those aren't continuation bytes.
489 * Hence, we don't have to test the upper edge because if any of those
490 * are encountered, the sequence is malformed, and will fail elsewhere
491 * in this function.
492 * UTF-8 UTF-EBCDIC I8
493 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
494 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
495 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
496 *
497 */
498
499#ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
500# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
19794540 501# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
2b479609 502
19794540
KW
503# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
504 /* B6 and B7 */ \
505 && ((s1) & 0xFE ) == 0xB6)
2b479609
KW
506#else
507# define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
19794540
KW
508# define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
509# define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
2b479609
KW
510#endif
511
512 if ( (flags & UTF8_DISALLOW_SUPER)
513 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER)) {
514 return 0; /* Above Unicode */
515 }
516
517 if ( (flags & UTF8_DISALLOW_ABOVE_31_BIT)
518 && UNLIKELY(is_utf8_cp_above_31_bits(s, e)))
519 {
520 return 0; /* Above 31 bits */
521 }
522
523 if (len > 1) {
524 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
525
526 if ( (flags & UTF8_DISALLOW_SUPER)
19794540 527 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
2b479609
KW
528 {
529 return 0; /* Above Unicode */
530 }
531
532 if ( (flags & UTF8_DISALLOW_SURROGATE)
19794540 533 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
2b479609
KW
534 {
535 return 0; /* Surrogate */
536 }
537
538 if ( (flags & UTF8_DISALLOW_NONCHAR)
539 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
540 {
541 return 0; /* Noncharacter code point */
542 }
543 }
544 }
545
546 /* Make sure that all that follows are continuation bytes */
35f8c9bd
KW
547 for (x = s + 1; x < e; x++) {
548 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
549 return 0;
550 }
551 }
552
af13dd8a
KW
553 /* Here is syntactically valid. Next, make sure this isn't the start of an
554 * overlong. Overlongs can occur whenever the number of continuation bytes
555 * changes. That means whenever the number of leading 1 bits in a start
556 * byte increases from the next lower start byte. That happens for start
557 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
558 * illegal start bytes have already been excluded, so don't need to be
559 * tested here;
560 * ASCII platforms: C0, C1
561 * EBCDIC platforms C0, C1, C2, C3, C4, E0
562 *
563 * At least a second byte is required to determine if other sequences will
564 * be an overlong. */
35f8c9bd
KW
565
566 if (len > 1) {
af13dd8a
KW
567 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
568 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
569
570 /* Each platform has overlongs after the start bytes given above
571 * (expressed in I8 for EBCDIC). What constitutes an overlong varies
572 * by platform, but the logic is the same, except the E0 overlong has
573 * already been excluded on EBCDIC platforms. The values below were
574 * found by manually inspecting the UTF-8 patterns. See the tables in
575 * utf8.h and utfebcdic.h */
576
577# ifdef EBCDIC
578# define F0_ABOVE_OVERLONG 0xB0
579# define F8_ABOVE_OVERLONG 0xA8
580# define FC_ABOVE_OVERLONG 0xA4
581# define FE_ABOVE_OVERLONG 0xA2
582# define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
583 /* I8(0xfe) is FF */
584# else
585
586 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
587 return 0; /* Overlong */
35f8c9bd 588 }
35f8c9bd 589
af13dd8a
KW
590# define F0_ABOVE_OVERLONG 0x90
591# define F8_ABOVE_OVERLONG 0x88
592# define FC_ABOVE_OVERLONG 0x84
593# define FE_ABOVE_OVERLONG 0x82
594# define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
595# endif
35f8c9bd 596
af13dd8a
KW
597
598 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
599 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
600 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
601 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
602 {
603 return 0; /* Overlong */
35f8c9bd
KW
604 }
605
af13dd8a 606# if defined(UV_IS_QUAD) || defined(EBCDIC)
35f8c9bd 607
af13dd8a
KW
608 /* Check for the FF overlong. This happens only if all these bytes
609 * match; what comes after them doesn't matter. See tables in utf8.h,
610 * utfebcdic.h. (Can't happen on ASCII 32-bit platforms, as overflows
611 * instead.) */
612
613 if ( len >= sizeof(FF_OVERLONG_PREFIX) - 1
614 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
615 sizeof(FF_OVERLONG_PREFIX) - 1)))
616 {
617 return 0; /* Overlong */
35f8c9bd 618 }
35f8c9bd
KW
619
620#endif
621
af13dd8a
KW
622 }
623
3a498dae
KW
624 /* Finally, see if this would overflow a UV on this platform. See if the
625 * UTF8 for this code point is larger than that for the highest
626 * representable code point. (For ASCII platforms, we could use memcmp()
627 * because we don't have to convert each byte to I8, but it's very rare
628 * input indeed that would approach overflow, so the loop below will likely
629 * only get executed once */
35f8c9bd
KW
630 y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
631
632 for (x = s; x < e; x++, y++) {
633
3a498dae 634 /* If the same as this byte, go on to the next */
35f8c9bd
KW
635 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) == *y)) {
636 continue;
637 }
638
639 /* If this is larger, it overflows */
640 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
641 return 0;
642 }
643
644 /* But if smaller, it won't */
645 break;
646 }
647
2b479609 648 return UTF8SKIP(s);
35f8c9bd
KW
649}
650
2b479609
KW
651#undef F0_ABOVE_OVERLONG
652#undef F8_ABOVE_OVERLONG
653#undef FC_ABOVE_OVERLONG
654#undef FE_ABOVE_OVERLONG
655#undef FF_OVERLONG_PREFIX
656
7cf8d05d
KW
657STATIC char *
658S__byte_dump_string(pTHX_ const U8 * s, const STRLEN len)
659{
660 /* Returns a mortalized C string that is a displayable copy of the 'len'
661 * bytes starting at 's', each in a \xXY format. */
662
663 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
664 trailing NUL */
665 const U8 * const e = s + len;
666 char * output;
667 char * d;
668
669 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
670
671 Newx(output, output_len, char);
672 SAVEFREEPV(output);
673
674 d = output;
675 for (; s < e; s++) {
676 const unsigned high_nibble = (*s & 0xF0) >> 4;
677 const unsigned low_nibble = (*s & 0x0F);
678
679 *d++ = '\\';
680 *d++ = 'x';
681
682 if (high_nibble < 10) {
683 *d++ = high_nibble + '0';
684 }
685 else {
686 *d++ = high_nibble - 10 + 'a';
687 }
688
689 if (low_nibble < 10) {
690 *d++ = low_nibble + '0';
691 }
692 else {
693 *d++ = low_nibble - 10 + 'a';
694 }
695 }
696
697 *d = '\0';
698 return output;
699}
700
806547a7 701PERL_STATIC_INLINE char *
7cf8d05d
KW
702S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
703
704 /* How many bytes to print */
705 const STRLEN print_len,
706
707 /* Which one is the non-continuation */
708 const STRLEN non_cont_byte_pos,
709
710 /* How many bytes should there be? */
711 const STRLEN expect_len)
806547a7
KW
712{
713 /* Return the malformation warning text for an unexpected continuation
714 * byte. */
715
7cf8d05d 716 const char * const where = (non_cont_byte_pos == 1)
806547a7 717 ? "immediately"
7cf8d05d
KW
718 : Perl_form(aTHX_ "%d bytes",
719 (int) non_cont_byte_pos);
806547a7
KW
720
721 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
722
7cf8d05d
KW
723 /* We don't need to pass this parameter, but since it has already been
724 * calculated, it's likely faster to pass it; verify under DEBUGGING */
725 assert(expect_len == UTF8SKIP(s));
726
727 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
728 " %s after start byte 0x%02x; need %d bytes, got %d)",
729 malformed_text,
730 _byte_dump_string(s, print_len),
731 *(s + non_cont_byte_pos),
732 where,
733 *s,
734 (int) expect_len,
735 (int) non_cont_byte_pos);
806547a7
KW
736}
737
35f8c9bd
KW
738/*
739
de69f3af 740=for apidoc utf8n_to_uvchr
378516de
KW
741
742THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
de69f3af 743Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
67e989fb 744
9041c2e3 745Bottom level UTF-8 decode routine.
de69f3af 746Returns the native code point value of the first character in the string C<s>,
746afd53
KW
747which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
748C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
749the length, in bytes, of that character.
949cf498
KW
750
751The value of C<flags> determines the behavior when C<s> does not point to a
752well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
524080c4
KW
753zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
754next possible position in C<s> that could begin a non-malformed character.
755Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
949cf498
KW
756
757Various ALLOW flags can be set in C<flags> to allow (and not warn on)
758individual types of malformations, such as the sequence being overlong (that
759is, when there is a shorter sequence that can express the same code point;
760overlong sequences are expressly forbidden in the UTF-8 standard due to
761potential security issues). Another malformation example is the first byte of
762a character not being a legal first byte. See F<utf8.h> for the list of such
524080c4
KW
763flags. For allowed 0 length strings, this function returns 0; for allowed
764overlong sequences, the computed code point is returned; for all other allowed
765malformations, the Unicode REPLACEMENT CHARACTER is returned, as these have no
766determinable reasonable value.
949cf498 767
796b6530 768The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
949cf498
KW
769flags) malformation is found. If this flag is set, the routine assumes that
770the caller will raise a warning, and this function will silently just set
d088425d
KW
771C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
772
75200dff 773Note that this API requires disambiguation between successful decoding a C<NUL>
796b6530 774character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
111fa700
KW
775in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
776be set to 1. To disambiguate, upon a zero return, see if the first byte of
777C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
778error.
949cf498
KW
779
780Certain code points are considered problematic. These are Unicode surrogates,
746afd53 781Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
949cf498 782By default these are considered regular code points, but certain situations
ecc1615f
KW
783warrant special handling for them, which can be specified using the C<flags>
784parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
785three classes are treated as malformations and handled as such. The flags
786C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
787C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
788disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
789restricts the allowed inputs to the strict UTF-8 traditionally defined by
790Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
791definition given by
792L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
793The difference between traditional strictness and C9 strictness is that the
794latter does not forbid non-character code points. (They are still discouraged,
795however.) For more discussion see L<perlunicode/Noncharacter code points>.
796
797The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
798C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
796b6530
KW
799C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
800raised for their respective categories, but otherwise the code points are
801considered valid (not malformations). To get a category to both be treated as
802a malformation and raise a warning, specify both the WARN and DISALLOW flags.
949cf498 803(But note that warnings are not raised if lexically disabled nor if
796b6530 804C<UTF8_CHECK_ONLY> is also specified.)
949cf498 805
760c7c2f
KW
806It is now deprecated to have very high code points (above C<IV_MAX> on the
807platforms) and this function will raise a deprecation warning for these (unless
808such warnings are turned off). This value, is typically 0x7FFF_FFFF (2**31 -1)
809in a 32-bit word.
ab8e6d41
KW
810
811Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
812so using them is more problematic than other above-Unicode code points. Perl
813invented an extension to UTF-8 to represent the ones above 2**36-1, so it is
814likely that non-Perl languages will not be able to read files that contain
815these that written by the perl interpreter; nor would Perl understand files
816written by something that uses a different extension. For these reasons, there
817is a separate set of flags that can warn and/or disallow these extremely high
818code points, even if other above-Unicode ones are accepted. These are the
760c7c2f
KW
819C<UTF8_WARN_ABOVE_31_BIT> and C<UTF8_DISALLOW_ABOVE_31_BIT> flags. These
820are entirely independent from the deprecation warning for code points above
821C<IV_MAX>. On 32-bit machines, it will eventually be forbidden to have any
822code point that needs more than 31 bits to represent. When that happens,
823effectively the C<UTF8_DISALLOW_ABOVE_31_BIT> flag will always be set on
82432-bit machines. (Of course C<UTF8_DISALLOW_SUPER> will treat all
ab8e6d41
KW
825above-Unicode code points, including these, as malformations; and
826C<UTF8_WARN_SUPER> warns on these.)
827
828On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
829extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
830than on ASCII. Prior to that, code points 2**31 and higher were simply
831unrepresentable, and a different, incompatible method was used to represent
832code points between 2**30 and 2**31 - 1. The flags C<UTF8_WARN_ABOVE_31_BIT>
833and C<UTF8_DISALLOW_ABOVE_31_BIT> have the same function as on ASCII
834platforms, warning and disallowing 2**31 and higher.
949cf498
KW
835
836All other code points corresponding to Unicode characters, including private
837use and those yet to be assigned, are never considered malformed and never
838warn.
67e989fb 839
37607a96
PK
840=cut
841*/
67e989fb 842
a0ed51b3 843UV
de69f3af 844Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 845{
d4c19fe8 846 const U8 * const s0 = s;
0b8d30e8 847 U8 * send;
eb83ed87
KW
848 UV uv = *s;
849 STRLEN expectlen;
949cf498 850 SV* sv = NULL;
eb83ed87
KW
851 UV outlier_ret = 0; /* return value when input is in error or problematic
852 */
853 UV pack_warn = 0; /* Save result of packWARN() for later */
854 bool unexpected_non_continuation = FALSE;
855 bool overflowed = FALSE;
2f8f112e 856 bool do_overlong_test = TRUE; /* May have to skip this test */
a0dbb045 857
7918f24d 858
de69f3af 859 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
a0dbb045 860
eb83ed87
KW
861 /* The order of malformation tests here is important. We should consume as
862 * few bytes as possible in order to not skip any valid character. This is
863 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
864 * http://unicode.org/reports/tr36 for more discussion as to why. For
865 * example, once we've done a UTF8SKIP, we can tell the expected number of
866 * bytes, and could fail right off the bat if the input parameters indicate
867 * that there are too few available. But it could be that just that first
868 * byte is garbled, and the intended character occupies fewer bytes. If we
869 * blindly assumed that the first byte is correct, and skipped based on
870 * that number, we could skip over a valid input character. So instead, we
871 * always examine the sequence byte-by-byte.
872 *
873 * We also should not consume too few bytes, otherwise someone could inject
874 * things. For example, an input could be deliberately designed to
875 * overflow, and if this code bailed out immediately upon discovering that,
e2660c54 876 * returning to the caller C<*retlen> pointing to the very next byte (one
eb83ed87
KW
877 * which is actually part of of the overflowing sequence), that could look
878 * legitimate to the caller, which could discard the initial partial
879 * sequence and process the rest, inappropriately */
880
881 /* Zero length strings, if allowed, of necessity are zero */
b5b9af04 882 if (UNLIKELY(curlen == 0)) {
eb83ed87
KW
883 if (retlen) {
884 *retlen = 0;
885 }
a0dbb045 886
eb83ed87
KW
887 if (flags & UTF8_ALLOW_EMPTY) {
888 return 0;
889 }
890 if (! (flags & UTF8_CHECK_ONLY)) {
7cf8d05d
KW
891 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
892 malformed_text));
eb83ed87 893 }
0c443dc2
JH
894 goto malformed;
895 }
896
eb83ed87
KW
897 expectlen = UTF8SKIP(s);
898
899 /* A well-formed UTF-8 character, as the vast majority of calls to this
900 * function will be for, has this expected length. For efficiency, set
901 * things up here to return it. It will be overriden only in those rare
902 * cases where a malformation is found */
903 if (retlen) {
904 *retlen = expectlen;
905 }
906
907 /* An invariant is trivially well-formed */
1d72bdf6 908 if (UTF8_IS_INVARIANT(uv)) {
de69f3af 909 return uv;
a0ed51b3 910 }
67e989fb 911
eb83ed87 912 /* A continuation character can't start a valid sequence */
b5b9af04 913 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
eb83ed87
KW
914 if (flags & UTF8_ALLOW_CONTINUATION) {
915 if (retlen) {
916 *retlen = 1;
917 }
918 return UNICODE_REPLACEMENT;
919 }
ba210ebe 920
eb83ed87 921 if (! (flags & UTF8_CHECK_ONLY)) {
7cf8d05d
KW
922 sv = sv_2mortal(Perl_newSVpvf(aTHX_
923 "%s: %s (unexpected continuation byte 0x%02x,"
924 " with no preceding start byte)",
925 malformed_text,
926 _byte_dump_string(s0, 1), *s0));
eb83ed87
KW
927 }
928 curlen = 1;
ba210ebe
JH
929 goto malformed;
930 }
9041c2e3 931
dcd27b3c
KW
932 /* Here is not a continuation byte, nor an invariant. The only thing left
933 * is a start byte (possibly for an overlong) */
934
534752c1
KW
935 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
936 * that indicate the number of bytes in the character's whole UTF-8
937 * sequence, leaving just the bits that are part of the value. */
938 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
ba210ebe 939
eb83ed87
KW
940 /* Now, loop through the remaining bytes in the character's sequence,
941 * accumulating each into the working value as we go. Be sure to not look
942 * past the end of the input string */
0b8d30e8
KW
943 send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
944
eb83ed87 945 for (s = s0 + 1; s < send; s++) {
b5b9af04 946 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
eb83ed87
KW
947 if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
948
949 /* The original implementors viewed this malformation as more
950 * serious than the others (though I, khw, don't understand
951 * why, since other malformations also give very very wrong
952 * results), so there is no way to turn off checking for it.
953 * Set a flag, but keep going in the loop, so that we absorb
954 * the rest of the bytes that comprise the character. */
955 overflowed = TRUE;
eb83ed87 956 }
8850bf83 957 uv = UTF8_ACCUMULATE(uv, *s);
eb83ed87
KW
958 }
959 else {
960 /* Here, found a non-continuation before processing all expected
961 * bytes. This byte begins a new character, so quit, even if
962 * allowing this malformation. */
963 unexpected_non_continuation = TRUE;
964 break;
965 }
966 } /* End of loop through the character's bytes */
967
968 /* Save how many bytes were actually in the character */
969 curlen = s - s0;
970
971 /* The loop above finds two types of malformations: non-continuation and/or
972 * overflow. The non-continuation malformation is really a too-short
973 * malformation, as it means that the current character ended before it was
974 * expected to (being terminated prematurely by the beginning of the next
975 * character, whereas in the too-short malformation there just are too few
976 * bytes available to hold the character. In both cases, the check below
977 * that we have found the expected number of bytes would fail if executed.)
978 * Thus the non-continuation malformation is really unnecessary, being a
979 * subset of the too-short malformation. But there may be existing
980 * applications that are expecting the non-continuation type, so we retain
981 * it, and return it in preference to the too-short malformation. (If this
982 * code were being written from scratch, the two types might be collapsed
983 * into one.) I, khw, am also giving priority to returning the
984 * non-continuation and too-short malformations over overflow when multiple
985 * ones are present. I don't know of any real reason to prefer one over
986 * the other, except that it seems to me that multiple-byte errors trumps
987 * errors from a single byte */
b5b9af04 988 if (UNLIKELY(unexpected_non_continuation)) {
eb83ed87
KW
989 if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
990 if (! (flags & UTF8_CHECK_ONLY)) {
806547a7 991 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
7cf8d05d
KW
992 unexpected_non_continuation_text(s0,
993 send - s0,
994 s - s0,
995 (int) expectlen)));
a0dbb045 996 }
eb83ed87
KW
997 goto malformed;
998 }
999 uv = UNICODE_REPLACEMENT;
2f8f112e
KW
1000
1001 /* Skip testing for overlongs, as the REPLACEMENT may not be the same
1002 * as what the original expectations were. */
1003 do_overlong_test = FALSE;
eb83ed87
KW
1004 if (retlen) {
1005 *retlen = curlen;
1006 }
1007 }
b5b9af04 1008 else if (UNLIKELY(curlen < expectlen)) {
eb83ed87
KW
1009 if (! (flags & UTF8_ALLOW_SHORT)) {
1010 if (! (flags & UTF8_CHECK_ONLY)) {
7cf8d05d
KW
1011 sv = sv_2mortal(Perl_newSVpvf(aTHX_
1012 "%s: %s (too short; got %d byte%s, need %d)",
1013 malformed_text,
1014 _byte_dump_string(s0, send - s0),
1015 (int)curlen,
1016 curlen == 1 ? "" : "s",
1017 (int)expectlen));
a0dbb045 1018 }
eb83ed87
KW
1019 goto malformed;
1020 }
1021 uv = UNICODE_REPLACEMENT;
2f8f112e 1022 do_overlong_test = FALSE;
eb83ed87
KW
1023 if (retlen) {
1024 *retlen = curlen;
1025 }
1026 }
1027
b5b9af04 1028 if (UNLIKELY(overflowed)) {
7cf8d05d
KW
1029 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
1030 malformed_text,
1031 _byte_dump_string(s0, send - s0)));
ba210ebe 1032 goto malformed;
eb83ed87 1033 }
eb83ed87 1034
2f8f112e 1035 if (do_overlong_test
5aaebcb3 1036 && expectlen > (STRLEN) OFFUNISKIP(uv)
2f8f112e
KW
1037 && ! (flags & UTF8_ALLOW_LONG))
1038 {
eb83ed87
KW
1039 /* The overlong malformation has lower precedence than the others.
1040 * Note that if this malformation is allowed, we return the actual
1041 * value, instead of the replacement character. This is because this
1042 * value is actually well-defined. */
1043 if (! (flags & UTF8_CHECK_ONLY)) {
7cf8d05d
KW
1044 U8 tmpbuf[UTF8_MAXBYTES+1];
1045 const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
1046 sv = sv_2mortal(Perl_newSVpvf(aTHX_
1047 "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
1048 malformed_text,
1049 _byte_dump_string(s0, send - s0),
1050 _byte_dump_string(tmpbuf, e - tmpbuf),
1051 ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
1052 points */
1053 uv));
eb83ed87
KW
1054 }
1055 goto malformed;
1056 }
1057
1a89bb6c 1058 /* Here, the input is considered to be well-formed, but it still could be a
eb83ed87
KW
1059 * problematic code point that is not allowed by the input parameters. */
1060 if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
760c7c2f
KW
1061 && ((flags & ( UTF8_DISALLOW_NONCHAR
1062 |UTF8_DISALLOW_SURROGATE
1063 |UTF8_DISALLOW_SUPER
1064 |UTF8_DISALLOW_ABOVE_31_BIT
1065 |UTF8_WARN_NONCHAR
1066 |UTF8_WARN_SURROGATE
1067 |UTF8_WARN_SUPER
1068 |UTF8_WARN_ABOVE_31_BIT))
1069 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1070 && ckWARN_d(WARN_DEPRECATED))))
eb83ed87 1071 {
889c8856 1072 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
ea5ced44
KW
1073
1074 /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
1075 * generation of the sv, since no warnings are raised under CHECK */
eb83ed87 1076 if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
54f4afef 1077 && ckWARN_d(WARN_SURROGATE))
eb83ed87 1078 {
111d382d 1079 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
54f4afef 1080 pack_warn = packWARN(WARN_SURROGATE);
949cf498
KW
1081 }
1082 if (flags & UTF8_DISALLOW_SURROGATE) {
1083 goto disallowed;
1084 }
1085 }
889c8856 1086 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
eb83ed87 1087 if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
ea5ced44 1088 && ckWARN_d(WARN_NON_UNICODE))
eb83ed87 1089 {
0bcdd8f6
KW
1090 sv = sv_2mortal(Perl_newSVpvf(aTHX_
1091 "Code point 0x%04"UVXf" is not Unicode, may not be portable",
1092 uv));
54f4afef 1093 pack_warn = packWARN(WARN_NON_UNICODE);
949cf498 1094 }
c0236afe
KW
1095
1096 /* The maximum code point ever specified by a standard was
1097 * 2**31 - 1. Anything larger than that is a Perl extension that
1098 * very well may not be understood by other applications (including
83dc0f42
KW
1099 * earlier perl versions on EBCDIC platforms). We test for these
1100 * after the regular SUPER ones, and before possibly bailing out,
1101 * so that the slightly more dire warning will override the regular
1102 * one. */
1103 if ( (flags & (UTF8_WARN_ABOVE_31_BIT
1104 |UTF8_WARN_SUPER
1105 |UTF8_DISALLOW_ABOVE_31_BIT))
1106 && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
ea5ced44 1107 {
0bcdd8f6 1108 if ( ! (flags & UTF8_CHECK_ONLY)
d35f2ca5 1109 && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
0bcdd8f6 1110 && ckWARN_d(WARN_UTF8))
ea5ced44 1111 {
0bcdd8f6
KW
1112 sv = sv_2mortal(Perl_newSVpvf(aTHX_
1113 "Code point 0x%"UVXf" is not Unicode, and not portable",
1114 uv));
ea5ced44
KW
1115 pack_warn = packWARN(WARN_UTF8);
1116 }
d35f2ca5 1117 if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
ea5ced44
KW
1118 goto disallowed;
1119 }
1120 }
c0236afe 1121
949cf498
KW
1122 if (flags & UTF8_DISALLOW_SUPER) {
1123 goto disallowed;
1124 }
760c7c2f
KW
1125
1126 /* The deprecated warning overrides any non-deprecated one */
1127 if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
1128 {
1129 sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
1130 uv, MAX_NON_DEPRECATED_CP));
1131 pack_warn = packWARN(WARN_DEPRECATED);
1132 }
949cf498 1133 }
889c8856 1134 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
4190d317 1135 if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
54f4afef 1136 && ckWARN_d(WARN_NONCHAR))
4190d317 1137 {
ba707cdc 1138 sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
54f4afef 1139 pack_warn = packWARN(WARN_NONCHAR);
4190d317
KW
1140 }
1141 if (flags & UTF8_DISALLOW_NONCHAR) {
1142 goto disallowed;
1143 }
1144 }
949cf498 1145
eb83ed87 1146 if (sv) {
7ae54b83 1147 outlier_ret = UNI_TO_NATIVE(uv);
eb83ed87
KW
1148 goto do_warn;
1149 }
1150
949cf498
KW
1151 /* Here, this is not considered a malformed character, so drop through
1152 * to return it */
a0ed51b3 1153 }
ba210ebe 1154
de69f3af 1155 return UNI_TO_NATIVE(uv);
ba210ebe 1156
eb83ed87
KW
1157 /* There are three cases which get to beyond this point. In all 3 cases:
1158 * <sv> if not null points to a string to print as a warning.
1159 * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
1160 * set.
1161 * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
1162 * This is done by initializing it to 0, and changing it only
1163 * for case 1).
1164 * The 3 cases are:
1165 * 1) The input is valid but problematic, and to be warned about. The
1166 * return value is the resultant code point; <*retlen> is set to
1167 * <curlen>, the number of bytes that comprise the code point.
1168 * <pack_warn> contains the result of packWARN() for the warning
1169 * types. The entry point for this case is the label <do_warn>;
1170 * 2) The input is a valid code point but disallowed by the parameters to
1171 * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
1172 * <*relen> is -1; otherwise it is <curlen>, the number of bytes that
1173 * comprise the code point. <pack_warn> contains the result of
1174 * packWARN() for the warning types. The entry point for this case is
1175 * the label <disallowed>.
1176 * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
1177 * is set, <*relen> is -1; otherwise it is <curlen>, the number of
1178 * bytes that comprise the malformation. All such malformations are
1179 * assumed to be warning type <utf8>. The entry point for this case
1180 * is the label <malformed>.
1181 */
949cf498 1182
7b52d656 1183 malformed:
ba210ebe 1184
eb83ed87
KW
1185 if (sv && ckWARN_d(WARN_UTF8)) {
1186 pack_warn = packWARN(WARN_UTF8);
1187 }
1188
7b52d656 1189 disallowed:
eb83ed87 1190
fcc8fcf6 1191 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 1192 if (retlen)
10edeb5d 1193 *retlen = ((STRLEN) -1);
ba210ebe
JH
1194 return 0;
1195 }
1196
7b52d656 1197 do_warn:
5b311467 1198
eb83ed87
KW
1199 if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
1200 if warnings are to be raised. */
f555bc63 1201 const char * const string = SvPVX_const(sv);
a0dbb045 1202
f555bc63
KW
1203 if (PL_op)
1204 Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
1205 else
1206 Perl_warner(aTHX_ pack_warn, "%s", string);
a0dbb045
JH
1207 }
1208
eb83ed87
KW
1209 if (retlen) {
1210 *retlen = curlen;
1211 }
ba210ebe 1212
eb83ed87 1213 return outlier_ret;
a0ed51b3
LW
1214}
1215
8e84507e 1216/*
ec5f19d0
KW
1217=for apidoc utf8_to_uvchr_buf
1218
1219Returns the native code point of the first character in the string C<s> which
1220is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
524080c4 1221C<*retlen> will be set to the length, in bytes, of that character.
ec5f19d0 1222
524080c4
KW
1223If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1224enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
796b6530 1225C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
173db420 1226(or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
796b6530 1227C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
173db420 1228the next possible position in C<s> that could begin a non-malformed character.
de69f3af 1229See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
173db420 1230returned.
ec5f19d0 1231
760c7c2f
KW
1232Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1233unless those are turned off.
1234
ec5f19d0 1235=cut
52be2536
KW
1236
1237Also implemented as a macro in utf8.h
1238
ec5f19d0
KW
1239*/
1240
1241
1242UV
1243Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1244{
ec5f19d0
KW
1245 assert(s < send);
1246
1247 return utf8n_to_uvchr(s, send - s, retlen,
1248 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1249}
1250
52be2536
KW
1251/* This is marked as deprecated
1252 *
ec5f19d0
KW
1253=for apidoc utf8_to_uvuni_buf
1254
de69f3af
KW
1255Only in very rare circumstances should code need to be dealing in Unicode
1256(as opposed to native) code points. In those few cases, use
1257C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
4f83cdcd
KW
1258
1259Returns the Unicode (not-native) code point of the first character in the
1260string C<s> which
ec5f19d0
KW
1261is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1262C<retlen> will be set to the length, in bytes, of that character.
1263
524080c4
KW
1264If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1265enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1266NULL) to -1. If those warnings are off, the computed value if well-defined (or
1267the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1268is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1269next possible position in C<s> that could begin a non-malformed character.
de69f3af 1270See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
ec5f19d0 1271
760c7c2f
KW
1272Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1273unless those are turned off.
1274
ec5f19d0
KW
1275=cut
1276*/
1277
1278UV
1279Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1280{
1281 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1282
1283 assert(send > s);
1284
5962d97e
KW
1285 /* Call the low level routine, asking for checks */
1286 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
ec5f19d0
KW
1287}
1288
b76347f2 1289/*
87cea99e 1290=for apidoc utf8_length
b76347f2
JH
1291
1292Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
1293Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1294up past C<e>, croaks.
b76347f2
JH
1295
1296=cut
1297*/
1298
1299STRLEN
35a4481c 1300Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
1301{
1302 STRLEN len = 0;
1303
7918f24d
NC
1304 PERL_ARGS_ASSERT_UTF8_LENGTH;
1305
8850bf83
JH
1306 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1307 * the bitops (especially ~) can create illegal UTF-8.
1308 * In other words: in Perl UTF-8 is not just for Unicode. */
1309
a3b680e6
AL
1310 if (e < s)
1311 goto warn_and_return;
b76347f2 1312 while (s < e) {
4cbf4130 1313 s += UTF8SKIP(s);
8e91ec7f
AV
1314 len++;
1315 }
1316
1317 if (e != s) {
1318 len--;
1319 warn_and_return:
9b387841
NC
1320 if (PL_op)
1321 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1322 "%s in %s", unees, OP_DESC(PL_op));
1323 else
61a12c31 1324 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
b76347f2
JH
1325 }
1326
1327 return len;
1328}
1329
b06226ff 1330/*
fed3ba5d
NC
1331=for apidoc bytes_cmp_utf8
1332
a1433954 1333Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
72d33970
FC
1334sequence of characters (stored as UTF-8)
1335in C<u>, C<ulen>. Returns 0 if they are
fed3ba5d
NC
1336equal, -1 or -2 if the first string is less than the second string, +1 or +2
1337if the first string is greater than the second string.
1338
1339-1 or +1 is returned if the shorter string was identical to the start of the
72d33970
FC
1340longer string. -2 or +2 is returned if
1341there was a difference between characters
fed3ba5d
NC
1342within the strings.
1343
1344=cut
1345*/
1346
1347int
1348Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1349{
1350 const U8 *const bend = b + blen;
1351 const U8 *const uend = u + ulen;
1352
1353 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
fed3ba5d
NC
1354
1355 while (b < bend && u < uend) {
1356 U8 c = *u++;
1357 if (!UTF8_IS_INVARIANT(c)) {
1358 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1359 if (u < uend) {
1360 U8 c1 = *u++;
1361 if (UTF8_IS_CONTINUATION(c1)) {
a62b247b 1362 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
fed3ba5d 1363 } else {
806547a7 1364 /* diag_listed_as: Malformed UTF-8 character (%s) */
fed3ba5d 1365 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
806547a7 1366 "%s %s%s",
7cf8d05d 1367 unexpected_non_continuation_text(u - 1, 2, 1, 2),
806547a7
KW
1368 PL_op ? " in " : "",
1369 PL_op ? OP_DESC(PL_op) : "");
fed3ba5d
NC
1370 return -2;
1371 }
1372 } else {
1373 if (PL_op)
1374 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1375 "%s in %s", unees, OP_DESC(PL_op));
1376 else
61a12c31 1377 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
fed3ba5d
NC
1378 return -2; /* Really want to return undef :-) */
1379 }
1380 } else {
1381 return -2;
1382 }
1383 }
1384 if (*b != c) {
1385 return *b < c ? -2 : +2;
1386 }
1387 ++b;
1388 }
1389
1390 if (b == bend && u == uend)
1391 return 0;
1392
1393 return b < bend ? +1 : -1;
1394}
1395
1396/*
87cea99e 1397=for apidoc utf8_to_bytes
6940069f 1398
2bbc8d55 1399Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954
KW
1400Unlike L</bytes_to_utf8>, this over-writes the original string, and
1401updates C<len> to contain the new length.
67e989fb 1402Returns zero on failure, setting C<len> to -1.
6940069f 1403
a1433954 1404If you need a copy of the string, see L</bytes_from_utf8>.
95be277c 1405
6940069f
GS
1406=cut
1407*/
1408
1409U8 *
37607a96 1410Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 1411{
d4c19fe8
AL
1412 U8 * const save = s;
1413 U8 * const send = s + *len;
6940069f 1414 U8 *d;
246fae53 1415
7918f24d 1416 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
81611534 1417 PERL_UNUSED_CONTEXT;
7918f24d 1418
1e54db1a 1419 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 1420 while (s < send) {
d59937ca
KW
1421 if (! UTF8_IS_INVARIANT(*s)) {
1422 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
1423 *len = ((STRLEN) -1);
1424 return 0;
1425 }
1426 s++;
dcad2880 1427 }
d59937ca 1428 s++;
246fae53 1429 }
dcad2880
JH
1430
1431 d = s = save;
6940069f 1432 while (s < send) {
80e0b38f
KW
1433 U8 c = *s++;
1434 if (! UTF8_IS_INVARIANT(c)) {
1435 /* Then it is two-byte encoded */
a62b247b 1436 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
80e0b38f
KW
1437 s++;
1438 }
1439 *d++ = c;
6940069f
GS
1440 }
1441 *d = '\0';
246fae53 1442 *len = d - save;
6940069f
GS
1443 return save;
1444}
1445
1446/*
87cea99e 1447=for apidoc bytes_from_utf8
f9a63242 1448
2bbc8d55 1449Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
a1433954 1450Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
1451the newly-created string, and updates C<len> to contain the new
1452length. Returns the original string if no conversion occurs, C<len>
72d33970 1453is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
2bbc8d55 14540 if C<s> is converted or consisted entirely of characters that are invariant
4a4088c4 1455in UTF-8 (i.e., US-ASCII on non-EBCDIC machines).
f9a63242 1456
37607a96
PK
1457=cut
1458*/
f9a63242
JH
1459
1460U8 *
e1ec3a88 1461Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 1462{
f9a63242 1463 U8 *d;
e1ec3a88
AL
1464 const U8 *start = s;
1465 const U8 *send;
f9a63242
JH
1466 I32 count = 0;
1467
7918f24d 1468 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
96a5add6 1469 PERL_UNUSED_CONTEXT;
f9a63242 1470 if (!*is_utf8)
73d840c0 1471 return (U8 *)start;
f9a63242 1472
1e54db1a 1473 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 1474 for (send = s + *len; s < send;) {
d59937ca
KW
1475 if (! UTF8_IS_INVARIANT(*s)) {
1476 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
73d840c0 1477 return (U8 *)start;
d59937ca
KW
1478 }
1479 count++;
1480 s++;
db42d148 1481 }
d59937ca 1482 s++;
f9a63242
JH
1483 }
1484
35da51f7 1485 *is_utf8 = FALSE;
f9a63242 1486
212542aa 1487 Newx(d, (*len) - count + 1, U8);
ef9edfd0 1488 s = start; start = d;
f9a63242
JH
1489 while (s < send) {
1490 U8 c = *s++;
1a91c45d 1491 if (! UTF8_IS_INVARIANT(c)) {
c4d5f83a 1492 /* Then it is two-byte encoded */
a62b247b 1493 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
1a91c45d 1494 s++;
c4d5f83a
NIS
1495 }
1496 *d++ = c;
f9a63242
JH
1497 }
1498 *d = '\0';
1499 *len = d - start;
73d840c0 1500 return (U8 *)start;
f9a63242
JH
1501}
1502
1503/*
87cea99e 1504=for apidoc bytes_to_utf8
6940069f 1505
ff97e5cf
KW
1506Converts a string C<s> of length C<len> bytes from the native encoding into
1507UTF-8.
6662521e 1508Returns a pointer to the newly-created string, and sets C<len> to
ff97e5cf 1509reflect the new length in bytes.
6940069f 1510
75200dff 1511A C<NUL> character will be written after the end of the string.
2bbc8d55
SP
1512
1513If you want to convert to UTF-8 from encodings other than
1514the native (Latin1 or EBCDIC),
a1433954 1515see L</sv_recode_to_utf8>().
c9ada85f 1516
497711e7 1517=cut
6940069f
GS
1518*/
1519
c682ebef
FC
1520/* This logic is duplicated in sv_catpvn_flags, so any bug fixes will
1521 likewise need duplication. */
1522
6940069f 1523U8*
35a4481c 1524Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 1525{
35a4481c 1526 const U8 * const send = s + (*len);
6940069f
GS
1527 U8 *d;
1528 U8 *dst;
7918f24d
NC
1529
1530 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
96a5add6 1531 PERL_UNUSED_CONTEXT;
6940069f 1532
212542aa 1533 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
1534 dst = d;
1535
1536 while (s < send) {
55d09dc8
KW
1537 append_utf8_from_native_byte(*s, &d);
1538 s++;
6940069f
GS
1539 }
1540 *d = '\0';
6662521e 1541 *len = d-dst;
6940069f
GS
1542 return dst;
1543}
1544
a0ed51b3 1545/*
dea0fc0b 1546 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
1547 *
1548 * Destination must be pre-extended to 3/2 source. Do not use in-place.
1549 * We optimize for native, for obvious reasons. */
1550
1551U8*
dea0fc0b 1552Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 1553{
dea0fc0b
JH
1554 U8* pend;
1555 U8* dstart = d;
1556
7918f24d
NC
1557 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
1558
dea0fc0b 1559 if (bytelen & 1)
f5992bc4 1560 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
1561
1562 pend = p + bytelen;
1563
a0ed51b3 1564 while (p < pend) {
dea0fc0b
JH
1565 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
1566 p += 2;
2d1545e5 1567 if (OFFUNI_IS_INVARIANT(uv)) {
56d37426 1568 *d++ = LATIN1_TO_NATIVE((U8) uv);
a0ed51b3
LW
1569 continue;
1570 }
56d37426
KW
1571 if (uv <= MAX_UTF8_TWO_BYTE) {
1572 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
1573 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
a0ed51b3
LW
1574 continue;
1575 }
46956fad
KW
1576#define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
1577#define LAST_HIGH_SURROGATE 0xDBFF
1578#define FIRST_LOW_SURROGATE 0xDC00
1579#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
e23c50db
KW
1580
1581 /* This assumes that most uses will be in the first Unicode plane, not
1582 * needing surrogates */
1583 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
1584 && uv <= UNICODE_SURROGATE_LAST))
1585 {
1586 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
1587 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1588 }
1589 else {
01ea242b 1590 UV low = (p[0] << 8) + p[1];
e23c50db
KW
1591 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
1592 || UNLIKELY(low > LAST_LOW_SURROGATE))
1593 {
01ea242b 1594 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
e23c50db
KW
1595 }
1596 p += 2;
46956fad
KW
1597 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
1598 + (low - FIRST_LOW_SURROGATE) + 0x10000;
01ea242b 1599 }
a0ed51b3 1600 }
56d37426
KW
1601#ifdef EBCDIC
1602 d = uvoffuni_to_utf8_flags(d, uv, 0);
1603#else
a0ed51b3 1604 if (uv < 0x10000) {
eb160463
GS
1605 *d++ = (U8)(( uv >> 12) | 0xe0);
1606 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1607 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1608 continue;
1609 }
1610 else {
eb160463
GS
1611 *d++ = (U8)(( uv >> 18) | 0xf0);
1612 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1613 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1614 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
1615 continue;
1616 }
56d37426 1617#endif
a0ed51b3 1618 }
dea0fc0b 1619 *newlen = d - dstart;
a0ed51b3
LW
1620 return d;
1621}
1622
1623/* Note: this one is slightly destructive of the source. */
1624
1625U8*
dea0fc0b 1626Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
1627{
1628 U8* s = (U8*)p;
d4c19fe8 1629 U8* const send = s + bytelen;
7918f24d
NC
1630
1631 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1632
e0ea5e2d
NC
1633 if (bytelen & 1)
1634 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1635 (UV)bytelen);
1636
a0ed51b3 1637 while (s < send) {
d4c19fe8 1638 const U8 tmp = s[0];
a0ed51b3
LW
1639 s[0] = s[1];
1640 s[1] = tmp;
1641 s += 2;
1642 }
dea0fc0b 1643 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
1644}
1645
922e8cb4
KW
1646bool
1647Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
1648{
1649 U8 tmpbuf[UTF8_MAXBYTES+1];
1650 uvchr_to_utf8(tmpbuf, c);
1651 return _is_utf8_FOO(classnum, tmpbuf);
1652}
1653
f9ae8fb6
JD
1654/* Internal function so we can deprecate the external one, and call
1655 this one from other deprecated functions in this file */
1656
f2645549
KW
1657bool
1658Perl__is_utf8_idstart(pTHX_ const U8 *p)
61b19385 1659{
f2645549 1660 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
61b19385
KW
1661
1662 if (*p == '_')
1663 return TRUE;
f25ce844 1664 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
61b19385
KW
1665}
1666
5092f92a 1667bool
eba68aa0
KW
1668Perl__is_uni_perl_idcont(pTHX_ UV c)
1669{
1670 U8 tmpbuf[UTF8_MAXBYTES+1];
1671 uvchr_to_utf8(tmpbuf, c);
1672 return _is_utf8_perl_idcont(tmpbuf);
1673}
1674
1675bool
f91dcd13
KW
1676Perl__is_uni_perl_idstart(pTHX_ UV c)
1677{
1678 U8 tmpbuf[UTF8_MAXBYTES+1];
1679 uvchr_to_utf8(tmpbuf, c);
1680 return _is_utf8_perl_idstart(tmpbuf);
1681}
1682
3a4c58c9
KW
1683UV
1684Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const char S_or_s)
1685{
1686 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 1687 * those, converting the result to UTF-8. The only difference between upper
3a4c58c9
KW
1688 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
1689 * either "SS" or "Ss". Which one to use is passed into the routine in
1690 * 'S_or_s' to avoid a test */
1691
1692 UV converted = toUPPER_LATIN1_MOD(c);
1693
1694 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
1695
1696 assert(S_or_s == 'S' || S_or_s == 's');
1697
6f2d5cbc 1698 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
f4cd282c 1699 characters in this range */
3a4c58c9
KW
1700 *p = (U8) converted;
1701 *lenp = 1;
1702 return converted;
1703 }
1704
1705 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
1706 * which it maps to one of them, so as to only have to have one check for
1707 * it in the main case */
1708 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
1709 switch (c) {
1710 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
1711 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
1712 break;
1713 case MICRO_SIGN:
1714 converted = GREEK_CAPITAL_LETTER_MU;
1715 break;
79e064b9
KW
1716#if UNICODE_MAJOR_VERSION > 2 \
1717 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
1718 && UNICODE_DOT_DOT_VERSION >= 8)
3a4c58c9
KW
1719 case LATIN_SMALL_LETTER_SHARP_S:
1720 *(p)++ = 'S';
1721 *p = S_or_s;
1722 *lenp = 2;
1723 return 'S';
79e064b9 1724#endif
3a4c58c9
KW
1725 default:
1726 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
e5964223 1727 NOT_REACHED; /* NOTREACHED */
3a4c58c9
KW
1728 }
1729 }
1730
1731 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1732 *p = UTF8_TWO_BYTE_LO(converted);
1733 *lenp = 2;
1734
1735 return converted;
1736}
1737
50bda2c3
KW
1738/* Call the function to convert a UTF-8 encoded character to the specified case.
1739 * Note that there may be more than one character in the result.
1740 * INP is a pointer to the first byte of the input character
1741 * OUTP will be set to the first byte of the string of changed characters. It
1742 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
1743 * LENP will be set to the length in bytes of the string of changed characters
1744 *
1745 * The functions return the ordinal of the first character in the string of OUTP */
b9992569
KW
1746#define CALL_UPPER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
1747#define CALL_TITLE_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
1748#define CALL_LOWER_CASE(uv, s, d, lenp) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
50bda2c3 1749
b9992569
KW
1750/* This additionally has the input parameter 'specials', which if non-zero will
1751 * cause this to use the specials hash for folding (meaning get full case
50bda2c3 1752 * folding); otherwise, when zero, this implies a simple case fold */
b9992569 1753#define CALL_FOLD_CASE(uv, s, d, lenp, specials) _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
c3fd2246 1754
84afefe6
JH
1755UV
1756Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1757{
a1433954
KW
1758 /* Convert the Unicode character whose ordinal is <c> to its uppercase
1759 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
1760 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
c3fd2246
KW
1761 * the changed version may be longer than the original character.
1762 *
1763 * The ordinal of the first character of the changed version is returned
1764 * (but note, as explained above, that there may be more.) */
1765
7918f24d
NC
1766 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1767
3a4c58c9
KW
1768 if (c < 256) {
1769 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
1770 }
1771
0ebc6274 1772 uvchr_to_utf8(p, c);
b9992569 1773 return CALL_UPPER_CASE(c, p, p, lenp);
a0ed51b3
LW
1774}
1775
84afefe6
JH
1776UV
1777Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1778{
7918f24d
NC
1779 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1780
3a4c58c9
KW
1781 if (c < 256) {
1782 return _to_upper_title_latin1((U8) c, p, lenp, 's');
1783 }
1784
0ebc6274 1785 uvchr_to_utf8(p, c);
b9992569 1786 return CALL_TITLE_CASE(c, p, p, lenp);
a0ed51b3
LW
1787}
1788
afc16117 1789STATIC U8
81611534 1790S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp)
afc16117
KW
1791{
1792 /* We have the latin1-range values compiled into the core, so just use
4a4088c4 1793 * those, converting the result to UTF-8. Since the result is always just
a1433954 1794 * one character, we allow <p> to be NULL */
afc16117
KW
1795
1796 U8 converted = toLOWER_LATIN1(c);
1797
1798 if (p != NULL) {
6f2d5cbc 1799 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
afc16117
KW
1800 *p = converted;
1801 *lenp = 1;
1802 }
1803 else {
430c9760
KW
1804 /* Result is known to always be < 256, so can use the EIGHT_BIT
1805 * macros */
1806 *p = UTF8_EIGHT_BIT_HI(converted);
1807 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
afc16117
KW
1808 *lenp = 2;
1809 }
1810 }
1811 return converted;
1812}
1813
84afefe6
JH
1814UV
1815Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1816{
7918f24d
NC
1817 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1818
afc16117
KW
1819 if (c < 256) {
1820 return to_lower_latin1((U8) c, p, lenp);
bca00c02
KW
1821 }
1822
afc16117 1823 uvchr_to_utf8(p, c);
b9992569 1824 return CALL_LOWER_CASE(c, p, p, lenp);
a0ed51b3
LW
1825}
1826
84afefe6 1827UV
51910141 1828Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
a1dde8de 1829{
51910141 1830 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
1ca267a5 1831 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
51910141 1832 * FOLD_FLAGS_FULL iff full folding is to be used;
1ca267a5
KW
1833 *
1834 * Not to be used for locale folds
51910141 1835 */
f673fad4 1836
a1dde8de
KW
1837 UV converted;
1838
1839 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
81611534 1840 PERL_UNUSED_CONTEXT;
a1dde8de 1841
1ca267a5
KW
1842 assert (! (flags & FOLD_FLAGS_LOCALE));
1843
659a7c2d 1844 if (UNLIKELY(c == MICRO_SIGN)) {
a1dde8de
KW
1845 converted = GREEK_SMALL_LETTER_MU;
1846 }
9b63e895
KW
1847#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1848 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1849 || UNICODE_DOT_DOT_VERSION > 0)
659a7c2d
KW
1850 else if ( (flags & FOLD_FLAGS_FULL)
1851 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
1852 {
1ca267a5
KW
1853 /* If can't cross 127/128 boundary, can't return "ss"; instead return
1854 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
1855 * under those circumstances. */
1856 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
1857 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
1858 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
1859 p, *lenp, U8);
1860 return LATIN_SMALL_LETTER_LONG_S;
1861 }
1862 else {
4f489194
KW
1863 *(p)++ = 's';
1864 *p = 's';
1865 *lenp = 2;
1866 return 's';
1ca267a5 1867 }
a1dde8de 1868 }
9b63e895 1869#endif
a1dde8de
KW
1870 else { /* In this range the fold of all other characters is their lower
1871 case */
1872 converted = toLOWER_LATIN1(c);
1873 }
1874
6f2d5cbc 1875 if (UVCHR_IS_INVARIANT(converted)) {
a1dde8de
KW
1876 *p = (U8) converted;
1877 *lenp = 1;
1878 }
1879 else {
1880 *(p)++ = UTF8_TWO_BYTE_HI(converted);
1881 *p = UTF8_TWO_BYTE_LO(converted);
1882 *lenp = 2;
1883 }
1884
1885 return converted;
1886}
1887
1888UV
31f05a37 1889Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
84afefe6 1890{
4b593389 1891
a0270393
KW
1892 /* Not currently externally documented, and subject to change
1893 * <flags> bits meanings:
1894 * FOLD_FLAGS_FULL iff full folding is to be used;
31f05a37
KW
1895 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
1896 * locale are to be used.
a0270393
KW
1897 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
1898 */
4b593389 1899
36bb2ab6 1900 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
7918f24d 1901
780fcc9f
KW
1902 if (flags & FOLD_FLAGS_LOCALE) {
1903 /* Treat a UTF-8 locale as not being in locale at all */
1904 if (IN_UTF8_CTYPE_LOCALE) {
1905 flags &= ~FOLD_FLAGS_LOCALE;
1906 }
1907 else {
1908 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
e7b7ac46 1909 goto needs_full_generality;
780fcc9f 1910 }
31f05a37
KW
1911 }
1912
a1dde8de 1913 if (c < 256) {
e7b7ac46 1914 return _to_fold_latin1((U8) c, p, lenp,
31f05a37 1915 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
a1dde8de
KW
1916 }
1917
2f306ab9 1918 /* Here, above 255. If no special needs, just use the macro */
a0270393
KW
1919 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
1920 uvchr_to_utf8(p, c);
b9992569 1921 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
a0270393
KW
1922 }
1923 else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
1924 the special flags. */
1925 U8 utf8_c[UTF8_MAXBYTES + 1];
e7b7ac46
KW
1926
1927 needs_full_generality:
a0270393 1928 uvchr_to_utf8(utf8_c, c);
445bf929 1929 return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
a0270393 1930 }
84afefe6
JH
1931}
1932
26483009 1933PERL_STATIC_INLINE bool
5141f98e 1934S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
f25ce844 1935 const char *const swashname, SV* const invlist)
bde6a22d 1936{
ea317ccb
KW
1937 /* returns a boolean giving whether or not the UTF8-encoded character that
1938 * starts at <p> is in the swash indicated by <swashname>. <swash>
1939 * contains a pointer to where the swash indicated by <swashname>
1940 * is to be stored; which this routine will do, so that future calls will
f25ce844
KW
1941 * look at <*swash> and only generate a swash if it is not null. <invlist>
1942 * is NULL or an inversion list that defines the swash. If not null, it
1943 * saves time during initialization of the swash.
ea317ccb
KW
1944 *
1945 * Note that it is assumed that the buffer length of <p> is enough to
1946 * contain all the bytes that comprise the character. Thus, <*p> should
1947 * have been checked before this call for mal-formedness enough to assure
1948 * that. */
1949
7918f24d
NC
1950 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1951
492a624f 1952 /* The API should have included a length for the UTF-8 character in <p>,
28123549 1953 * but it doesn't. We therefore assume that p has been validated at least
492a624f
KW
1954 * as far as there being enough bytes available in it to accommodate the
1955 * character without reading beyond the end, and pass that number on to the
1956 * validating routine */
6302f837 1957 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
28123549
KW
1958 if (ckWARN_d(WARN_UTF8)) {
1959 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
9816f121 1960 "Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
28123549
KW
1961 if (ckWARN(WARN_UTF8)) { /* This will output details as to the
1962 what the malformation is */
1963 utf8_to_uvchr_buf(p, p + UTF8SKIP(p), NULL);
1964 }
1965 }
1966 return FALSE;
1967 }
87367d5f
KW
1968 if (!*swash) {
1969 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
f25ce844
KW
1970 *swash = _core_swash_init("utf8",
1971
1972 /* Only use the name if there is no inversion
1973 * list; otherwise will go out to disk */
1974 (invlist) ? "" : swashname,
1975
1976 &PL_sv_undef, 1, 0, invlist, &flags);
87367d5f 1977 }
28123549 1978
bde6a22d
NC
1979 return swash_fetch(*swash, p, TRUE) != 0;
1980}
1981
1982bool
922e8cb4
KW
1983Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p)
1984{
922e8cb4
KW
1985 PERL_ARGS_ASSERT__IS_UTF8_FOO;
1986
1987 assert(classnum < _FIRST_NON_SWASH_CC);
1988
f25ce844
KW
1989 return is_utf8_common(p,
1990 &PL_utf8_swash_ptrs[classnum],
1991 swash_property_names[classnum],
1992 PL_XPosix_ptrs[classnum]);
922e8cb4
KW
1993}
1994
1995bool
f2645549 1996Perl__is_utf8_perl_idstart(pTHX_ const U8 *p)
a0ed51b3 1997{
f2645549 1998 SV* invlist = NULL;
7918f24d 1999
f2645549 2000 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
7918f24d 2001
f2645549
KW
2002 if (! PL_utf8_perl_idstart) {
2003 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2004 }
60071a22 2005 return is_utf8_common(p, &PL_utf8_perl_idstart, "_Perl_IDStart", invlist);
82686b01
JH
2006}
2007
2008bool
f2645549 2009Perl__is_utf8_xidstart(pTHX_ const U8 *p)
c11ff943 2010{
f2645549 2011 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
c11ff943
KW
2012
2013 if (*p == '_')
2014 return TRUE;
f25ce844 2015 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
c11ff943
KW
2016}
2017
2018bool
eba68aa0
KW
2019Perl__is_utf8_perl_idcont(pTHX_ const U8 *p)
2020{
b24b43f7 2021 SV* invlist = NULL;
eba68aa0
KW
2022
2023 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
2024
b24b43f7
KW
2025 if (! PL_utf8_perl_idcont) {
2026 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2027 }
60071a22 2028 return is_utf8_common(p, &PL_utf8_perl_idcont, "_Perl_IDCont", invlist);
eba68aa0
KW
2029}
2030
eba68aa0 2031bool
f2645549 2032Perl__is_utf8_idcont(pTHX_ const U8 *p)
82686b01 2033{
f2645549 2034 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
7918f24d 2035
f25ce844 2036 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
a0ed51b3
LW
2037}
2038
2039bool
f2645549 2040Perl__is_utf8_xidcont(pTHX_ const U8 *p)
c11ff943 2041{
f2645549 2042 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
c11ff943 2043
f25ce844 2044 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
c11ff943
KW
2045}
2046
2047bool
7dbf68d2
KW
2048Perl__is_utf8_mark(pTHX_ const U8 *p)
2049{
7dbf68d2
KW
2050 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2051
f25ce844 2052 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
7dbf68d2
KW
2053}
2054
6b5c0936 2055/*
87cea99e 2056=for apidoc to_utf8_case
6b5c0936 2057
9da1e7cb
KW
2058Instead use the appropriate one of L</toUPPER_utf8>,
2059L</toTITLE_utf8>,
2060L</toLOWER_utf8>,
2061or L</toFOLD_utf8>.
2062
6fae5207 2063C<p> contains the pointer to the UTF-8 string encoding
a1433954
KW
2064the character that is being converted. This routine assumes that the character
2065at C<p> is well-formed.
6b5c0936 2066
6fae5207
KW
2067C<ustrp> is a pointer to the character buffer to put the
2068conversion result to. C<lenp> is a pointer to the length
6b5c0936
JH
2069of the result.
2070
6fae5207 2071C<swashp> is a pointer to the swash to use.
6b5c0936 2072
a1433954 2073Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
796b6530 2074and loaded by C<SWASHNEW>, using F<lib/utf8_heavy.pl>. C<special> (usually,
0134edef 2075but not always, a multicharacter mapping), is tried first.
6b5c0936 2076
4a8240a3
KW
2077C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
2078any special mappings; C<""> means to use the special mappings. Values other
2079than these two are treated as the name of the hash containing the special
2080mappings, like C<"utf8::ToSpecLower">.
6b5c0936 2081
796b6530
KW
2082C<normal> is a string like C<"ToLower"> which means the swash
2083C<%utf8::ToLower>.
0134edef 2084
760c7c2f
KW
2085Code points above the platform's C<IV_MAX> will raise a deprecation warning,
2086unless those are turned off.
2087
0134edef 2088=cut */
6b5c0936 2089
2104c8d9 2090UV
9a957fbc
AL
2091Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
2092 SV **swashp, const char *normal, const char *special)
a0ed51b3 2093{
b9992569
KW
2094 PERL_ARGS_ASSERT_TO_UTF8_CASE;
2095
2096 return _to_utf8_case(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, swashp, normal, special);
2097}
2098
2099 /* change namve uv1 to 'from' */
6a4a25f4 2100STATIC UV
b9992569
KW
2101S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2102 SV **swashp, const char *normal, const char *special)
2103{
0134edef 2104 STRLEN len = 0;
7918f24d 2105
b9992569 2106 PERL_ARGS_ASSERT__TO_UTF8_CASE;
7918f24d 2107
36eaa811
KW
2108 /* For code points that don't change case, we already know that the output
2109 * of this function is the unchanged input, so we can skip doing look-ups
2110 * for them. Unfortunately the case-changing code points are scattered
2111 * around. But there are some long consecutive ranges where there are no
2112 * case changing code points. By adding tests, we can eliminate the lookup
2113 * for all the ones in such ranges. This is currently done here only for
2114 * just a few cases where the scripts are in common use in modern commerce
2115 * (and scripts adjacent to those which can be included without additional
2116 * tests). */
2117
2118 if (uv1 >= 0x0590) {
2119 /* This keeps from needing further processing the code points most
2120 * likely to be used in the following non-cased scripts: Hebrew,
2121 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2122 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2123 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2124 if (uv1 < 0x10A0) {
2125 goto cases_to_self;
2126 }
2127
2128 /* The following largish code point ranges also don't have case
2129 * changes, but khw didn't think they warranted extra tests to speed
2130 * them up (which would slightly slow down everything else above them):
2131 * 1100..139F Hangul Jamo, Ethiopic
2132 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2133 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2134 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2135 * Combining Diacritical Marks Extended, Balinese,
2136 * Sundanese, Batak, Lepcha, Ol Chiki
2137 * 2000..206F General Punctuation
2138 */
2139
2140 if (uv1 >= 0x2D30) {
2141
2142 /* This keeps the from needing further processing the code points
2143 * most likely to be used in the following non-cased major scripts:
2144 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2145 *
2146 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2147 * event that Unicode eventually allocates the unused block as of
2148 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2149 * that the test suite will start having failures to alert you
2150 * should that happen) */
2151 if (uv1 < 0xA640) {
2152 goto cases_to_self;
2153 }
2154
2155 if (uv1 >= 0xAC00) {
2156 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
5af9bc97
KW
2157 if (ckWARN_d(WARN_SURROGATE)) {
2158 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2159 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2160 "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
2161 }
2162 goto cases_to_self;
2163 }
36eaa811
KW
2164
2165 /* AC00..FAFF Catches Hangul syllables and private use, plus
2166 * some others */
2167 if (uv1 < 0xFB00) {
2168 goto cases_to_self;
2169
2170 }
2171
5af9bc97
KW
2172 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2173 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2174 && ckWARN_d(WARN_DEPRECATED))
2175 {
2176 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2177 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
2178 }
2179 if (ckWARN_d(WARN_NON_UNICODE)) {
2180 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2181 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
2182 "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
2183 }
2184 goto cases_to_self;
2185 }
3bfc1e70
KW
2186#ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
2187 if (UNLIKELY(uv1
2188 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
2189 {
2190
2191 /* As of this writing, this means we avoid swash creation
2192 * for anything beyond low Plane 1 */
2193 goto cases_to_self;
2194 }
2195#endif
36eaa811
KW
2196 }
2197 }
9ae3ac1a 2198
36eaa811
KW
2199 /* Note that non-characters are perfectly legal, so no warning should
2200 * be given. There are so few of them, that it isn't worth the extra
2201 * tests to avoid swash creation */
9ae3ac1a
KW
2202 }
2203
0134edef 2204 if (!*swashp) /* load on-demand */
5ab9d2ef 2205 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef, 4, 0, NULL, NULL);
0134edef 2206
a6f87d8c 2207 if (special) {
0134edef 2208 /* It might be "special" (sometimes, but not always,
2a37f04d 2209 * a multicharacter mapping) */
4a8240a3 2210 HV *hv = NULL;
b08cf34e
JH
2211 SV **svp;
2212
4a8240a3
KW
2213 /* If passed in the specials name, use that; otherwise use any
2214 * given in the swash */
2215 if (*special != '\0') {
2216 hv = get_hv(special, 0);
2217 }
2218 else {
2219 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
2220 if (svp) {
2221 hv = MUTABLE_HV(SvRV(*svp));
2222 }
2223 }
2224
176fe009 2225 if (hv
5f560d8a 2226 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
176fe009
KW
2227 && (*svp))
2228 {
cfd0369c 2229 const char *s;
47654450 2230
cfd0369c 2231 s = SvPV_const(*svp, len);
47654450 2232 if (len == 1)
f4cd282c 2233 /* EIGHTBIT */
c80e42f3 2234 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
2a37f04d 2235 else {
d2dcd0fb 2236 Copy(s, ustrp, len, U8);
29e98929 2237 }
983ffd37 2238 }
0134edef
JH
2239 }
2240
2241 if (!len && *swashp) {
4a4088c4 2242 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
d4c19fe8 2243
0134edef
JH
2244 if (uv2) {
2245 /* It was "normal" (a single character mapping). */
f4cd282c 2246 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
2a37f04d
JH
2247 }
2248 }
1feea2c7 2249
cbe07460
KW
2250 if (len) {
2251 if (lenp) {
2252 *lenp = len;
2253 }
2254 return valid_utf8_to_uvchr(ustrp, 0);
2255 }
2256
2257 /* Here, there was no mapping defined, which means that the code point maps
2258 * to itself. Return the inputs */
e24dfe9c 2259 cases_to_self:
bfdf22ec 2260 len = UTF8SKIP(p);
ca9fab46
KW
2261 if (p != ustrp) { /* Don't copy onto itself */
2262 Copy(p, ustrp, len, U8);
2263 }
0134edef 2264
2a37f04d
JH
2265 if (lenp)
2266 *lenp = len;
2267
f4cd282c 2268 return uv1;
cbe07460 2269
a0ed51b3
LW
2270}
2271
051a06d4 2272STATIC UV
357aadde 2273S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
051a06d4 2274{
4a4088c4 2275 /* This is called when changing the case of a UTF-8-encoded character above
31f05a37
KW
2276 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
2277 * result contains a character that crosses the 255/256 boundary, disallow
2278 * the change, and return the original code point. See L<perlfunc/lc> for
2279 * why;
051a06d4 2280 *
a1433954
KW
2281 * p points to the original string whose case was changed; assumed
2282 * by this routine to be well-formed
051a06d4
KW
2283 * result the code point of the first character in the changed-case string
2284 * ustrp points to the changed-case string (<result> represents its first char)
2285 * lenp points to the length of <ustrp> */
2286
2287 UV original; /* To store the first code point of <p> */
2288
2289 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
2290
a4f12ed7 2291 assert(UTF8_IS_ABOVE_LATIN1(*p));
051a06d4
KW
2292
2293 /* We know immediately if the first character in the string crosses the
2294 * boundary, so can skip */
2295 if (result > 255) {
2296
2297 /* Look at every character in the result; if any cross the
2298 * boundary, the whole thing is disallowed */
2299 U8* s = ustrp + UTF8SKIP(ustrp);
2300 U8* e = ustrp + *lenp;
2301 while (s < e) {
a4f12ed7 2302 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
051a06d4
KW
2303 goto bad_crossing;
2304 }
2305 s += UTF8SKIP(s);
2306 }
2307
613abc6d
KW
2308 /* Here, no characters crossed, result is ok as-is, but we warn. */
2309 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
051a06d4
KW
2310 return result;
2311 }
2312
7b52d656 2313 bad_crossing:
051a06d4
KW
2314
2315 /* Failed, have to return the original */
4b88fb76 2316 original = valid_utf8_to_uvchr(p, lenp);
ab0b796c
KW
2317
2318 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2319 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2320 "Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
2321 "resolved to \"\\x{%"UVXf"}\".",
357aadde 2322 OP_DESC(PL_op),
ab0b796c
KW
2323 original,
2324 original);
051a06d4
KW
2325 Copy(p, ustrp, *lenp, char);
2326 return original;
2327}
2328
d3e79532 2329/*
87cea99e 2330=for apidoc to_utf8_upper
d3e79532 2331
1f607577 2332Instead use L</toUPPER_utf8>.
a1433954 2333
d3e79532
JH
2334=cut */
2335
051a06d4 2336/* Not currently externally documented, and subject to change:
31f05a37
KW
2337 * <flags> is set iff iff the rules from the current underlying locale are to
2338 * be used. */
051a06d4 2339
2104c8d9 2340UV
31f05a37 2341Perl__to_utf8_upper_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2342{
051a06d4
KW
2343 UV result;
2344
2345 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
7918f24d 2346
780fcc9f
KW
2347 if (flags) {
2348 /* Treat a UTF-8 locale as not being in locale at all */
2349 if (IN_UTF8_CTYPE_LOCALE) {
2350 flags = FALSE;
2351 }
2352 else {
2353 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2354 }
31f05a37
KW
2355 }
2356
3a4c58c9 2357 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2358 if (flags) {
2359 result = toUPPER_LC(*p);
2360 }
2361 else {
81c6c7ce 2362 return _to_upper_title_latin1(*p, ustrp, lenp, 'S');
051a06d4 2363 }
3a4c58c9
KW
2364 }
2365 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2366 if (flags) {
a62b247b 2367 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2368 result = toUPPER_LC(c);
051a06d4
KW
2369 }
2370 else {
a62b247b 2371 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2372 ustrp, lenp, 'S');
051a06d4
KW
2373 }
2374 }
4a4088c4 2375 else { /* UTF-8, ord above 255 */
b9992569 2376 result = CALL_UPPER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2377
2378 if (flags) {
357aadde 2379 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2380 }
2381 return result;
2382 }
2383
4a4088c4 2384 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2385 if (UTF8_IS_INVARIANT(result)) {
2386 *ustrp = (U8) result;
2387 *lenp = 1;
2388 }
2389 else {
62cb07ea
KW
2390 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2391 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2392 *lenp = 2;
3a4c58c9 2393 }
baa60164 2394
051a06d4 2395 return result;
983ffd37 2396}
a0ed51b3 2397
d3e79532 2398/*
87cea99e 2399=for apidoc to_utf8_title
d3e79532 2400
1f607577 2401Instead use L</toTITLE_utf8>.
a1433954 2402
d3e79532
JH
2403=cut */
2404
051a06d4 2405/* Not currently externally documented, and subject to change:
31f05a37
KW
2406 * <flags> is set iff the rules from the current underlying locale are to be
2407 * used. Since titlecase is not defined in POSIX, for other than a
2408 * UTF-8 locale, uppercase is used instead for code points < 256.
445bf929 2409 */
051a06d4 2410
983ffd37 2411UV
31f05a37 2412Perl__to_utf8_title_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
983ffd37 2413{
051a06d4
KW
2414 UV result;
2415
2416 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
7918f24d 2417
780fcc9f
KW
2418 if (flags) {
2419 /* Treat a UTF-8 locale as not being in locale at all */
2420 if (IN_UTF8_CTYPE_LOCALE) {
2421 flags = FALSE;
2422 }
2423 else {
2424 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2425 }
31f05a37
KW
2426 }
2427
3a4c58c9 2428 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2429 if (flags) {
2430 result = toUPPER_LC(*p);
2431 }
2432 else {
81c6c7ce 2433 return _to_upper_title_latin1(*p, ustrp, lenp, 's');
051a06d4 2434 }
3a4c58c9
KW
2435 }
2436 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2437 if (flags) {
a62b247b 2438 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2439 result = toUPPER_LC(c);
051a06d4
KW
2440 }
2441 else {
a62b247b 2442 return _to_upper_title_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2443 ustrp, lenp, 's');
051a06d4
KW
2444 }
2445 }
4a4088c4 2446 else { /* UTF-8, ord above 255 */
b9992569 2447 result = CALL_TITLE_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2448
2449 if (flags) {
357aadde 2450 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4
KW
2451 }
2452 return result;
2453 }
2454
4a4088c4 2455 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2456 if (UTF8_IS_INVARIANT(result)) {
2457 *ustrp = (U8) result;
2458 *lenp = 1;
2459 }
2460 else {
62cb07ea
KW
2461 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2462 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4 2463 *lenp = 2;
3a4c58c9
KW
2464 }
2465
051a06d4 2466 return result;
a0ed51b3
LW
2467}
2468
d3e79532 2469/*
87cea99e 2470=for apidoc to_utf8_lower
d3e79532 2471
1f607577 2472Instead use L</toLOWER_utf8>.
a1433954 2473
d3e79532
JH
2474=cut */
2475
051a06d4 2476/* Not currently externally documented, and subject to change:
31f05a37
KW
2477 * <flags> is set iff iff the rules from the current underlying locale are to
2478 * be used.
2479 */
051a06d4 2480
2104c8d9 2481UV
31f05a37 2482Perl__to_utf8_lower_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, bool flags)
a0ed51b3 2483{
051a06d4
KW
2484 UV result;
2485
051a06d4 2486 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
7918f24d 2487
780fcc9f
KW
2488 if (flags) {
2489 /* Treat a UTF-8 locale as not being in locale at all */
2490 if (IN_UTF8_CTYPE_LOCALE) {
2491 flags = FALSE;
2492 }
2493 else {
2494 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2495 }
31f05a37
KW
2496 }
2497
968c5e6a 2498 if (UTF8_IS_INVARIANT(*p)) {
051a06d4
KW
2499 if (flags) {
2500 result = toLOWER_LC(*p);
2501 }
2502 else {
81c6c7ce 2503 return to_lower_latin1(*p, ustrp, lenp);
051a06d4 2504 }
968c5e6a
KW
2505 }
2506 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2507 if (flags) {
a62b247b 2508 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2509 result = toLOWER_LC(c);
051a06d4
KW
2510 }
2511 else {
a62b247b 2512 return to_lower_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
81c6c7ce 2513 ustrp, lenp);
051a06d4 2514 }
968c5e6a 2515 }
4a4088c4 2516 else { /* UTF-8, ord above 255 */
b9992569 2517 result = CALL_LOWER_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp);
051a06d4
KW
2518
2519 if (flags) {
357aadde 2520 result = check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2521 }
968c5e6a 2522
051a06d4
KW
2523 return result;
2524 }
2525
4a4088c4 2526 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2527 if (UTF8_IS_INVARIANT(result)) {
2528 *ustrp = (U8) result;
2529 *lenp = 1;
2530 }
2531 else {
62cb07ea
KW
2532 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2533 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2534 *lenp = 2;
2535 }
2536
051a06d4 2537 return result;
b4e400f9
JH
2538}
2539
d3e79532 2540/*
87cea99e 2541=for apidoc to_utf8_fold
d3e79532 2542
1f607577 2543Instead use L</toFOLD_utf8>.
a1433954 2544
d3e79532
JH
2545=cut */
2546
051a06d4
KW
2547/* Not currently externally documented, and subject to change,
2548 * in <flags>
31f05a37
KW
2549 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2550 * locale are to be used.
051a06d4
KW
2551 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
2552 * otherwise simple folds
a0270393
KW
2553 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
2554 * prohibited
445bf929 2555 */
36bb2ab6 2556
b4e400f9 2557UV
445bf929 2558Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
b4e400f9 2559{
051a06d4
KW
2560 UV result;
2561
36bb2ab6 2562 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
7918f24d 2563
a0270393
KW
2564 /* These are mutually exclusive */
2565 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
2566
50ba90ff
KW
2567 assert(p != ustrp); /* Otherwise overwrites */
2568
780fcc9f
KW
2569 if (flags & FOLD_FLAGS_LOCALE) {
2570 /* Treat a UTF-8 locale as not being in locale at all */
2571 if (IN_UTF8_CTYPE_LOCALE) {
2572 flags &= ~FOLD_FLAGS_LOCALE;
2573 }
2574 else {
2575 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2576 }
31f05a37
KW
2577 }
2578
a1dde8de 2579 if (UTF8_IS_INVARIANT(*p)) {
051a06d4 2580 if (flags & FOLD_FLAGS_LOCALE) {
d22b930b 2581 result = toFOLD_LC(*p);
051a06d4
KW
2582 }
2583 else {
81c6c7ce 2584 return _to_fold_latin1(*p, ustrp, lenp,
1ca267a5 2585 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2586 }
a1dde8de
KW
2587 }
2588 else if UTF8_IS_DOWNGRADEABLE_START(*p) {
051a06d4 2589 if (flags & FOLD_FLAGS_LOCALE) {
a62b247b 2590 U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1));
68067e4e 2591 result = toFOLD_LC(c);
051a06d4
KW
2592 }
2593 else {
a62b247b 2594 return _to_fold_latin1(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)),
51910141 2595 ustrp, lenp,
1ca267a5 2596 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
051a06d4 2597 }
a1dde8de 2598 }
4a4088c4 2599 else { /* UTF-8, ord above 255 */
b9992569 2600 result = CALL_FOLD_CASE(valid_utf8_to_uvchr(p, NULL), p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
a1dde8de 2601
1ca267a5
KW
2602 if (flags & FOLD_FLAGS_LOCALE) {
2603
76f2ffcd 2604# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
0766489e
KW
2605 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
2606
2607# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
2608# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
76f2ffcd
KW
2609
2610 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
76f2ffcd 2611
538e84ed
KW
2612 /* Special case these two characters, as what normally gets
2613 * returned under locale doesn't work */
76f2ffcd
KW
2614 if (UTF8SKIP(p) == cap_sharp_s_len
2615 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
1ca267a5 2616 {
ab0b796c
KW
2617 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2618 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2619 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
2620 "resolved to \"\\x{17F}\\x{17F}\".");
1ca267a5
KW
2621 goto return_long_s;
2622 }
0766489e
KW
2623 else
2624#endif
2625 if (UTF8SKIP(p) == long_s_t_len
76f2ffcd 2626 && memEQ((char *) p, LONG_S_T, long_s_t_len))
9fc2026f 2627 {
ab0b796c
KW
2628 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2629 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2630 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
2631 "resolved to \"\\x{FB06}\".");
9fc2026f
KW
2632 goto return_ligature_st;
2633 }
74894415
KW
2634
2635#if UNICODE_MAJOR_VERSION == 3 \
2636 && UNICODE_DOT_VERSION == 0 \
2637 && UNICODE_DOT_DOT_VERSION == 1
2638# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
2639
2640 /* And special case this on this Unicode version only, for the same
2641 * reaons the other two are special cased. They would cross the
2642 * 255/256 boundary which is forbidden under /l, and so the code
2643 * wouldn't catch that they are equivalent (which they are only in
2644 * this release) */
2645 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
2646 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
2647 {
2648 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
2649 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
2650 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
2651 "resolved to \"\\x{0131}\".");
2652 goto return_dotless_i;
2653 }
2654#endif
2655
357aadde 2656 return check_locale_boundary_crossing(p, result, ustrp, lenp);
051a06d4 2657 }
a0270393
KW
2658 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
2659 return result;
2660 }
2661 else {
4a4088c4 2662 /* This is called when changing the case of a UTF-8-encoded
9fc2026f
KW
2663 * character above the ASCII range, and the result should not
2664 * contain an ASCII character. */
a0270393
KW
2665
2666 UV original; /* To store the first code point of <p> */
2667
2668 /* Look at every character in the result; if any cross the
2669 * boundary, the whole thing is disallowed */
2670 U8* s = ustrp;
2671 U8* e = ustrp + *lenp;
2672 while (s < e) {
2673 if (isASCII(*s)) {
2674 /* Crossed, have to return the original */
2675 original = valid_utf8_to_uvchr(p, lenp);
1ca267a5 2676
9fc2026f 2677 /* But in these instances, there is an alternative we can
1ca267a5 2678 * return that is valid */
0766489e
KW
2679 if (original == LATIN_SMALL_LETTER_SHARP_S
2680#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
2681 || original == LATIN_CAPITAL_LETTER_SHARP_S
2682#endif
2683 ) {
1ca267a5
KW
2684 goto return_long_s;
2685 }
9fc2026f
KW
2686 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
2687 goto return_ligature_st;
2688 }
74894415
KW
2689#if UNICODE_MAJOR_VERSION == 3 \
2690 && UNICODE_DOT_VERSION == 0 \
2691 && UNICODE_DOT_DOT_VERSION == 1
2692
2693 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
2694 goto return_dotless_i;
2695 }
2696#endif
a0270393
KW
2697 Copy(p, ustrp, *lenp, char);
2698 return original;
2699 }
2700 s += UTF8SKIP(s);
2701 }
051a06d4 2702
a0270393
KW
2703 /* Here, no characters crossed, result is ok as-is */
2704 return result;
2705 }
051a06d4
KW
2706 }
2707
4a4088c4 2708 /* Here, used locale rules. Convert back to UTF-8 */
051a06d4
KW
2709 if (UTF8_IS_INVARIANT(result)) {
2710 *ustrp = (U8) result;
2711 *lenp = 1;
2712 }
2713 else {
62cb07ea
KW
2714 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
2715 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
051a06d4
KW
2716 *lenp = 2;
2717 }
2718
051a06d4 2719 return result;
1ca267a5
KW
2720
2721 return_long_s:
2722 /* Certain folds to 'ss' are prohibited by the options, but they do allow
2723 * folds to a string of two of these characters. By returning this
2724 * instead, then, e.g.,
2725 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
2726 * works. */
2727
2728 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2729 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2730 ustrp, *lenp, U8);
2731 return LATIN_SMALL_LETTER_LONG_S;
9fc2026f
KW
2732
2733 return_ligature_st:
2734 /* Two folds to 'st' are prohibited by the options; instead we pick one and
2735 * have the other one fold to it */
2736
2737 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
2738 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
2739 return LATIN_SMALL_LIGATURE_ST;
74894415
KW
2740
2741#if UNICODE_MAJOR_VERSION == 3 \
2742 && UNICODE_DOT_VERSION == 0 \
2743 && UNICODE_DOT_DOT_VERSION == 1
2744
2745 return_dotless_i:
2746 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
2747 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
2748 return LATIN_SMALL_LETTER_DOTLESS_I;
2749
2750#endif
2751
a0ed51b3
LW
2752}
2753
711a919c 2754/* Note:
f90a9a02 2755 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
711a919c
TS
2756 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
2757 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
2758 */
c4a5db0c 2759
a0ed51b3 2760SV*
7fc63493 2761Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 2762{
c4a5db0c
KW
2763 PERL_ARGS_ASSERT_SWASH_INIT;
2764
2765 /* Returns a copy of a swash initiated by the called function. This is the
2766 * public interface, and returning a copy prevents others from doing
2767 * mischief on the original */
2768
5d3d13d1 2769 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none, NULL, NULL));
c4a5db0c
KW
2770}
2771
2772SV*
5d3d13d1 2773Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none, SV* invlist, U8* const flags_p)
c4a5db0c 2774{
2c1f00b9
YO
2775
2776 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
2777 * use the following define */
2778
2779#define CORE_SWASH_INIT_RETURN(x) \
2780 PL_curpm= old_PL_curpm; \
2781 return x
2782
c4a5db0c 2783 /* Initialize and return a swash, creating it if necessary. It does this
87367d5f
KW
2784 * by calling utf8_heavy.pl in the general case. The returned value may be
2785 * the swash's inversion list instead if the input parameters allow it.
2786 * Which is returned should be immaterial to callers, as the only
923b6d4e
KW
2787 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
2788 * and swash_to_invlist() handle both these transparently.
c4a5db0c
KW
2789 *
2790 * This interface should only be used by functions that won't destroy or
2791 * adversely change the swash, as doing so affects all other uses of the
2792 * swash in the program; the general public should use 'Perl_swash_init'
2793 * instead.
2794 *
2795 * pkg is the name of the package that <name> should be in.
2796 * name is the name of the swash to find. Typically it is a Unicode
2797 * property name, including user-defined ones
2798 * listsv is a string to initialize the swash with. It must be of the form
2799 * documented as the subroutine return value in
2800 * L<perlunicode/User-Defined Character Properties>
2801 * minbits is the number of bits required to represent each data element.
2802 * It is '1' for binary properties.
2803 * none I (khw) do not understand this one, but it is used only in tr///.
9a53f6cf 2804 * invlist is an inversion list to initialize the swash with (or NULL)
83199d38
KW
2805 * flags_p if non-NULL is the address of various input and output flag bits
2806 * to the routine, as follows: ('I' means is input to the routine;
2807 * 'O' means output from the routine. Only flags marked O are
2808 * meaningful on return.)
2809 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
2810 * came from a user-defined property. (I O)
5d3d13d1
KW
2811 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
2812 * when the swash cannot be located, to simply return NULL. (I)
87367d5f
KW
2813 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
2814 * return of an inversion list instead of a swash hash if this routine
2815 * thinks that would result in faster execution of swash_fetch() later
2816 * on. (I)
9a53f6cf
KW
2817 *
2818 * Thus there are three possible inputs to find the swash: <name>,
2819 * <listsv>, and <invlist>. At least one must be specified. The result
2820 * will be the union of the specified ones, although <listsv>'s various
aabbdbda
KW
2821 * actions can intersect, etc. what <name> gives. To avoid going out to
2822 * disk at all, <invlist> should specify completely what the swash should
2823 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
9a53f6cf
KW
2824 *
2825 * <invlist> is only valid for binary properties */
c4a5db0c 2826
2c1f00b9
YO
2827 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
2828
c4a5db0c 2829 SV* retval = &PL_sv_undef;
83199d38 2830 HV* swash_hv = NULL;
87367d5f
KW
2831 const int invlist_swash_boundary =
2832 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
2833 ? 512 /* Based on some benchmarking, but not extensive, see commit
2834 message */
2835 : -1; /* Never return just an inversion list */
9a53f6cf
KW
2836
2837 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
2838 assert(! invlist || minbits == 1);
2839
2c1f00b9
YO
2840 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the regex
2841 that triggered the swash init and the swash init perl logic itself.
2842 See perl #122747 */
2843
9a53f6cf
KW
2844 /* If data was passed in to go out to utf8_heavy to find the swash of, do
2845 * so */
2846 if (listsv != &PL_sv_undef || strNE(name, "")) {
69794297
KW
2847 dSP;
2848 const size_t pkg_len = strlen(pkg);
2849 const size_t name_len = strlen(name);
2850 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
2851 SV* errsv_save;
2852 GV *method;
2853
2854 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
2855
2856 PUSHSTACKi(PERLSI_MAGIC);
ce3b816e 2857 ENTER;
69794297 2858 SAVEHINTS();
2782061f 2859 save_re_context();
650f067c
JL
2860 /* We might get here via a subroutine signature which uses a utf8
2861 * parameter name, at which point PL_subname will have been set
2862 * but not yet used. */
2863 save_item(PL_subname);
69794297
KW
2864 if (PL_parser && PL_parser->error_count)
2865 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
2866 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
4a4088c4 2867 if (!method) { /* demand load UTF-8 */
69794297 2868 ENTER;
db2c6cb3
FC
2869 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2870 GvSV(PL_errgv) = NULL;
1a419e6b 2871#ifndef NO_TAINT_SUPPORT
69794297
KW
2872 /* It is assumed that callers of this routine are not passing in
2873 * any user derived data. */
2782061f
DM
2874 /* Need to do this after save_re_context() as it will set
2875 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
2876 * in Perl_magic_get). Even line to create errsv_save can turn on
2877 * PL_tainted. */
284167a5
S
2878 SAVEBOOL(TAINT_get);
2879 TAINT_NOT;
2880#endif
69794297
KW
2881 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
2882 NULL);
eed484f9 2883 {
db2c6cb3
FC
2884 /* Not ERRSV, as there is no need to vivify a scalar we are
2885 about to discard. */
2886 SV * const errsv = GvSV(PL_errgv);
2887 if (!SvTRUE(errsv)) {
2888 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2889 SvREFCNT_dec(errsv);
2890 }
eed484f9 2891 }
69794297
KW
2892 LEAVE;
2893 }
2894 SPAGAIN;
2895 PUSHMARK(SP);
2896 EXTEND(SP,5);
2897 mPUSHp(pkg, pkg_len);
2898 mPUSHp(name, name_len);
2899 PUSHs(listsv);
2900 mPUSHi(minbits);
2901 mPUSHi(none);
2902 PUTBACK;
db2c6cb3
FC
2903 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
2904 GvSV(PL_errgv) = NULL;
69794297
KW
2905 /* If we already have a pointer to the method, no need to use
2906 * call_method() to repeat the lookup. */
c41800a8
KW
2907 if (method
2908 ? call_sv(MUTABLE_SV(method), G_SCALAR)
69794297
KW
2909 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
2910 {
2911 retval = *PL_stack_sp--;
2912 SvREFCNT_inc(retval);
2913 }
eed484f9 2914 {
db2c6cb3
FC
2915 /* Not ERRSV. See above. */
2916 SV * const errsv = GvSV(PL_errgv);
2917 if (!SvTRUE(errsv)) {
2918 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
2919 SvREFCNT_dec(errsv);
2920 }
eed484f9 2921 }
ce3b816e 2922 LEAVE;
69794297
KW
2923 POPSTACK;
2924 if (IN_PERL_COMPILETIME) {
2925 CopHINTS_set(PL_curcop, PL_hints);
2926 }
2927 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
d95e4a00 2928 if (SvPOK(retval)) {
69794297
KW
2929
2930 /* If caller wants to handle missing properties, let them */
5d3d13d1 2931 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
2c1f00b9 2932 CORE_SWASH_INIT_RETURN(NULL);
69794297
KW
2933 }
2934 Perl_croak(aTHX_
2935 "Can't find Unicode property definition \"%"SVf"\"",
2936 SVfARG(retval));
a25b5927 2937 NOT_REACHED; /* NOTREACHED */
d95e4a00 2938 }
69794297 2939 }
9a53f6cf 2940 } /* End of calling the module to find the swash */
36eb48b4 2941
83199d38
KW
2942 /* If this operation fetched a swash, and we will need it later, get it */
2943 if (retval != &PL_sv_undef
2944 && (minbits == 1 || (flags_p
2945 && ! (*flags_p
2946 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
2947 {
2948 swash_hv = MUTABLE_HV(SvRV(retval));
2949
2950 /* If we don't already know that there is a user-defined component to
2951 * this swash, and the user has indicated they wish to know if there is
2952 * one (by passing <flags_p>), find out */
2953 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
2954 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
2955 if (user_defined && SvUV(*user_defined)) {
2956 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
2957 }
2958 }
2959 }
2960
36eb48b4
KW
2961 /* Make sure there is an inversion list for binary properties */
2962 if (minbits == 1) {
2963 SV** swash_invlistsvp = NULL;
2964 SV* swash_invlist = NULL;
9a53f6cf 2965 bool invlist_in_swash_is_valid = FALSE;
02c85471
FC
2966 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
2967 an unclaimed reference count */
36eb48b4 2968
9a53f6cf 2969 /* If this operation fetched a swash, get its already existing
83199d38 2970 * inversion list, or create one for it */
36eb48b4 2971
83199d38 2972 if (swash_hv) {
5c9f4bd2 2973 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
9a53f6cf
KW
2974 if (swash_invlistsvp) {
2975 swash_invlist = *swash_invlistsvp;
2976 invlist_in_swash_is_valid = TRUE;
2977 }
2978 else {
36eb48b4 2979 swash_invlist = _swash_to_invlist(retval);
02c85471 2980 swash_invlist_unclaimed = TRUE;
9a53f6cf
KW
2981 }
2982 }
2983
2984 /* If an inversion list was passed in, have to include it */
2985 if (invlist) {
2986
2987 /* Any fetched swash will by now have an inversion list in it;
2988 * otherwise <swash_invlist> will be NULL, indicating that we
2989 * didn't fetch a swash */
2990 if (swash_invlist) {
2991
2992 /* Add the passed-in inversion list, which invalidates the one
2993 * already stored in the swash */
2994 invlist_in_swash_is_valid = FALSE;
2995 _invlist_union(invlist, swash_invlist, &swash_invlist);
2996 }
2997 else {
2998
87367d5f
KW
2999 /* Here, there is no swash already. Set up a minimal one, if
3000 * we are going to return a swash */
3001 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
971d486f 3002 swash_hv = newHV();
4aca0fe6 3003 retval = newRV_noinc(MUTABLE_SV(swash_hv));
87367d5f 3004 }
9a53f6cf
KW
3005 swash_invlist = invlist;
3006 }
9a53f6cf
KW
3007 }
3008
3009 /* Here, we have computed the union of all the passed-in data. It may
3010 * be that there was an inversion list in the swash which didn't get
538e84ed 3011 * touched; otherwise save the computed one */
87367d5f
KW
3012 if (! invlist_in_swash_is_valid
3013 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3014 {
5c9f4bd2 3015 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
69794297
KW
3016 {
3017 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3018 }
cc34d8c5
FC
3019 /* We just stole a reference count. */
3020 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3021 else SvREFCNT_inc_simple_void_NN(swash_invlist);
9a53f6cf 3022 }
87367d5f 3023
dbfdbd26
KW
3024 SvREADONLY_on(swash_invlist);
3025
c41800a8 3026 /* Use the inversion list stand-alone if small enough */
87367d5f
KW
3027 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3028 SvREFCNT_dec(retval);
02c85471
FC
3029 if (!swash_invlist_unclaimed)
3030 SvREFCNT_inc_simple_void_NN(swash_invlist);
3031 retval = newRV_noinc(swash_invlist);
87367d5f 3032 }
36eb48b4
KW
3033 }
3034
2c1f00b9
YO
3035 CORE_SWASH_INIT_RETURN(retval);
3036#undef CORE_SWASH_INIT_RETURN
a0ed51b3
LW
3037}
3038
035d37be
JH
3039
3040/* This API is wrong for special case conversions since we may need to
3041 * return several Unicode characters for a single Unicode character
3042 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3043 * the lower-level routine, and it is similarly broken for returning
38684baa 3044 * multiple values. --jhi
b9992569 3045 * For those, you should use S__to_utf8_case() instead */
b0e3252e 3046/* Now SWASHGET is recasted into S_swatch_get in this file. */
680c470c
TS
3047
3048/* Note:
3049 * Returns the value of property/mapping C<swash> for the first character
3050 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
4a4088c4 3051 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3d0f8846 3052 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
af2af982
KW
3053 *
3054 * A "swash" is a hash which contains initially the keys/values set up by
3055 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3056 * property for all possible code points. Things are stored in a compact form
3057 * (see utf8_heavy.pl) so that calculation is required to find the actual
3058 * property value for a given code point. As code points are looked up, new
3059 * key/value pairs are added to the hash, so that the calculation doesn't have
3060 * to ever be re-done. Further, each calculation is done, not just for the
3061 * desired one, but for a whole block of code points adjacent to that one.
3062 * For binary properties on ASCII machines, the block is usually for 64 code
3063 * points, starting with a code point evenly divisible by 64. Thus if the
3064 * property value for code point 257 is requested, the code goes out and
3065 * calculates the property values for all 64 code points between 256 and 319,
3066 * and stores these as a single 64-bit long bit vector, called a "swatch",
3067 * under the key for code point 256. The key is the UTF-8 encoding for code
3068 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3069 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3070 * for code point 258 is then requested, this code realizes that it would be
3071 * stored under the key for 256, and would find that value and extract the
3072 * relevant bit, offset from 256.
3073 *
3074 * Non-binary properties are stored in as many bits as necessary to represent
3075 * their values (32 currently, though the code is more general than that), not
fc273927 3076 * as single bits, but the principle is the same: the value for each key is a
af2af982
KW
3077 * vector that encompasses the property values for all code points whose UTF-8
3078 * representations are represented by the key. That is, for all code points
3079 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3080 * bytes of that.
680c470c 3081 */
a0ed51b3 3082UV
680c470c 3083Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 3084{
ef8f7699 3085 HV *const hv = MUTABLE_HV(SvRV(swash));
3568d838
JH
3086 U32 klen;
3087 U32 off;
9b56a019 3088 STRLEN slen = 0;
7d85a32c 3089 STRLEN needents;
cfd0369c 3090 const U8 *tmps = NULL;
979f2922 3091 SV *swatch;
08fb1ac5 3092 const U8 c = *ptr;
3568d838 3093
7918f24d
NC
3094 PERL_ARGS_ASSERT_SWASH_FETCH;
3095
87367d5f
KW
3096 /* If it really isn't a hash, it isn't really swash; must be an inversion
3097 * list */
3098 if (SvTYPE(hv) != SVt_PVHV) {
3099 return _invlist_contains_cp((SV*)hv,
3100 (do_utf8)
3101 ? valid_utf8_to_uvchr(ptr, NULL)
3102 : c);
3103 }
3104
08fb1ac5
KW
3105 /* We store the values in a "swatch" which is a vec() value in a swash
3106 * hash. Code points 0-255 are a single vec() stored with key length
3107 * (klen) 0. All other code points have a UTF-8 representation
3108 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3109 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3110 * length for them is the length of the encoded char - 1. ptr[klen] is the
3111 * final byte in the sequence representing the character */
3112 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3113 klen = 0;
3114 needents = 256;
3115 off = c;
3568d838 3116 }
08fb1ac5
KW
3117 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3118 klen = 0;
3119 needents = 256;
a62b247b 3120 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
979f2922
TS
3121 }
3122 else {
08fb1ac5
KW
3123 klen = UTF8SKIP(ptr) - 1;
3124
3125 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3126 * the vec is the final byte in the sequence. (In EBCDIC this is
3127 * converted to I8 to get consecutive values.) To help you visualize
3128 * all this:
3129 * Straight 1047 After final byte
3130 * UTF-8 UTF-EBCDIC I8 transform
3131 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3132 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3133 * ...
3134 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3135 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
3136 * ...
3137 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
3138 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
3139 * ...
3140 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
3141 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
3142 * ...
3143 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
3144 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
3145 *
3146 * (There are no discontinuities in the elided (...) entries.)
3147 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
3148 * key for the next 31, up through U+043F, whose UTF-8 final byte is
3149 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
3150 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
3151 * index into the vec() swatch (after subtracting 0x80, which we
3152 * actually do with an '&').
3153 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
3154 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
3155 * dicontinuities which go away by transforming it into I8, and we
3156 * effectively subtract 0xA0 to get the index. */
979f2922 3157 needents = (1 << UTF_ACCUMULATION_SHIFT);
bc3632a8 3158 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
979f2922 3159 }
7d85a32c 3160
a0ed51b3 3161 /*
4a4088c4 3162 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
a0ed51b3
LW
3163 * suite. (That is, only 7-8% overall over just a hash cache. Still,
3164 * it's nothing to sniff at.) Pity we usually come through at least
3165 * two function calls to get here...
3166 *
3167 * NB: this code assumes that swatches are never modified, once generated!
3168 */
3169
3568d838 3170 if (hv == PL_last_swash_hv &&
a0ed51b3 3171 klen == PL_last_swash_klen &&
27da23d5 3172 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
3173 {
3174 tmps = PL_last_swash_tmps;
3175 slen = PL_last_swash_slen;
3176 }
3177 else {
3178 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 3179 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 3180
b0e3252e 3181 /* If not cached, generate it via swatch_get */
979f2922 3182 if (!svp || !SvPOK(*svp)
08fb1ac5
KW
3183 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
3184 {
3185 if (klen) {
3186 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
3187 swatch = swatch_get(swash,
3188 code_point & ~((UV)needents - 1),
3189 needents);
3190 }
3191 else { /* For the first 256 code points, the swatch has a key of
3192 length 0 */
3193 swatch = swatch_get(swash, 0, needents);
3194 }
979f2922 3195
923e4eb5 3196 if (IN_PERL_COMPILETIME)
623e6609 3197 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 3198
979f2922 3199 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 3200
979f2922
TS
3201 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
3202 || (slen << 3) < needents)
5637ef5b
NC
3203 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
3204 "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
3205 svp, tmps, (UV)slen, (UV)needents);
a0ed51b3
LW
3206 }
3207
3208 PL_last_swash_hv = hv;
16d8f38a 3209 assert(klen <= sizeof(PL_last_swash_key));
eac04b2e 3210 PL_last_swash_klen = (U8)klen;
cfd0369c
NC
3211 /* FIXME change interpvar.h? */
3212 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
3213 PL_last_swash_slen = slen;
3214 if (klen)
3215 Copy(ptr, PL_last_swash_key, klen, U8);
3216 }
3217
9faf8d75 3218 switch ((int)((slen << 3) / needents)) {
a0ed51b3 3219 case 1:
e7aca353 3220 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
a0ed51b3 3221 case 8:
e7aca353 3222 return ((UV) tmps[off]);
a0ed51b3
LW
3223 case 16:
3224 off <<= 1;
e7aca353
JH
3225 return
3226 ((UV) tmps[off ] << 8) +
3227 ((UV) tmps[off + 1]);
a0ed51b3
LW
3228 case 32:
3229 off <<= 2;
e7aca353
JH
3230 return
3231 ((UV) tmps[off ] << 24) +
3232 ((UV) tmps[off + 1] << 16) +
3233 ((UV) tmps[off + 2] << 8) +
3234 ((UV) tmps[off + 3]);
a0ed51b3 3235 }
5637ef5b
NC
3236 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
3237 "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
670f1322 3238 NORETURN_FUNCTION_END;
a0ed51b3 3239}
2b9d42f0 3240
319009ee
KW
3241/* Read a single line of the main body of the swash input text. These are of
3242 * the form:
3243 * 0053 0056 0073
3244 * where each number is hex. The first two numbers form the minimum and
3245 * maximum of a range, and the third is the value associated with the range.
3246 * Not all swashes should have a third number
3247 *
3248 * On input: l points to the beginning of the line to be examined; it points
3249 * to somewhere in the string of the whole input text, and is
3250 * terminated by a \n or the null string terminator.
3251 * lend points to the null terminator of that string
3252 * wants_value is non-zero if the swash expects a third number
3253 * typestr is the name of the swash's mapping, like 'ToLower'
3254 * On output: *min, *max, and *val are set to the values read from the line.
3255 * returns a pointer just beyond the line examined. If there was no
3256 * valid min number on the line, returns lend+1
3257 */
3258
3259STATIC U8*
3260S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
3261 const bool wants_value, const U8* const typestr)
3262{
3263 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
3264 STRLEN numlen; /* Length of the number */
02470786
KW
3265 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
3266 | PERL_SCAN_DISALLOW_PREFIX
3267 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3268
3269 /* nl points to the next \n in the scan */
3270 U8* const nl = (U8*)memchr(l, '\n', lend - l);
3271
95543e92
KW
3272 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
3273
319009ee
KW
3274 /* Get the first number on the line: the range minimum */
3275 numlen = lend - l;
3276 *min = grok_hex((char *)l, &numlen, &flags, NULL);
c88850db 3277 *max = *min; /* So can never return without setting max */
319009ee
KW
3278 if (numlen) /* If found a hex number, position past it */
3279 l += numlen;
3280 else if (nl) { /* Else, go handle next line, if any */
3281 return nl + 1; /* 1 is length of "\n" */
3282 }
3283 else { /* Else, no next line */
3284 return lend + 1; /* to LIST's end at which \n is not found */
3285 }
3286
3287 /* The max range value follows, separated by a BLANK */
3288 if (isBLANK(*l)) {
3289 ++l;
02470786
KW
3290 flags = PERL_SCAN_SILENT_ILLDIGIT
3291 | PERL_SCAN_DISALLOW_PREFIX
3292 | PERL_SCAN_SILENT_NON_PORTABLE;
319009ee
KW
3293 numlen = lend - l;
3294 *max = grok_hex((char *)l, &numlen, &flags, NULL);
3295 if (numlen)
3296 l += numlen;
3297 else /* If no value here, it is a single element range */
3298 *max = *min;
3299
3300 /* Non-binary tables have a third entry: what the first element of the
24303724 3301 * range maps to. The map for those currently read here is in hex */
319009ee
KW
3302 if (wants_value) {
3303 if (isBLANK(*l)) {
3304 ++l;
f2a7d0fc
KW
3305 flags = PERL_SCAN_SILENT_ILLDIGIT
3306 | PERL_SCAN_DISALLOW_PREFIX
3307 | PERL_SCAN_SILENT_NON_PORTABLE;
3308 numlen = lend - l;
3309 *val = grok_hex((char *)l, &numlen, &flags, NULL);
3310 if (numlen)
3311 l += numlen;
3312 else
3313 *val = 0;
319009ee
KW
3314 }
3315 else {
3316 *val = 0;
3317 if (typeto) {
dcbac5bb 3318 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3319 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
3320 typestr, l);
3321 }
3322 }
3323 }
3324 else
3325 *val = 0; /* bits == 1, then any val should be ignored */
3326 }
3327 else { /* Nothing following range min, should be single element with no
3328 mapping expected */
319009ee
KW
3329 if (wants_value) {
3330 *val = 0;
3331 if (typeto) {
dcbac5bb 3332 /* diag_listed_as: To%s: illegal mapping '%s' */
319009ee
KW
3333 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
3334 }
3335 }
3336 else
3337 *val = 0; /* bits == 1, then val should be ignored */
3338 }
3339
3340 /* Position to next line if any, or EOF */
3341 if (nl)
3342 l = nl + 1;
3343 else
3344 l = lend;
3345
3346 return l;
3347}
3348
979f2922
TS
3349/* Note:
3350 * Returns a swatch (a bit vector string) for a code point sequence
3351 * that starts from the value C<start> and comprises the number C<span>.
3352 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
3353 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
3354 */
3355STATIC SV*
b0e3252e 3356S_swatch_get(pTHX_ SV* swash, UV start, UV span)
979f2922
TS
3357{
3358 SV *swatch;
77f9f126 3359 U8 *l, *lend, *x, *xend, *s, *send;
979f2922 3360 STRLEN lcur, xcur, scur;
ef8f7699 3361 HV *const hv = MUTABLE_HV(SvRV(swash));
5c9f4bd2 3362 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
36eb48b4 3363
88d45d28
KW
3364 SV** listsvp = NULL; /* The string containing the main body of the table */
3365 SV** extssvp = NULL;
3366 SV** invert_it_svp = NULL;
3367 U8* typestr = NULL;
786861f5
KW
3368 STRLEN bits;
3369 STRLEN octets; /* if bits == 1, then octets == 0 */
3370 UV none;
3371 UV end = start + span;
972dd592 3372
36eb48b4 3373 if (invlistsvp == NULL) {
786861f5
KW
3374 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
3375 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
3376 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
3377 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
3378 listsvp = hv_fetchs(hv, "LIST", FALSE);
3379 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
3380
3381 bits = SvUV(*bitssvp);
3382 none = SvUV(*nonesvp);
3383 typestr = (U8*)SvPV_nolen(*typesvp);
3384 }
36eb48b4
KW
3385 else {
3386 bits = 1;
3387 none = 0;
3388 }
786861f5 3389 octets = bits >> 3; /* if bits == 1, then octets == 0 */
979f2922 3390
b0e3252e 3391 PERL_ARGS_ASSERT_SWATCH_GET;
7918f24d 3392
979f2922 3393 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
b0e3252e 3394 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %"UVuf,
660a4616 3395 (UV)bits);
979f2922
TS
3396 }
3397
84ea5ef6
KW
3398 /* If overflowed, use the max possible */
3399 if (end < start) {
3400 end = UV_MAX;
3401 span = end - start;
3402 }
3403
979f2922 3404 /* create and initialize $swatch */
979f2922 3405 scur = octets ? (span * octets) : (span + 7) / 8;
e524fe40
NC
3406 swatch = newSV(scur);
3407 SvPOK_on(swatch);
979f2922
TS
3408 s = (U8*)SvPVX(swatch);
3409 if (octets && none) {
0bd48802 3410 const U8* const e = s + scur;
979f2922
TS
3411 while (s < e) {
3412 if (bits == 8)
3413 *s++ = (U8)(none & 0xff);
3414 else if (bits == 16) {
3415 *s++ = (U8)((none >> 8) & 0xff);
3416 *s++ = (U8)( none & 0xff);
3417 }
3418 else if (bits == 32) {
3419 *s++ = (U8)((none >> 24) & 0xff);
3420 *s++ = (U8)((none >> 16) & 0xff);
3421 *s++ = (U8)((none >> 8) & 0xff);
3422 *s++ = (U8)( none & 0xff);
3423 }
3424 }
3425 *s = '\0';
3426 }
3427 else {
3428 (void)memzero((U8*)s, scur + 1);
3429 }
3430 SvCUR_set(swatch, scur);
3431 s = (U8*)SvPVX(swatch);
3432
36eb48b4
KW
3433 if (invlistsvp) { /* If has an inversion list set up use that */
3434 _invlist_populate_swatch(*invlistsvp, start, end, s);
3435 return swatch;
3436 }
3437
3438 /* read $swash->{LIST} */
979f2922
TS
3439 l = (U8*)SvPV(*listsvp, lcur);
3440 lend = l + lcur;
3441 while (l < lend) {
8ed25d53 3442 UV min, max, val, upper;
95543e92
KW
3443 l = swash_scan_list_line(l, lend, &min, &max, &val,
3444 cBOOL(octets), typestr);
319009ee 3445 if (l > lend) {
979f2922
TS
3446 break;
3447 }
3448
972dd592 3449 /* If looking for something beyond this range, go try the next one */
979f2922
TS
3450 if (max < start)
3451 continue;
3452
8ed25d53
KW
3453 /* <end> is generally 1 beyond where we want to set things, but at the
3454 * platform's infinity, where we can't go any higher, we want to
3455 * include the code point at <end> */
3456 upper = (max < end)
3457 ? max
3458 : (max != UV_MAX || end != UV_MAX)
3459 ? end - 1
3460 : end;
3461
979f2922 3462 if (octets) {
35da51f7 3463 UV key;
979f2922
TS
3464 if (min < start) {
3465 if (!none || val < none) {
3466 val += start - min;
3467 }
3468 min = start;
3469 }
8ed25d53 3470 for (key = min; key <= upper; key++) {
979f2922 3471 STRLEN offset;
979f2922
TS
3472 /* offset must be non-negative (start <= min <= key < end) */
3473 offset = octets * (key - start);
3474 if (bits == 8)
3475 s[offset] = (U8)(val & 0xff);
3476 else if (bits == 16) {
3477 s[offset ] = (U8)((val >> 8) & 0xff);
3478 s[offset + 1] = (U8)( val & 0xff);
3479 }
3480 else if (bits == 32) {
3481 s[offset ] = (U8)((val >> 24) & 0xff);
3482 s[offset + 1] = (U8)((val >> 16) & 0xff);
3483 s[offset + 2] = (U8)((val >> 8) & 0xff);
3484 s[offset + 3] = (U8)( val & 0xff);
3485 }
3486
3487 if (!none || val < none)
3488 ++val;
3489 }
3490 }
711a919c 3491 else { /* bits == 1, then val should be ignored */
35da51f7 3492 UV key;
979f2922
TS
3493 if (min < start)
3494 min = start;
6cb05c12 3495
8ed25d53 3496 for (key = min; key <= upper; key++) {
0bd48802 3497 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
3498 s[offset >> 3] |= 1 << (offset & 7);
3499 }
3500 }
3501 } /* while */
979f2922 3502
9479a769 3503 /* Invert if the data says it should be. Assumes that bits == 1 */
77f9f126 3504 if (invert_it_svp && SvUV(*invert_it_svp)) {
0bda3001
KW
3505
3506 /* Unicode properties should come with all bits above PERL_UNICODE_MAX
3507 * be 0, and their inversion should also be 0, as we don't succeed any
3508 * Unicode property matches for non-Unicode code points */
3509 if (start <= PERL_UNICODE_MAX) {
3510
3511 /* The code below assumes that we never cross the
3512 * Unicode/above-Unicode boundary in a range, as otherwise we would
3513 * have to figure out where to stop flipping the bits. Since this
3514 * boundary is divisible by a large power of 2, and swatches comes
3515 * in small powers of 2, this should be a valid assumption */
3516 assert(start + span - 1 <= PERL_UNICODE_MAX);
3517
507a8485
KW
3518 send = s + scur;
3519 while (s < send) {
3520 *s = ~(*s);
3521 s++;
3522 }
0bda3001 3523 }
77f9f126
KW
3524 }
3525
d73c39c5
KW
3526 /* read $swash->{EXTRAS}
3527 * This code also copied to swash_to_invlist() below */
979f2922
TS
3528 x = (U8*)SvPV(*extssvp, xcur);
3529 xend = x + xcur;
3530 while (x < xend) {
3531 STRLEN namelen;
3532 U8 *namestr;
3533 SV** othersvp;
3534 HV* otherhv;
3535 STRLEN otherbits;
3536 SV **otherbitssvp, *other;
711a919c 3537 U8 *s, *o, *nl;
979f2922
TS
3538 STRLEN slen, olen;
3539
35da51f7 3540 const U8 opc = *x++;
979f2922
TS
3541 if (opc == '\n')
3542 continue;
3543
3544 nl = (U8*)memchr(x, '\n', xend - x);
3545
3546 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
3547 if (nl) {
3548 x = nl + 1; /* 1 is length of "\n" */
3549 continue;
3550 }
3551 else {
3552 x = xend; /* to EXTRAS' end at which \n is not found */
3553 break;
3554 }
3555 }
3556
3557 namestr = x;
3558 if (nl) {
3559 namelen = nl - namestr;
3560 x = nl + 1;
3561 }
3562 else {
3563 namelen = xend - namestr;
3564 x = xend;
3565 }
3566
3567 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
ef8f7699 3568 otherhv = MUTABLE_HV(SvRV(*othersvp));
017a3ce5 3569 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
3570 otherbits = (STRLEN)SvUV(*otherbitssvp);
3571 if (bits < otherbits)
5637ef5b
NC
3572 Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
3573 "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
979f2922
TS
3574
3575 /* The "other" swatch must be destroyed after. */
b0e3252e 3576 other = swatch_get(*othersvp, start, span);
979f2922
TS
3577 o = (U8*)SvPV(other, olen);
3578
3579 if (!olen)
b0e3252e 3580 Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
979f2922
TS
3581
3582 s = (U8*)SvPV(swatch, slen);
3583 if (bits == 1 && otherbits == 1) {
3584 if (slen != olen)
5637ef5b
NC
3585 Perl_croak(aTHX_ "panic: swatch_get found swatch length "
3586 "mismatch, slen=%"UVuf", olen=%"UVuf,
3587 (UV)slen, (UV)olen);
979f2922
TS
3588
3589 switch (opc) {
3590 case '+':
3591 while (slen--)
3592 *s++ |= *o++;
3593 break;
3594 case '!':
3595 while (slen--)
3596 *s++ |= ~*o++;
3597 break;
3598 case '-':
3599 while (slen--)
3600 *s++ &= ~*o++;
3601 break;
3602 case '&':
3603 while (slen--)
3604 *s++ &= *o++;
3605 break;
3606 default:
3607 break;
3608 }
3609 }
711a919c 3610 else {
979f2922
TS
3611 STRLEN otheroctets = otherbits >> 3;
3612 STRLEN offset = 0;
35da51f7 3613 U8* const send = s + slen;
979f2922
TS
3614
3615 while (s < send) {
3616 UV otherval = 0;
3617
3618 if (otherbits == 1) {
3619 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
3620 ++offset;
3621 }
3622 else {
3623 STRLEN vlen = otheroctets;
3624 otherval = *o++;
3625 while (--vlen) {
3626 otherval <<= 8;
3627 otherval |= *o++;
3628 }
3629 }
3630
711a919c 3631 if (opc == '+' && otherval)
6f207bd3 3632 NOOP; /* replace with otherval */
979f2922
TS
3633 else if (opc == '!' && !otherval)
3634 otherval = 1;
3635 else if (opc == '-' && otherval)
3636 otherval = 0;
3637 else if (opc == '&' && !otherval)
3638 otherval = 0;
3639 else {
711a919c 3640 s += octets; /* no replacement */
979f2922
TS
3641 continue;
3642 }
3643
3644 if (bits == 8)
3645 *s++ = (U8)( otherval & 0xff);
3646 else if (bits == 16) {
3647 *s++ = (U8)((otherval >> 8) & 0xff);
3648 *s++ = (U8)( otherval & 0xff);
3649 }
3650 else if (bits == 32) {
3651 *s++ = (U8)((otherval >> 24) & 0xff);
3652 *s++ = (U8)((otherval >> 16) & 0xff);
3653 *s++ = (U8)((otherval >> 8) & 0xff);
3654 *s++ = (U8)( otherval & 0xff);
3655 }
3656 }
3657 }
3658 sv_free(other); /* through with it! */
3659 } /* while */
3660 return swatch;
3661}
3662
064c021d 3663HV*
4c2e1131 3664Perl__swash_inversion_hash(pTHX_ SV* const swash)
064c021d
KW
3665{
3666
79a2a0e8 3667 /* Subject to change or removal. For use only in regcomp.c and regexec.c
5662e334
KW
3668 * Can't be used on a property that is subject to user override, as it
3669 * relies on the value of SPECIALS in the swash which would be set by
3670 * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
3671 * for overridden properties
064c021d
KW
3672 *
3673 * Returns a hash which is the inversion and closure of a swash mapping.
3674 * For example, consider the input lines:
3675 * 004B 006B
3676 * 004C 006C
3677 * 212A 006B
3678 *
4a4088c4 3679 * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
064c021d 3680 * 006C. The value for each key is an array. For 006C, the array would
4a4088c4
KW