3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
32 #define PERL_IN_UTF8_C
34 #include "invlist_inline.h"
36 static const char malformed_text[] = "Malformed UTF-8 character";
37 static const char unees[] =
38 "Malformed UTF-8 character (unexpected end of string)";
39 static const char cp_above_legal_max[] =
40 "Use of code point 0x%" UVXf " is deprecated; the permissible max is 0x%" UVXf ". This will be fatal in Perl 5.28";
42 #define MAX_NON_DEPRECATED_CP ((UV) (IV_MAX))
45 =head1 Unicode Support
46 These are various utility functions for manipulating UTF8-encoded
47 strings. For the uninitiated, this is a method of representing arbitrary
48 Unicode characters as a variable number of bytes, in such a way that
49 characters in the ASCII range are unmodified, and a zero byte never appears
50 within non-zero characters.
56 Perl__force_out_malformed_utf8_message(pTHX_
57 const U8 *const p, /* First byte in UTF-8 sequence */
58 const U8 * const e, /* Final byte in sequence (may include
60 const U32 flags, /* Flags to pass to utf8n_to_uvchr(),
61 usually 0, or some DISALLOW flags */
62 const bool die_here) /* If TRUE, this function does not return */
64 /* This core-only function is to be called when a malformed UTF-8 character
65 * is found, in order to output the detailed information about the
66 * malformation before dieing. The reason it exists is for the occasions
67 * when such a malformation is fatal, but warnings might be turned off, so
68 * that normally they would not be actually output. This ensures that they
69 * do get output. Because a sequence may be malformed in more than one
70 * way, multiple messages may be generated, so we can't make them fatal, as
71 * that would cause the first one to die.
73 * Instead we pretend -W was passed to perl, then die afterwards. The
74 * flexibility is here to return to the caller so they can finish up and
78 PERL_ARGS_ASSERT__FORCE_OUT_MALFORMED_UTF8_MESSAGE;
84 PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
86 PL_curcop->cop_warnings = pWARN_ALL;
89 (void) utf8n_to_uvchr_error(p, e - p, NULL, flags & ~UTF8_CHECK_ONLY, &errors);
94 Perl_croak(aTHX_ "panic: _force_out_malformed_utf8_message should"
95 " be called only when there are errors found");
99 Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
104 =for apidoc uvoffuni_to_utf8_flags
106 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
107 Instead, B<Almost all code should use L</uvchr_to_utf8> or
108 L</uvchr_to_utf8_flags>>.
110 This function is like them, but the input is a strict Unicode
111 (as opposed to native) code point. Only in very rare circumstances should code
112 not be using the native code point.
114 For details, see the description for L</uvchr_to_utf8_flags>.
119 /* All these formats take a single UV code point argument */
120 const char surrogate_cp_format[] = "UTF-16 surrogate U+%04" UVXf;
121 const char nonchar_cp_format[] = "Unicode non-character U+%04" UVXf
122 " is not recommended for open interchange";
123 const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
124 " may not be portable";
125 const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
126 " Unicode, requires a Perl extension," \
127 " and so is not portable";
129 #define HANDLE_UNICODE_SURROGATE(uv, flags) \
131 if (flags & UNICODE_WARN_SURROGATE) { \
132 Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE), \
133 surrogate_cp_format, uv); \
135 if (flags & UNICODE_DISALLOW_SURROGATE) { \
140 #define HANDLE_UNICODE_NONCHAR(uv, flags) \
142 if (flags & UNICODE_WARN_NONCHAR) { \
143 Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR), \
144 nonchar_cp_format, uv); \
146 if (flags & UNICODE_DISALLOW_NONCHAR) { \
151 /* Use shorter names internally in this file */
152 #define SHIFT UTF_ACCUMULATION_SHIFT
154 #define MARK UTF_CONTINUATION_MARK
155 #define MASK UTF_CONTINUATION_MASK
158 Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, const UV flags)
160 PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
162 if (OFFUNI_IS_INVARIANT(uv)) {
163 *d++ = LATIN1_TO_NATIVE(uv);
167 if (uv <= MAX_UTF8_TWO_BYTE) {
168 *d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
169 *d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
173 /* Not 2-byte; test for and handle 3-byte result. In the test immediately
174 * below, the 16 is for start bytes E0-EF (which are all the possible ones
175 * for 3 byte characters). The 2 is for 2 continuation bytes; these each
176 * contribute SHIFT bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000
177 * on ASCII; so 3 bytes covers the range 0x400-0x3FFF on EBCDIC;
178 * 0x800-0xFFFF on ASCII */
179 if (uv < (16 * (1U << (2 * SHIFT)))) {
180 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
181 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
182 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
184 #ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC, so
185 aren't tested here */
186 /* The most likely code points in this range are below the surrogates.
187 * Do an extra test to quickly exclude those. */
188 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
189 if (UNLIKELY( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv)
190 || UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv)))
192 HANDLE_UNICODE_NONCHAR(uv, flags);
194 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
195 HANDLE_UNICODE_SURROGATE(uv, flags);
202 /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
203 * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
204 * happen starting with 4-byte characters on ASCII platforms. We unify the
205 * code for these with EBCDIC, even though some of them require 5-bytes on
206 * those, because khw believes the code saving is worth the very slight
207 * performance hit on these high EBCDIC code points. */
209 if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
210 if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
211 && ckWARN_d(WARN_DEPRECATED))
213 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
214 cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
216 if ( (flags & UNICODE_WARN_SUPER)
217 || ( UNICODE_IS_PERL_EXTENDED(uv)
218 && (flags & UNICODE_WARN_PERL_EXTENDED)))
220 Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
222 /* Choose the more dire applicable warning */
223 (UNICODE_IS_PERL_EXTENDED(uv))
224 ? perl_extended_cp_format
228 if ( (flags & UNICODE_DISALLOW_SUPER)
229 || ( UNICODE_IS_PERL_EXTENDED(uv)
230 && (flags & UNICODE_DISALLOW_PERL_EXTENDED)))
235 else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
236 HANDLE_UNICODE_NONCHAR(uv, flags);
239 /* Test for and handle 4-byte result. In the test immediately below, the
240 * 8 is for start bytes F0-F7 (which are all the possible ones for 4 byte
241 * characters). The 3 is for 3 continuation bytes; these each contribute
242 * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
243 * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
244 * 0x1_0000-0x1F_FFFF on ASCII */
245 if (uv < (8 * (1U << (3 * SHIFT)))) {
246 *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
247 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
248 *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
249 *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
251 #ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
252 characters. The end-plane non-characters for EBCDIC were
253 handled just above */
254 if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
255 HANDLE_UNICODE_NONCHAR(uv, flags);
257 else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
258 HANDLE_UNICODE_SURROGATE(uv, flags);
265 /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
266 * platforms, and 0x4000 on EBCDIC. At this point we switch to a loop
267 * format. The unrolled version above turns out to not save all that much
268 * time, and at these high code points (well above the legal Unicode range
269 * on ASCII platforms, and well above anything in common use in EBCDIC),
270 * khw believes that less code outweighs slight performance gains. */
273 STRLEN len = OFFUNISKIP(uv);
276 *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
277 uv >>= UTF_ACCUMULATION_SHIFT;
279 *p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
285 =for apidoc uvchr_to_utf8
287 Adds the UTF-8 representation of the native code point C<uv> to the end
288 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
289 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
290 the byte after the end of the new character. In other words,
292 d = uvchr_to_utf8(d, uv);
294 is the recommended wide native character-aware way of saying
298 This function accepts any UV as input, but very high code points (above
299 C<IV_MAX> on the platform) will raise a deprecation warning. This is
300 typically 0x7FFF_FFFF in a 32-bit word.
302 It is possible to forbid or warn on non-Unicode code points, or those that may
303 be problematic by using L</uvchr_to_utf8_flags>.
308 /* This is also a macro */
309 PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
312 Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
314 return uvchr_to_utf8(d, uv);
318 =for apidoc uvchr_to_utf8_flags
320 Adds the UTF-8 representation of the native code point C<uv> to the end
321 of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
322 C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
323 the byte after the end of the new character. In other words,
325 d = uvchr_to_utf8_flags(d, uv, flags);
329 d = uvchr_to_utf8_flags(d, uv, 0);
331 This is the Unicode-aware way of saying
335 If C<flags> is 0, this function accepts any UV as input, but very high code
336 points (above C<IV_MAX> for the platform) will raise a deprecation warning.
337 This is typically 0x7FFF_FFFF in a 32-bit word.
339 Specifying C<flags> can further restrict what is allowed and not warned on, as
342 If C<uv> is a Unicode surrogate code point and C<UNICODE_WARN_SURROGATE> is set,
343 the function will raise a warning, provided UTF8 warnings are enabled. If
344 instead C<UNICODE_DISALLOW_SURROGATE> is set, the function will fail and return
345 NULL. If both flags are set, the function will both warn and return NULL.
347 Similarly, the C<UNICODE_WARN_NONCHAR> and C<UNICODE_DISALLOW_NONCHAR> flags
348 affect how the function handles a Unicode non-character.
350 And likewise, the C<UNICODE_WARN_SUPER> and C<UNICODE_DISALLOW_SUPER> flags
351 affect the handling of code points that are above the Unicode maximum of
352 0x10FFFF. Languages other than Perl may not be able to accept files that
355 The flag C<UNICODE_WARN_ILLEGAL_INTERCHANGE> selects all three of
356 the above WARN flags; and C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> selects all
357 three DISALLOW flags. C<UNICODE_DISALLOW_ILLEGAL_INTERCHANGE> restricts the
358 allowed inputs to the strict UTF-8 traditionally defined by Unicode.
359 Similarly, C<UNICODE_WARN_ILLEGAL_C9_INTERCHANGE> and
360 C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
361 above-Unicode and surrogate flags, but not the non-character ones, as
363 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
364 See L<perlunicode/Noncharacter code points>.
366 Extremely high code points were never specified in any standard, and require an
367 extension to UTF-8 to express, which Perl does. It is likely that programs
368 written in something other than Perl would not be able to read files that
369 contain these; nor would Perl understand files written by something that uses a
370 different extension. For these reasons, there is a separate set of flags that
371 can warn and/or disallow these extremely high code points, even if other
372 above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
373 and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
374 L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
375 treat all above-Unicode code points, including these, as malformations. (Note
376 that the Unicode standard considers anything above 0x10FFFF to be illegal, but
377 there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
379 A somewhat misleadingly named synonym for C<UNICODE_WARN_PERL_EXTENDED> is
380 retained for backward compatibility: C<UNICODE_WARN_ABOVE_31_BIT>. Similarly,
381 C<UNICODE_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
382 C<UNICODE_DISALLOW_PERL_EXTENDED>. The names are misleading because these
383 flags can apply to code points that actually do fit in 31 bits. This happens
384 on EBCDIC platforms, and sometimes when the L<overlong
385 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
386 describe the situation in all cases.
391 /* This is also a macro */
392 PERL_CALLCONV U8* Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags);
395 Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
397 return uvchr_to_utf8_flags(d, uv, flags);
402 PERL_STATIC_INLINE bool
403 S_is_utf8_cp_above_31_bits(const U8 * const s, const U8 * const e)
405 /* Returns TRUE if the first code point represented by the Perl-extended-
406 * UTF-8-encoded string starting at 's', and looking no further than 'e -
407 * 1' doesn't fit into 31 bytes. That is, that if it is >= 2**31.
409 * The function handles the case where the input bytes do not include all
410 * the ones necessary to represent a full character. That is, they may be
411 * the intial bytes of the representation of a code point, but possibly
412 * the final ones necessary for the complete representation may be beyond
415 * The function assumes that the sequence is well-formed UTF-8 as far as it
416 * goes, and is for a UTF-8 variant code point. If the sequence is
417 * incomplete, the function returns FALSE if there is any well-formed
418 * UTF-8 byte sequence that can complete it in such a way that a code point
419 * < 2**31 is produced; otherwise it returns TRUE.
421 * Getting this exactly right is slightly tricky, and has to be done in
422 * several places in this file, so is centralized here. It is based on the
425 * U+7FFFFFFF (2 ** 31 - 1)
426 * ASCII: \xFD\xBF\xBF\xBF\xBF\xBF
427 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x42\x73\x73\x73\x73\x73\x73
428 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x42\x72\x72\x72\x72\x72\x72
429 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x42\x75\x75\x75\x75\x75\x75
430 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF
431 * U+80000000 (2 ** 31):
432 * ASCII: \xFE\x82\x80\x80\x80\x80\x80
433 * [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] 10 11 12 13
434 * IBM-1047: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
435 * IBM-037: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
436 * POSIX-BC: \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
437 * I8: \xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA2\xA0\xA0\xA0\xA0\xA0\xA0
442 /* [0] is start byte [1] [2] [3] [4] [5] [6] [7] */
443 const U8 prefix[] = "\x41\x41\x41\x41\x41\x41\x42";
444 const STRLEN prefix_len = sizeof(prefix) - 1;
445 const STRLEN len = e - s;
446 const STRLEN cmp_len = MIN(prefix_len, len - 1);
454 PERL_ARGS_ASSERT_IS_UTF8_CP_ABOVE_31_BITS;
456 assert(! UTF8_IS_INVARIANT(*s));
460 /* Technically, a start byte of FE can be for a code point that fits into
461 * 31 bytes, but not for well-formed UTF-8: doing that requires an overlong
467 /* On the EBCDIC code pages we handle, only the native start byte 0xFE can
468 * mean a 32-bit or larger code point (0xFF is an invariant). For 0xFE, we
469 * need at least 2 bytes, and maybe up through 8 bytes, to be sure that the
470 * value is above 31 bits. */
471 if (*s != 0xFE || len == 1) {
475 /* Note that in UTF-EBCDIC, the two lowest possible continuation bytes are
477 return cBOOL(memGT(s + 1, prefix, cmp_len));
485 /* Anything larger than this will overflow the word if it were converted into a UV */
486 #if defined(UV_IS_QUAD)
487 # ifdef EBCDIC /* Actually is I8 */
488 # define HIGHEST_REPRESENTABLE_UTF8 \
489 "\xFF\xAF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
491 # define HIGHEST_REPRESENTABLE_UTF8 \
492 "\xFF\x80\x8F\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"
496 # define HIGHEST_REPRESENTABLE_UTF8 \
497 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA3\xBF\xBF\xBF\xBF\xBF\xBF"
499 # define HIGHEST_REPRESENTABLE_UTF8 "\xFE\x83\xBF\xBF\xBF\xBF\xBF"
503 PERL_STATIC_INLINE bool
504 S_does_utf8_overflow(const U8 * const s, const U8 * e)
507 const U8 * y = (const U8 *) HIGHEST_REPRESENTABLE_UTF8;
509 #if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
511 const STRLEN len = e - s;
515 /* Returns a boolean as to if this UTF-8 string would overflow a UV on this
516 * platform, that is if it represents a code point larger than the highest
517 * representable code point. (For ASCII platforms, we could use memcmp()
518 * because we don't have to convert each byte to I8, but it's very rare
519 * input indeed that would approach overflow, so the loop below will likely
520 * only get executed once.
522 * 'e' must not be beyond a full character. If it is less than a full
523 * character, the function returns FALSE if there is any input beyond 'e'
524 * that could result in a non-overflowing code point */
526 PERL_ARGS_ASSERT_DOES_UTF8_OVERFLOW;
527 assert(s <= e && s + UTF8SKIP(s) >= e);
529 #if ! defined(UV_IS_QUAD) && ! defined(EBCDIC)
531 /* On 32 bit ASCII machines, many overlongs that start with FF don't
534 if (isFF_OVERLONG(s, len)) {
535 const U8 max_32_bit_overlong[] = "\xFF\x80\x80\x80\x80\x80\x80\x84";
536 return memGE(s, max_32_bit_overlong,
537 MIN(len, sizeof(max_32_bit_overlong) - 1));
542 for (x = s; x < e; x++, y++) {
544 /* If this byte is larger than the corresponding highest UTF-8
545 * byte, it overflows */
546 if (UNLIKELY(NATIVE_UTF8_TO_I8(*x) > *y)) {
550 /* If not the same as this byte, it must be smaller, doesn't
552 if (LIKELY(NATIVE_UTF8_TO_I8(*x) != *y)) {
557 /* Got to the end and all bytes are the same. If the input is a whole
558 * character, it doesn't overflow. And if it is a partial character,
559 * there's not enough information to tell, so assume doesn't overflow */
563 PERL_STATIC_INLINE bool
564 S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
566 /* Overlongs can occur whenever the number of continuation bytes
567 * changes. That means whenever the number of leading 1 bits in a start
568 * byte increases from the next lower start byte. That happens for start
569 * bytes C0, E0, F0, F8, FC, FE, and FF. On modern perls, the following
570 * illegal start bytes have already been excluded, so don't need to be
572 * ASCII platforms: C0, C1
573 * EBCDIC platforms C0, C1, C2, C3, C4, E0
575 * At least a second byte is required to determine if other sequences will
578 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
579 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
581 PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK;
582 assert(len > 1 && UTF8_IS_START(*s));
584 /* Each platform has overlongs after the start bytes given above (expressed
585 * in I8 for EBCDIC). What constitutes an overlong varies by platform, but
586 * the logic is the same, except the E0 overlong has already been excluded
587 * on EBCDIC platforms. The values below were found by manually
588 * inspecting the UTF-8 patterns. See the tables in utf8.h and
592 # define F0_ABOVE_OVERLONG 0xB0
593 # define F8_ABOVE_OVERLONG 0xA8
594 # define FC_ABOVE_OVERLONG 0xA4
595 # define FE_ABOVE_OVERLONG 0xA2
596 # define FF_OVERLONG_PREFIX "\xfe\x41\x41\x41\x41\x41\x41\x41"
600 if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
604 # define F0_ABOVE_OVERLONG 0x90
605 # define F8_ABOVE_OVERLONG 0x88
606 # define FC_ABOVE_OVERLONG 0x84
607 # define FE_ABOVE_OVERLONG 0x82
608 # define FF_OVERLONG_PREFIX "\xff\x80\x80\x80\x80\x80\x80"
612 if ( (s0 == 0xF0 && UNLIKELY(s1 < F0_ABOVE_OVERLONG))
613 || (s0 == 0xF8 && UNLIKELY(s1 < F8_ABOVE_OVERLONG))
614 || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
615 || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
620 /* Check for the FF overlong */
621 return isFF_OVERLONG(s, len);
624 PERL_STATIC_INLINE bool
625 S_isFF_OVERLONG(const U8 * const s, const STRLEN len)
627 PERL_ARGS_ASSERT_ISFF_OVERLONG;
629 /* Check for the FF overlong. This happens only if all these bytes match;
630 * what comes after them doesn't matter. See tables in utf8.h,
633 return len >= sizeof(FF_OVERLONG_PREFIX) - 1
634 && UNLIKELY(memEQ(s, FF_OVERLONG_PREFIX,
635 sizeof(FF_OVERLONG_PREFIX) - 1));
638 #undef F0_ABOVE_OVERLONG
639 #undef F8_ABOVE_OVERLONG
640 #undef FC_ABOVE_OVERLONG
641 #undef FE_ABOVE_OVERLONG
642 #undef FF_OVERLONG_PREFIX
645 Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
650 /* A helper function that should not be called directly.
652 * This function returns non-zero if the string beginning at 's' and
653 * looking no further than 'e - 1' is well-formed Perl-extended-UTF-8 for a
654 * code point; otherwise it returns 0. The examination stops after the
655 * first code point in 's' is validated, not looking at the rest of the
656 * input. If 'e' is such that there are not enough bytes to represent a
657 * complete code point, this function will return non-zero anyway, if the
658 * bytes it does have are well-formed UTF-8 as far as they go, and aren't
659 * excluded by 'flags'.
661 * A non-zero return gives the number of bytes required to represent the
662 * code point. Be aware that if the input is for a partial character, the
663 * return will be larger than 'e - s'.
665 * This function assumes that the code point represented is UTF-8 variant.
666 * The caller should have excluded the possibility of it being invariant
667 * before calling this function.
669 * 'flags' can be 0, or any combination of the UTF8_DISALLOW_foo flags
670 * accepted by L</utf8n_to_uvchr>. If non-zero, this function will return
671 * 0 if the code point represented is well-formed Perl-extended-UTF-8, but
672 * disallowed by the flags. If the input is only for a partial character,
673 * the function will return non-zero if there is any sequence of
674 * well-formed UTF-8 that, when appended to the input sequence, could
675 * result in an allowed code point; otherwise it returns 0. Non characters
676 * cannot be determined based on partial character input. But many of the
677 * other excluded types can be determined with just the first one or two
682 PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
684 assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
685 |UTF8_DISALLOW_PERL_EXTENDED)));
686 assert(! UTF8_IS_INVARIANT(*s));
688 /* A variant char must begin with a start byte */
689 if (UNLIKELY(! UTF8_IS_START(*s))) {
693 /* Examine a maximum of a single whole code point */
694 if (e - s > UTF8SKIP(s)) {
700 if (flags && isUTF8_POSSIBLY_PROBLEMATIC(*s)) {
701 const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
703 /* Here, we are disallowing some set of largish code points, and the
704 * first byte indicates the sequence is for a code point that could be
705 * in the excluded set. We generally don't have to look beyond this or
706 * the second byte to see if the sequence is actually for one of the
707 * excluded classes. The code below is derived from this table:
709 * UTF-8 UTF-EBCDIC I8
710 * U+D800: \xED\xA0\x80 \xF1\xB6\xA0\xA0 First surrogate
711 * U+DFFF: \xED\xBF\xBF \xF1\xB7\xBF\xBF Final surrogate
712 * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0 First above Unicode
714 * Keep in mind that legal continuation bytes range between \x80..\xBF
715 * for UTF-8, and \xA0..\xBF for I8. Anything above those aren't
716 * continuation bytes. Hence, we don't have to test the upper edge
717 * because if any of those is encountered, the sequence is malformed,
718 * and would fail elsewhere in this function.
720 * The code here likewise assumes that there aren't other
721 * malformations; again the function should fail elsewhere because of
722 * these. For example, an overlong beginning with FC doesn't actually
723 * have to be a super; it could actually represent a small code point,
724 * even U+0000. But, since overlongs (and other malformations) are
725 * illegal, the function should return FALSE in either case.
728 #ifdef EBCDIC /* On EBCDIC, these are actually I8 bytes */
729 # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xFA
730 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF9 && (s1) >= 0xA2)
732 # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xF1 \
734 && ((s1) & 0xFE ) == 0xB6)
735 # define isUTF8_PERL_EXTENDED(s) (*s == I8_TO_NATIVE_UTF8(0xFF))
737 # define FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER 0xF5
738 # define IS_UTF8_2_BYTE_SUPER(s0, s1) ((s0) == 0xF4 && (s1) >= 0x90)
739 # define IS_UTF8_2_BYTE_SURROGATE(s0, s1) ((s0) == 0xED && (s1) >= 0xA0)
740 # define isUTF8_PERL_EXTENDED(s) (*s >= 0xFE)
743 if ( (flags & UTF8_DISALLOW_SUPER)
744 && UNLIKELY(s0 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
746 return 0; /* Above Unicode */
749 if ( (flags & UTF8_DISALLOW_PERL_EXTENDED)
750 && UNLIKELY(isUTF8_PERL_EXTENDED(s)))
756 const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
758 if ( (flags & UTF8_DISALLOW_SUPER)
759 && UNLIKELY(IS_UTF8_2_BYTE_SUPER(s0, s1)))
761 return 0; /* Above Unicode */
764 if ( (flags & UTF8_DISALLOW_SURROGATE)
765 && UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(s0, s1)))
767 return 0; /* Surrogate */
770 if ( (flags & UTF8_DISALLOW_NONCHAR)
771 && UNLIKELY(UTF8_IS_NONCHAR(s, e)))
773 return 0; /* Noncharacter code point */
778 /* Make sure that all that follows are continuation bytes */
779 for (x = s + 1; x < e; x++) {
780 if (UNLIKELY(! UTF8_IS_CONTINUATION(*x))) {
785 /* Here is syntactically valid. Next, make sure this isn't the start of an
787 if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
791 /* And finally, that the code point represented fits in a word on this
793 if (does_utf8_overflow(s, e)) {
801 Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
803 /* Returns a mortalized C string that is a displayable copy of the 'len'
804 * bytes starting at 's'. 'format' gives how to display each byte.
805 * Currently, there are only two formats, so it is currently a bool:
807 * 1 ab (that is a space between two hex digit bytes)
810 const STRLEN output_len = 4 * len + 1; /* 4 bytes per each input, plus a
812 const U8 * const e = s + len;
816 PERL_ARGS_ASSERT__BYTE_DUMP_STRING;
818 Newx(output, output_len, char);
823 const unsigned high_nibble = (*s & 0xF0) >> 4;
824 const unsigned low_nibble = (*s & 0x0F);
834 if (high_nibble < 10) {
835 *d++ = high_nibble + '0';
838 *d++ = high_nibble - 10 + 'a';
841 if (low_nibble < 10) {
842 *d++ = low_nibble + '0';
845 *d++ = low_nibble - 10 + 'a';
853 PERL_STATIC_INLINE char *
854 S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
856 /* How many bytes to print */
859 /* Which one is the non-continuation */
860 const STRLEN non_cont_byte_pos,
862 /* How many bytes should there be? */
863 const STRLEN expect_len)
865 /* Return the malformation warning text for an unexpected continuation
868 const char * const where = (non_cont_byte_pos == 1)
870 : Perl_form(aTHX_ "%d bytes",
871 (int) non_cont_byte_pos);
873 PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
875 /* We don't need to pass this parameter, but since it has already been
876 * calculated, it's likely faster to pass it; verify under DEBUGGING */
877 assert(expect_len == UTF8SKIP(s));
879 return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
880 " %s after start byte 0x%02x; need %d bytes, got %d)",
882 _byte_dump_string(s, print_len, 0),
883 *(s + non_cont_byte_pos),
887 (int) non_cont_byte_pos);
892 =for apidoc utf8n_to_uvchr
894 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
895 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
897 Bottom level UTF-8 decode routine.
898 Returns the native code point value of the first character in the string C<s>,
899 which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding, and no longer than
900 C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
901 the length, in bytes, of that character.
903 The value of C<flags> determines the behavior when C<s> does not point to a
904 well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
905 causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
906 is the next possible position in C<s> that could begin a non-malformed
907 character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
908 is raised. Some UTF-8 input sequences may contain multiple malformations.
909 This function tries to find every possible one in each call, so multiple
910 warnings can be raised for the same sequence.
912 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
913 individual types of malformations, such as the sequence being overlong (that
914 is, when there is a shorter sequence that can express the same code point;
915 overlong sequences are expressly forbidden in the UTF-8 standard due to
916 potential security issues). Another malformation example is the first byte of
917 a character not being a legal first byte. See F<utf8.h> for the list of such
918 flags. Even if allowed, this function generally returns the Unicode
919 REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
920 F<utf8.h> to override this behavior for the overlong malformations, but don't
921 do that except for very specialized purposes.
923 The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
924 flags) malformation is found. If this flag is set, the routine assumes that
925 the caller will raise a warning, and this function will silently just set
926 C<retlen> to C<-1> (cast to C<STRLEN>) and return zero.
928 Note that this API requires disambiguation between successful decoding a C<NUL>
929 character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
930 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
931 be set to 1. To disambiguate, upon a zero return, see if the first byte of
932 C<s> is 0 as well. If so, the input was a C<NUL>; if not, the input had an
933 error. Or you can use C<L</utf8n_to_uvchr_error>>.
935 Certain code points are considered problematic. These are Unicode surrogates,
936 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
937 By default these are considered regular code points, but certain situations
938 warrant special handling for them, which can be specified using the C<flags>
939 parameter. If C<flags> contains C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>, all
940 three classes are treated as malformations and handled as such. The flags
941 C<UTF8_DISALLOW_SURROGATE>, C<UTF8_DISALLOW_NONCHAR>, and
942 C<UTF8_DISALLOW_SUPER> (meaning above the legal Unicode maximum) can be set to
943 disallow these categories individually. C<UTF8_DISALLOW_ILLEGAL_INTERCHANGE>
944 restricts the allowed inputs to the strict UTF-8 traditionally defined by
945 Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
947 L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
948 The difference between traditional strictness and C9 strictness is that the
949 latter does not forbid non-character code points. (They are still discouraged,
950 however.) For more discussion see L<perlunicode/Noncharacter code points>.
952 The flags C<UTF8_WARN_ILLEGAL_INTERCHANGE>,
953 C<UTF8_WARN_ILLEGAL_C9_INTERCHANGE>, C<UTF8_WARN_SURROGATE>,
954 C<UTF8_WARN_NONCHAR>, and C<UTF8_WARN_SUPER> will cause warning messages to be
955 raised for their respective categories, but otherwise the code points are
956 considered valid (not malformations). To get a category to both be treated as
957 a malformation and raise a warning, specify both the WARN and DISALLOW flags.
958 (But note that warnings are not raised if lexically disabled nor if
959 C<UTF8_CHECK_ONLY> is also specified.)
961 Extremely high code points were never specified in any standard, and require an
962 extension to UTF-8 to express, which Perl does. It is likely that programs
963 written in something other than Perl would not be able to read files that
964 contain these; nor would Perl understand files written by something that uses a
965 different extension. For these reasons, there is a separate set of flags that
966 can warn and/or disallow these extremely high code points, even if other
967 above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
968 C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
969 L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
970 above-Unicode code points, including these, as malformations.
971 (Note that the Unicode standard considers anything above 0x10FFFF to be
972 illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
975 A somewhat misleadingly named synonym for C<UTF8_WARN_PERL_EXTENDED> is
976 retained for backward compatibility: C<UTF8_WARN_ABOVE_31_BIT>. Similarly,
977 C<UTF8_DISALLOW_ABOVE_31_BIT> is usable instead of the more accurately named
978 C<UTF8_DISALLOW_PERL_EXTENDED>. The names are misleading because these flags
979 can apply to code points that actually do fit in 31 bits. This happens on
980 EBCDIC platforms, and sometimes when the L<overlong
981 malformation|/C<UTF8_GOT_LONG>> is also present. The new names accurately
982 describe the situation in all cases.
984 It is now deprecated to have very high code points (above C<IV_MAX> on the
985 platforms) and this function will raise a deprecation warning for these (unless
986 such warnings are turned off). This value is typically 0x7FFF_FFFF (2**31 -1)
989 All other code points corresponding to Unicode characters, including private
990 use and those yet to be assigned, are never considered malformed and never
995 Also implemented as a macro in utf8.h
999 Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
1004 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
1006 return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
1011 =for apidoc utf8n_to_uvchr_error
1013 THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
1014 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
1016 This function is for code that needs to know what the precise malformation(s)
1017 are when an error is found.
1019 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
1020 all the others, C<errors>. If this parameter is 0, this function behaves
1021 identically to C<L</utf8n_to_uvchr>>. Otherwise, C<errors> should be a pointer
1022 to a C<U32> variable, which this function sets to indicate any errors found.
1023 Upon return, if C<*errors> is 0, there were no errors found. Otherwise,
1024 C<*errors> is the bit-wise C<OR> of the bits described in the list below. Some
1025 of these bits will be set if a malformation is found, even if the input
1026 C<flags> parameter indicates that the given malformation is allowed; those
1027 exceptions are noted:
1031 =item C<UTF8_GOT_PERL_EXTENDED>
1033 The input sequence is not standard UTF-8, but a Perl extension. This bit is
1034 set only if the input C<flags> parameter contains either the
1035 C<UTF8_DISALLOW_PERL_EXTENDED> or the C<UTF8_WARN_PERL_EXTENDED> flags.
1037 Code points above 0x7FFF_FFFF (2**31 - 1) were never specified in any standard,
1038 and so some extension must be used to express them. Perl uses a natural
1039 extension to UTF-8 to represent the ones up to 2**36-1, and invented a further
1040 extension to represent even higher ones, so that any code point that fits in a
1041 64-bit word can be represented. Text using these extensions is not likely to
1042 be portable to non-Perl code. We lump both of these extensions together and
1043 refer to them as Perl extended UTF-8. There exist other extensions that people
1044 have invented, incompatible with Perl's.
1046 On EBCDIC platforms starting in Perl v5.24, the Perl extension for representing
1047 extremely high code points kicks in at 0x3FFF_FFFF (2**30 -1), which is lower
1048 than on ASCII. Prior to that, code points 2**31 and higher were simply
1049 unrepresentable, and a different, incompatible method was used to represent
1050 code points between 2**30 and 2**31 - 1.
1052 On both platforms, ASCII and EBCDIC, C<UTF8_GOT_PERL_EXTENDED> is set if
1053 Perl extended UTF-8 is used.
1055 In earlier Perls, this bit was named C<UTF8_GOT_ABOVE_31_BIT>, which you still
1056 may use for backward compatibility. That name is misleading, as this flag may
1057 be set when the code point actually does fit in 31 bits. This happens on
1058 EBCDIC platforms, and sometimes when the L<overlong
1059 malformation|/C<UTF8_GOT_LONG>> is also present. The new name accurately
1060 describes the situation in all cases.
1062 =item C<UTF8_GOT_CONTINUATION>
1064 The input sequence was malformed in that the first byte was a a UTF-8
1067 =item C<UTF8_GOT_EMPTY>
1069 The input C<curlen> parameter was 0.
1071 =item C<UTF8_GOT_LONG>
1073 The input sequence was malformed in that there is some other sequence that
1074 evaluates to the same code point, but that sequence is shorter than this one.
1076 Until Unicode 3.1, it was legal for programs to accept this malformation, but
1077 it was discovered that this created security issues.
1079 =item C<UTF8_GOT_NONCHAR>
1081 The code point represented by the input UTF-8 sequence is for a Unicode
1082 non-character code point.
1083 This bit is set only if the input C<flags> parameter contains either the
1084 C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
1086 =item C<UTF8_GOT_NON_CONTINUATION>
1088 The input sequence was malformed in that a non-continuation type byte was found
1089 in a position where only a continuation type one should be.
1091 =item C<UTF8_GOT_OVERFLOW>
1093 The input sequence was malformed in that it is for a code point that is not
1094 representable in the number of bits available in a UV on the current platform.
1096 =item C<UTF8_GOT_SHORT>
1098 The input sequence was malformed in that C<curlen> is smaller than required for
1099 a complete sequence. In other words, the input is for a partial character
1102 =item C<UTF8_GOT_SUPER>
1104 The input sequence was malformed in that it is for a non-Unicode code point;
1105 that is, one above the legal Unicode maximum.
1106 This bit is set only if the input C<flags> parameter contains either the
1107 C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
1109 =item C<UTF8_GOT_SURROGATE>
1111 The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
1113 This bit is set only if the input C<flags> parameter contains either the
1114 C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
1118 To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
1119 flag to suppress any warnings, and then examine the C<*errors> return.
1125 Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
1131 const U8 * const s0 = s;
1132 U8 * send = NULL; /* (initialized to silence compilers' wrong
1134 U32 possible_problems = 0; /* A bit is set here for each potential problem
1135 found as we go along */
1137 STRLEN expectlen = 0; /* How long should this sequence be?
1138 (initialized to silence compilers' wrong
1140 STRLEN avail_len = 0; /* When input is too short, gives what that is */
1141 U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
1142 this gets set and discarded */
1144 /* The below are used only if there is both an overlong malformation and a
1145 * too short one. Otherwise the first two are set to 's0' and 'send', and
1146 * the third not used at all */
1147 U8 * adjusted_s0 = (U8 *) s0;
1148 U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
1149 routine; see [perl #130921] */
1150 UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
1152 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
1158 errors = &discard_errors;
1161 /* The order of malformation tests here is important. We should consume as
1162 * few bytes as possible in order to not skip any valid character. This is
1163 * required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
1164 * http://unicode.org/reports/tr36 for more discussion as to why. For
1165 * example, once we've done a UTF8SKIP, we can tell the expected number of
1166 * bytes, and could fail right off the bat if the input parameters indicate
1167 * that there are too few available. But it could be that just that first
1168 * byte is garbled, and the intended character occupies fewer bytes. If we
1169 * blindly assumed that the first byte is correct, and skipped based on
1170 * that number, we could skip over a valid input character. So instead, we
1171 * always examine the sequence byte-by-byte.
1173 * We also should not consume too few bytes, otherwise someone could inject
1174 * things. For example, an input could be deliberately designed to
1175 * overflow, and if this code bailed out immediately upon discovering that,
1176 * returning to the caller C<*retlen> pointing to the very next byte (one
1177 * which is actually part of of the overflowing sequence), that could look
1178 * legitimate to the caller, which could discard the initial partial
1179 * sequence and process the rest, inappropriately.
1181 * Some possible input sequences are malformed in more than one way. This
1182 * function goes to lengths to try to find all of them. This is necessary
1183 * for correctness, as the inputs may allow one malformation but not
1184 * another, and if we abandon searching for others after finding the
1185 * allowed one, we could allow in something that shouldn't have been.
1188 if (UNLIKELY(curlen == 0)) {
1189 possible_problems |= UTF8_GOT_EMPTY;
1191 uv = UNICODE_REPLACEMENT;
1192 goto ready_to_handle_errors;
1195 expectlen = UTF8SKIP(s);
1197 /* A well-formed UTF-8 character, as the vast majority of calls to this
1198 * function will be for, has this expected length. For efficiency, set
1199 * things up here to return it. It will be overriden only in those rare
1200 * cases where a malformation is found */
1202 *retlen = expectlen;
1205 /* An invariant is trivially well-formed */
1206 if (UTF8_IS_INVARIANT(uv)) {
1210 /* A continuation character can't start a valid sequence */
1211 if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
1212 possible_problems |= UTF8_GOT_CONTINUATION;
1214 uv = UNICODE_REPLACEMENT;
1215 goto ready_to_handle_errors;
1218 /* Here is not a continuation byte, nor an invariant. The only thing left
1219 * is a start byte (possibly for an overlong). (We can't use UTF8_IS_START
1220 * because it excludes start bytes like \xC0 that always lead to
1223 /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
1224 * that indicate the number of bytes in the character's whole UTF-8
1225 * sequence, leaving just the bits that are part of the value. */
1226 uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
1228 /* Setup the loop end point, making sure to not look past the end of the
1229 * input string, and flag it as too short if the size isn't big enough. */
1231 if (UNLIKELY(curlen < expectlen)) {
1232 possible_problems |= UTF8_GOT_SHORT;
1240 /* Now, loop through the remaining bytes in the character's sequence,
1241 * accumulating each into the working value as we go. */
1242 for (s = s0 + 1; s < send; s++) {
1243 if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
1244 uv = UTF8_ACCUMULATE(uv, *s);
1248 /* Here, found a non-continuation before processing all expected bytes.
1249 * This byte indicates the beginning of a new character, so quit, even
1250 * if allowing this malformation. */
1251 possible_problems |= UTF8_GOT_NON_CONTINUATION;
1253 } /* End of loop through the character's bytes */
1255 /* Save how many bytes were actually in the character */
1258 /* Note that there are two types of too-short malformation. One is when
1259 * there is actual wrong data before the normal termination of the
1260 * sequence. The other is that the sequence wasn't complete before the end
1261 * of the data we are allowed to look at, based on the input 'curlen'.
1262 * This means that we were passed data for a partial character, but it is
1263 * valid as far as we saw. The other is definitely invalid. This
1264 * distinction could be important to a caller, so the two types are kept
1267 * A convenience macro that matches either of the too-short conditions. */
1268 # define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
1270 if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
1272 uv = UNICODE_REPLACEMENT;
1275 /* Check for overflow. */
1276 if (UNLIKELY(does_utf8_overflow(s0, send))) {
1277 possible_problems |= UTF8_GOT_OVERFLOW;
1278 uv = UNICODE_REPLACEMENT;
1281 /* Check for overlong. If no problems so far, 'uv' is the correct code
1282 * point value. Simply see if it is expressible in fewer bytes. Otherwise
1283 * we must look at the UTF-8 byte sequence itself to see if it is for an
1285 if ( ( LIKELY(! possible_problems)
1286 && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
1287 || ( UNLIKELY(possible_problems)
1288 && ( UNLIKELY(! UTF8_IS_START(*s0))
1290 && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
1293 possible_problems |= UTF8_GOT_LONG;
1295 if ( UNLIKELY( possible_problems & UTF8_GOT_TOO_SHORT)
1297 /* The calculation in the 'true' branch of this 'if'
1298 * below won't work if overflows, and isn't needed
1299 * anyway. Further below we handle all overflow
1301 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW)))
1303 UV min_uv = uv_so_far;
1306 /* Here, the input is both overlong and is missing some trailing
1307 * bytes. There is no single code point it could be for, but there
1308 * may be enough information present to determine if what we have
1309 * so far is for an unallowed code point, such as for a surrogate.
1310 * The code further below has the intelligence to determine this,
1311 * but just for non-overlong UTF-8 sequences. What we do here is
1312 * calculate the smallest code point the input could represent if
1313 * there were no too short malformation. Then we compute and save
1314 * the UTF-8 for that, which is what the code below looks at
1315 * instead of the raw input. It turns out that the smallest such
1316 * code point is all we need. */
1317 for (i = curlen; i < expectlen; i++) {
1318 min_uv = UTF8_ACCUMULATE(min_uv,
1319 I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
1322 adjusted_s0 = temp_char_buf;
1323 (void) uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
1327 /* Here, we have found all the possible problems, except for when the input
1328 * is for a problematic code point not allowed by the input parameters. */
1330 /* uv is valid for overlongs */
1331 if ( ( ( LIKELY(! (possible_problems & ~UTF8_GOT_LONG))
1333 /* isn't problematic if < this */
1334 && uv >= UNICODE_SURROGATE_FIRST)
1335 || ( UNLIKELY(possible_problems)
1337 /* if overflow, we know without looking further
1338 * precisely which of the problematic types it is,
1339 * and we deal with those in the overflow handling
1341 && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
1342 && ( isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)
1343 || UNLIKELY(isUTF8_PERL_EXTENDED(s0)))))
1344 && ((flags & ( UTF8_DISALLOW_NONCHAR
1345 |UTF8_DISALLOW_SURROGATE
1346 |UTF8_DISALLOW_SUPER
1347 |UTF8_DISALLOW_PERL_EXTENDED
1349 |UTF8_WARN_SURROGATE
1351 |UTF8_WARN_PERL_EXTENDED))
1352 /* In case of a malformation, 'uv' is not valid, and has
1353 * been changed to something in the Unicode range.
1354 * Currently we don't output a deprecation message if there
1355 * is already a malformation, so we don't have to special
1356 * case the test immediately below */
1357 || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1358 && ckWARN_d(WARN_DEPRECATED))))
1360 /* If there were no malformations, or the only malformation is an
1361 * overlong, 'uv' is valid */
1362 if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
1363 if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
1364 possible_problems |= UTF8_GOT_SURROGATE;
1366 else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
1367 possible_problems |= UTF8_GOT_SUPER;
1369 else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
1370 possible_problems |= UTF8_GOT_NONCHAR;
1373 else { /* Otherwise, need to look at the source UTF-8, possibly
1374 adjusted to be non-overlong */
1376 if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
1377 >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
1379 possible_problems |= UTF8_GOT_SUPER;
1381 else if (curlen > 1) {
1382 if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
1383 NATIVE_UTF8_TO_I8(*adjusted_s0),
1384 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1386 possible_problems |= UTF8_GOT_SUPER;
1388 else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
1389 NATIVE_UTF8_TO_I8(*adjusted_s0),
1390 NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
1392 possible_problems |= UTF8_GOT_SURROGATE;
1396 /* We need a complete well-formed UTF-8 character to discern
1397 * non-characters, so can't look for them here */
1401 ready_to_handle_errors:
1404 * curlen contains the number of bytes in the sequence that
1405 * this call should advance the input by.
1406 * avail_len gives the available number of bytes passed in, but
1407 * only if this is less than the expected number of
1408 * bytes, based on the code point's start byte.
1409 * possible_problems' is 0 if there weren't any problems; otherwise a bit
1410 * is set in it for each potential problem found.
1411 * uv contains the code point the input sequence
1412 * represents; or if there is a problem that prevents
1413 * a well-defined value from being computed, it is
1414 * some subsitute value, typically the REPLACEMENT
1416 * s0 points to the first byte of the character
1417 * s points to just after were we left off processing
1419 * send points to just after where that character should
1420 * end, based on how many bytes the start byte tells
1421 * us should be in it, but no further than s0 +
1425 if (UNLIKELY(possible_problems)) {
1426 bool disallowed = FALSE;
1427 const U32 orig_problems = possible_problems;
1429 while (possible_problems) { /* Handle each possible problem */
1431 char * message = NULL;
1433 /* Each 'if' clause handles one problem. They are ordered so that
1434 * the first ones' messages will be displayed before the later
1435 * ones; this is kinda in decreasing severity order. But the
1436 * overlong must come last, as it changes 'uv' looked at by the
1438 if (possible_problems & UTF8_GOT_OVERFLOW) {
1440 /* Overflow means also got a super and are using Perl's
1441 * extended UTF-8, but we handle all three cases here */
1443 &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_PERL_EXTENDED);
1444 *errors |= UTF8_GOT_OVERFLOW;
1446 /* But the API says we flag all errors found */
1447 if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
1448 *errors |= UTF8_GOT_SUPER;
1451 & (UTF8_WARN_PERL_EXTENDED|UTF8_DISALLOW_PERL_EXTENDED))
1453 *errors |= UTF8_GOT_PERL_EXTENDED;
1456 /* Disallow if any of the three categories say to */
1457 if ( ! (flags & UTF8_ALLOW_OVERFLOW)
1458 || (flags & ( UTF8_DISALLOW_SUPER
1459 |UTF8_DISALLOW_PERL_EXTENDED)))
1464 /* Likewise, warn if any say to, plus if deprecation warnings
1465 * are on, because this code point is above IV_MAX */
1466 if ( ckWARN_d(WARN_DEPRECATED)
1467 || ! (flags & UTF8_ALLOW_OVERFLOW)
1468 || (flags & (UTF8_WARN_SUPER|UTF8_WARN_PERL_EXTENDED)))
1471 /* The warnings code explicitly says it doesn't handle the
1472 * case of packWARN2 and two categories which have
1473 * parent-child relationship. Even if it works now to
1474 * raise the warning if either is enabled, it wouldn't
1475 * necessarily do so in the future. We output (only) the
1476 * most dire warning */
1477 if (! (flags & UTF8_CHECK_ONLY)) {
1478 if (ckWARN_d(WARN_UTF8)) {
1479 pack_warn = packWARN(WARN_UTF8);
1481 else if (ckWARN_d(WARN_NON_UNICODE)) {
1482 pack_warn = packWARN(WARN_NON_UNICODE);
1485 message = Perl_form(aTHX_ "%s: %s (overflows)",
1487 _byte_dump_string(s0, curlen, 0));
1492 else if (possible_problems & UTF8_GOT_EMPTY) {
1493 possible_problems &= ~UTF8_GOT_EMPTY;
1494 *errors |= UTF8_GOT_EMPTY;
1496 if (! (flags & UTF8_ALLOW_EMPTY)) {
1498 /* This so-called malformation is now treated as a bug in
1499 * the caller. If you have nothing to decode, skip calling
1504 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1505 pack_warn = packWARN(WARN_UTF8);
1506 message = Perl_form(aTHX_ "%s (empty string)",
1511 else if (possible_problems & UTF8_GOT_CONTINUATION) {
1512 possible_problems &= ~UTF8_GOT_CONTINUATION;
1513 *errors |= UTF8_GOT_CONTINUATION;
1515 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
1517 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1518 pack_warn = packWARN(WARN_UTF8);
1519 message = Perl_form(aTHX_
1520 "%s: %s (unexpected continuation byte 0x%02x,"
1521 " with no preceding start byte)",
1523 _byte_dump_string(s0, 1, 0), *s0);
1527 else if (possible_problems & UTF8_GOT_SHORT) {
1528 possible_problems &= ~UTF8_GOT_SHORT;
1529 *errors |= UTF8_GOT_SHORT;
1531 if (! (flags & UTF8_ALLOW_SHORT)) {
1533 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1534 pack_warn = packWARN(WARN_UTF8);
1535 message = Perl_form(aTHX_
1536 "%s: %s (too short; %d byte%s available, need %d)",
1538 _byte_dump_string(s0, send - s0, 0),
1540 avail_len == 1 ? "" : "s",
1546 else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
1547 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
1548 *errors |= UTF8_GOT_NON_CONTINUATION;
1550 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
1552 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1554 /* If we don't know for sure that the input length is
1555 * valid, avoid as much as possible reading past the
1556 * end of the buffer */
1557 int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
1560 pack_warn = packWARN(WARN_UTF8);
1561 message = Perl_form(aTHX_ "%s",
1562 unexpected_non_continuation_text(s0,
1569 else if (possible_problems & UTF8_GOT_SURROGATE) {
1570 possible_problems &= ~UTF8_GOT_SURROGATE;
1572 if (flags & UTF8_WARN_SURROGATE) {
1573 *errors |= UTF8_GOT_SURROGATE;
1575 if ( ! (flags & UTF8_CHECK_ONLY)
1576 && ckWARN_d(WARN_SURROGATE))
1578 pack_warn = packWARN(WARN_SURROGATE);
1580 /* These are the only errors that can occur with a
1581 * surrogate when the 'uv' isn't valid */
1582 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1583 message = Perl_form(aTHX_
1584 "UTF-16 surrogate (any UTF-8 sequence that"
1585 " starts with \"%s\" is for a surrogate)",
1586 _byte_dump_string(s0, curlen, 0));
1589 message = Perl_form(aTHX_ surrogate_cp_format, uv);
1594 if (flags & UTF8_DISALLOW_SURROGATE) {
1596 *errors |= UTF8_GOT_SURROGATE;
1599 else if (possible_problems & UTF8_GOT_SUPER) {
1600 possible_problems &= ~UTF8_GOT_SUPER;
1602 if (flags & UTF8_WARN_SUPER) {
1603 *errors |= UTF8_GOT_SUPER;
1605 if ( ! (flags & UTF8_CHECK_ONLY)
1606 && ckWARN_d(WARN_NON_UNICODE))
1608 pack_warn = packWARN(WARN_NON_UNICODE);
1610 if (orig_problems & UTF8_GOT_TOO_SHORT) {
1611 message = Perl_form(aTHX_
1612 "Any UTF-8 sequence that starts with"
1613 " \"%s\" is for a non-Unicode code point,"
1614 " may not be portable",
1615 _byte_dump_string(s0, curlen, 0));
1618 message = Perl_form(aTHX_ super_cp_format, uv);
1623 /* Test for Perl's extended UTF-8 after the regular SUPER ones,
1624 * and before possibly bailing out, so that the more dire
1625 * warning will override the regular one. */
1626 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
1627 if ( ! (flags & UTF8_CHECK_ONLY)
1628 && (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
1629 && ckWARN_d(WARN_NON_UNICODE))
1631 pack_warn = packWARN(WARN_NON_UNICODE);
1633 /* If it is an overlong that evaluates to a code point
1634 * that doesn't have to use the Perl extended UTF-8, it
1635 * still used it, and so we output a message that
1636 * doesn't refer to the code point. The same is true
1637 * if there was a SHORT malformation where the code
1638 * point is not valid. In that case, 'uv' will have
1639 * been set to the REPLACEMENT CHAR, and the message
1640 * below without the code point in it will be selected
1642 if (UNICODE_IS_PERL_EXTENDED(uv)) {
1643 message = Perl_form(aTHX_
1644 perl_extended_cp_format, uv);
1647 message = Perl_form(aTHX_
1648 "Any UTF-8 sequence that starts with"
1649 " \"%s\" is a Perl extension, and"
1650 " so is not portable",
1651 _byte_dump_string(s0, curlen, 0));
1655 if (flags & ( UTF8_WARN_PERL_EXTENDED
1656 |UTF8_DISALLOW_PERL_EXTENDED))
1658 *errors |= UTF8_GOT_PERL_EXTENDED;
1660 if (flags & UTF8_DISALLOW_PERL_EXTENDED) {
1666 if (flags & UTF8_DISALLOW_SUPER) {
1667 *errors |= UTF8_GOT_SUPER;
1671 /* The deprecated warning overrides any non-deprecated one. If
1672 * there are other problems, a deprecation message is not
1673 * really helpful, so don't bother to raise it in that case.
1674 * This also keeps the code from having to handle the case
1675 * where 'uv' is not valid. */
1676 if ( ! (orig_problems
1677 & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1678 && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
1679 && ckWARN_d(WARN_DEPRECATED))
1681 message = Perl_form(aTHX_ cp_above_legal_max,
1682 uv, MAX_NON_DEPRECATED_CP);
1683 pack_warn = packWARN(WARN_DEPRECATED);
1686 else if (possible_problems & UTF8_GOT_NONCHAR) {
1687 possible_problems &= ~UTF8_GOT_NONCHAR;
1689 if (flags & UTF8_WARN_NONCHAR) {
1690 *errors |= UTF8_GOT_NONCHAR;
1692 if ( ! (flags & UTF8_CHECK_ONLY)
1693 && ckWARN_d(WARN_NONCHAR))
1695 /* The code above should have guaranteed that we don't
1696 * get here with errors other than overlong */
1697 assert (! (orig_problems
1698 & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
1700 pack_warn = packWARN(WARN_NONCHAR);
1701 message = Perl_form(aTHX_ nonchar_cp_format, uv);
1705 if (flags & UTF8_DISALLOW_NONCHAR) {
1707 *errors |= UTF8_GOT_NONCHAR;
1710 else if (possible_problems & UTF8_GOT_LONG) {
1711 possible_problems &= ~UTF8_GOT_LONG;
1712 *errors |= UTF8_GOT_LONG;
1714 if (flags & UTF8_ALLOW_LONG) {
1716 /* We don't allow the actual overlong value, unless the
1717 * special extra bit is also set */
1718 if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
1719 & ~UTF8_ALLOW_LONG)))
1721 uv = UNICODE_REPLACEMENT;
1727 if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
1728 pack_warn = packWARN(WARN_UTF8);
1730 /* These error types cause 'uv' to be something that
1731 * isn't what was intended, so can't use it in the
1732 * message. The other error types either can't
1733 * generate an overlong, or else the 'uv' is valid */
1735 (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
1737 message = Perl_form(aTHX_
1738 "%s: %s (any UTF-8 sequence that starts"
1739 " with \"%s\" is overlong which can and"
1740 " should be represented with a"
1741 " different, shorter sequence)",
1743 _byte_dump_string(s0, send - s0, 0),
1744 _byte_dump_string(s0, curlen, 0));
1747 U8 tmpbuf[UTF8_MAXBYTES+1];
1748 const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
1750 const char * preface = (uv <= PERL_UNICODE_MAX)
1753 message = Perl_form(aTHX_
1754 "%s: %s (overlong; instead use %s to represent"
1757 _byte_dump_string(s0, send - s0, 0),
1758 _byte_dump_string(tmpbuf, e - tmpbuf, 0),
1760 ((uv < 256) ? 2 : 4), /* Field width of 2 for
1761 small code points */
1766 } /* End of looking through the possible flags */
1768 /* Display the message (if any) for the problem being handled in
1769 * this iteration of the loop */
1772 Perl_warner(aTHX_ pack_warn, "%s in %s", message,
1775 Perl_warner(aTHX_ pack_warn, "%s", message);
1777 } /* End of 'while (possible_problems)' */
1779 /* Since there was a possible problem, the returned length may need to
1780 * be changed from the one stored at the beginning of this function.
1781 * Instead of trying to figure out if that's needed, just do it. */
1787 if (flags & UTF8_CHECK_ONLY && retlen) {
1788 *retlen = ((STRLEN) -1);
1794 return UNI_TO_NATIVE(uv);
1798 =for apidoc utf8_to_uvchr_buf
1800 Returns the native code point of the first character in the string C<s> which
1801 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1802 C<*retlen> will be set to the length, in bytes, of that character.
1804 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1805 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1806 C<NULL>) to -1. If those warnings are off, the computed value, if well-defined
1807 (or the Unicode REPLACEMENT CHARACTER if not), is silently returned, and
1808 C<*retlen> is set (if C<retlen> isn't C<NULL>) so that (S<C<s> + C<*retlen>>) is
1809 the next possible position in C<s> that could begin a non-malformed character.
1810 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
1813 Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1814 unless those are turned off.
1818 Also implemented as a macro in utf8.h
1824 Perl_utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1826 PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
1830 return utf8n_to_uvchr(s, send - s, retlen,
1831 ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
1834 /* This is marked as deprecated
1836 =for apidoc utf8_to_uvuni_buf
1838 Only in very rare circumstances should code need to be dealing in Unicode
1839 (as opposed to native) code points. In those few cases, use
1840 C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead.
1842 Returns the Unicode (not-native) code point of the first character in the
1844 is assumed to be in UTF-8 encoding; C<send> points to 1 beyond the end of C<s>.
1845 C<retlen> will be set to the length, in bytes, of that character.
1847 If C<s> does not point to a well-formed UTF-8 character and UTF8 warnings are
1848 enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
1849 NULL) to -1. If those warnings are off, the computed value if well-defined (or
1850 the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
1851 is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
1852 next possible position in C<s> that could begin a non-malformed character.
1853 See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
1855 Code points above the platform's C<IV_MAX> will raise a deprecation warning,
1856 unless those are turned off.
1862 Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
1864 PERL_ARGS_ASSERT_UTF8_TO_UVUNI_BUF;
1868 /* Call the low level routine, asking for checks */
1869 return NATIVE_TO_UNI(utf8_to_uvchr_buf(s, send, retlen));
1873 =for apidoc utf8_length
1875 Return the length of the UTF-8 char encoded string C<s> in characters.
1876 Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
1877 up past C<e>, croaks.
1883 Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
1887 PERL_ARGS_ASSERT_UTF8_LENGTH;
1889 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
1890 * the bitops (especially ~) can create illegal UTF-8.
1891 * In other words: in Perl UTF-8 is not just for Unicode. */
1894 goto warn_and_return;
1904 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1905 "%s in %s", unees, OP_DESC(PL_op));
1907 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1914 =for apidoc bytes_cmp_utf8
1916 Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
1917 sequence of characters (stored as UTF-8)
1918 in C<u>, C<ulen>. Returns 0 if they are
1919 equal, -1 or -2 if the first string is less than the second string, +1 or +2
1920 if the first string is greater than the second string.
1922 -1 or +1 is returned if the shorter string was identical to the start of the
1923 longer string. -2 or +2 is returned if
1924 there was a difference between characters
1931 Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
1933 const U8 *const bend = b + blen;
1934 const U8 *const uend = u + ulen;
1936 PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
1938 while (b < bend && u < uend) {
1940 if (!UTF8_IS_INVARIANT(c)) {
1941 if (UTF8_IS_DOWNGRADEABLE_START(c)) {
1944 if (UTF8_IS_CONTINUATION(c1)) {
1945 c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
1947 /* diag_listed_as: Malformed UTF-8 character%s */
1948 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1950 unexpected_non_continuation_text(u - 2, 2, 1, 2),
1951 PL_op ? " in " : "",
1952 PL_op ? OP_DESC(PL_op) : "");
1957 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
1958 "%s in %s", unees, OP_DESC(PL_op));
1960 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), "%s", unees);
1961 return -2; /* Really want to return undef :-) */
1968 return *b < c ? -2 : +2;
1973 if (b == bend && u == uend)
1976 return b < bend ? +1 : -1;
1980 =for apidoc utf8_to_bytes
1982 Converts a string C<"s"> of length C<*lenp> from UTF-8 into native byte encoding.
1983 Unlike L</bytes_to_utf8>, this over-writes the original string, and
1984 updates C<*lenp> to contain the new length.
1985 Returns zero on failure (leaving C<"s"> unchanged) setting C<*lenp> to -1.
1987 Upon successful return, the number of variants in the string can be computed by
1988 having saved the value of C<*lenp> before the call, and subtracting the
1989 after-call value of C<*lenp> from it.
1991 If you need a copy of the string, see L</bytes_from_utf8>.
1997 Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *lenp)
2001 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
2002 PERL_UNUSED_CONTEXT;
2004 /* This is a no-op if no variants at all in the input */
2005 if (is_utf8_invariant_string_loc(s, *lenp, (const U8 **) &first_variant)) {
2010 U8 * const save = s;
2011 U8 * const send = s + *lenp;
2014 /* Nothing before the first variant needs to be changed, so start the real
2018 if (! UTF8_IS_INVARIANT(*s)) {
2019 if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, send)) {
2020 *lenp = ((STRLEN) -1);
2028 /* Is downgradable, so do it */
2029 d = s = first_variant;
2032 if (! UVCHR_IS_INVARIANT(c)) {
2033 /* Then it is two-byte encoded */
2034 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2047 =for apidoc bytes_from_utf8
2049 Converts a potentially UTF-8 encoded string C<s> of length C<*lenp> into native
2050 byte encoding. On input, the boolean C<*is_utf8p> gives whether or not C<s> is
2051 actually encoded in UTF-8.
2053 Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, this is non-destructive of
2056 Do nothing if C<*is_utf8p> is 0, or if there are code points in the string
2057 not expressible in native byte encoding. In these cases, C<*is_utf8p> and
2058 C<*lenp> are unchanged, and the return value is the original C<s>.
2060 Otherwise, C<*is_utf8p> is set to 0, and the return value is a pointer to a
2061 newly created string containing a downgraded copy of C<s>, and whose length is
2062 returned in C<*lenp>, updated. The new string is C<NUL>-terminated.
2064 Upon successful return, the number of variants in the string can be computed by
2065 having saved the value of C<*lenp> before the call, and subtracting the
2066 after-call value of C<*lenp> from it.
2070 There is a macro that avoids this function call, but this is retained for
2071 anyone who calls it with the Perl_ prefix */
2074 Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *lenp, bool *is_utf8p)
2076 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
2077 PERL_UNUSED_CONTEXT;
2079 return bytes_from_utf8_loc(s, lenp, is_utf8p, NULL);
2083 No = here because currently externally undocumented
2084 for apidoc bytes_from_utf8_loc
2086 Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
2087 to store the location of the first character in C<"s"> that cannot be
2088 converted to non-UTF8.
2090 If that parameter is C<NULL>, this function behaves identically to
2093 Otherwise if C<*is_utf8p> is 0 on input, the function behaves identically to
2094 C<bytes_from_utf8>, except it also sets C<*first_non_downgradable> to C<NULL>.
2096 Otherwise, the function returns a newly created C<NUL>-terminated string
2097 containing the non-UTF8 equivalent of the convertible first portion of
2098 C<"s">. C<*lenp> is set to its length, not including the terminating C<NUL>.
2099 If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
2100 and C<*first_non_downgradable> is set to C<NULL>.
2102 Otherwise, C<*first_non_downgradable> set to point to the first byte of the
2103 first character in the original string that wasn't converted. C<*is_utf8p> is
2104 unchanged. Note that the new string may have length 0.
2106 Another way to look at it is, if C<*first_non_downgradable> is non-C<NULL> and
2107 C<*is_utf8p> is TRUE, this function starts at the beginning of C<"s"> and
2108 converts as many characters in it as possible stopping at the first one it
2109 finds that can't be converted to non-UTF-8. C<*first_non_downgradable> is
2110 set to point to that. The function returns the portion that could be converted
2111 in a newly created C<NUL>-terminated string, and C<*lenp> is set to its length,
2112 not including the terminating C<NUL>. If the very first character in the
2113 original could not be converted, C<*lenp> will be 0, and the new string will
2114 contain just a single C<NUL>. If the entire input string was converted,
2115 C<*is_utf8p> is set to FALSE and C<*first_non_downgradable> is set to C<NULL>.
2117 Upon successful return, the number of variants in the converted portion of the
2118 string can be computed by having saved the value of C<*lenp> before the call,
2119 and subtracting the after-call value of C<*lenp> from it.
2127 Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** first_unconverted)
2130 const U8 *original = s;
2131 U8 *converted_start;
2132 const U8 *send = s + *lenp;
2134 PERL_ARGS_ASSERT_BYTES_FROM_UTF8_LOC;
2137 if (first_unconverted) {
2138 *first_unconverted = NULL;
2141 return (U8 *) original;
2144 Newx(d, (*lenp) + 1, U8);
2146 converted_start = d;
2149 if (! UTF8_IS_INVARIANT(c)) {
2151 /* Then it is multi-byte encoded. If the code point is above 0xFF,
2152 * have to stop now */
2153 if (UNLIKELY (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s - 1, send))) {
2154 if (first_unconverted) {
2155 *first_unconverted = s - 1;
2156 goto finish_and_return;
2159 Safefree(converted_start);
2160 return (U8 *) original;
2164 c = EIGHT_BIT_UTF8_TO_NATIVE(c, *s);
2170 /* Here, converted the whole of the input */
2172 if (first_unconverted) {
2173 *first_unconverted = NULL;
2178 *lenp = d - converted_start;
2180 /* Trim unused space */
2181 Renew(converted_start, *lenp + 1, U8);
2183 return converted_start;
2187 =for apidoc bytes_to_utf8
2189 Converts a string C<s> of length C<*lenp> bytes from the native encoding into
2191 Returns a pointer to the newly-created string, and sets C<*lenp> to
2192 reflect the new length in bytes.
2194 Upon successful return, the number of variants in the string can be computed by
2195 having saved the value of C<*lenp> before the call, and subtracting it from the
2196 after-call value of C<*lenp>.
2198 A C<NUL> character will be written after the end of the string.
2200 If you want to convert to UTF-8 from encodings other than
2201 the native (Latin1 or EBCDIC),
2202 see L</sv_recode_to_utf8>().
2208 Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
2210 const U8 * const send = s + (*lenp);
2214 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
2215 PERL_UNUSED_CONTEXT;
2217 Newx(d, (*lenp) * 2 + 1, U8);
2221 append_utf8_from_native_byte(*s, &d);
2230 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
2232 * Destination must be pre-extended to 3/2 source. Do not use in-place.
2233 * We optimize for native, for obvious reasons. */
2236 Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
2241 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
2244 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %" UVuf,
2250 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
2252 if (OFFUNI_IS_INVARIANT(uv)) {
2253 *d++ = LATIN1_TO_NATIVE((U8) uv);
2256 if (uv <= MAX_UTF8_TWO_BYTE) {
2257 *d++ = UTF8_TWO_BYTE_HI(UNI_TO_NATIVE(uv));
2258 *d++ = UTF8_TWO_BYTE_LO(UNI_TO_NATIVE(uv));
2261 #define FIRST_HIGH_SURROGATE UNICODE_SURROGATE_FIRST
2262 #define LAST_HIGH_SURROGATE 0xDBFF
2263 #define FIRST_LOW_SURROGATE 0xDC00
2264 #define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
2266 /* This assumes that most uses will be in the first Unicode plane, not
2267 * needing surrogates */
2268 if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
2269 && uv <= UNICODE_SURROGATE_LAST))
2271 if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
2272 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2275 UV low = (p[0] << 8) + p[1];
2276 if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
2277 || UNLIKELY(low > LAST_LOW_SURROGATE))
2279 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
2282 uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
2283 + (low - FIRST_LOW_SURROGATE) + 0x10000;
2287 d = uvoffuni_to_utf8_flags(d, uv, 0);
2290 *d++ = (U8)(( uv >> 12) | 0xe0);
2291 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2292 *d++ = (U8)(( uv & 0x3f) | 0x80);
2296 *d++ = (U8)(( uv >> 18) | 0xf0);
2297 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
2298 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
2299 *d++ = (U8)(( uv & 0x3f) | 0x80);
2304 *newlen = d - dstart;
2308 /* Note: this one is slightly destructive of the source. */
2311 Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
2314 U8* const send = s + bytelen;
2316 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
2319 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %" UVuf,
2323 const U8 tmp = s[0];
2328 return utf16_to_utf8(p, d, bytelen, newlen);
2332 Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
2334 U8 tmpbuf[UTF8_MAXBYTES+1];
2335 uvchr_to_utf8(tmpbuf, c);
2336 return _is_utf8_FOO_with_len(classnum, tmpbuf, tmpbuf + sizeof(tmpbuf));
2339 /* Internal function so we can deprecate the external one, and call
2340 this one from other deprecated functions in this file */
2343 Perl__is_utf8_idstart(pTHX_ const U8 *p)
2345 PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
2349 return is_utf8_common(p, &PL_utf8_idstart, "IdStart", NULL);
2353 Perl__is_uni_perl_idcont(pTHX_ UV c)
2355 U8 tmpbuf[UTF8_MAXBYTES+1];
2356 uvchr_to_utf8(tmpbuf, c);
2357 return _is_utf8_perl_idcont_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
2361 Perl__is_uni_perl_idstart(pTHX_ UV c)
2363 U8 tmpbuf[UTF8_MAXBYTES+1];
2364 uvchr_to_utf8(tmpbuf, c);
2365 return _is_utf8_perl_idstart_with_len(tmpbuf, tmpbuf + sizeof(tmpbuf));
2369 Perl__to_upper_title_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2372 /* We have the latin1-range values compiled into the core, so just use
2373 * those, converting the result to UTF-8. The only difference between upper
2374 * and title case in this range is that LATIN_SMALL_LETTER_SHARP_S is
2375 * either "SS" or "Ss". Which one to use is passed into the routine in
2376 * 'S_or_s' to avoid a test */
2378 UV converted = toUPPER_LATIN1_MOD(c);
2380 PERL_ARGS_ASSERT__TO_UPPER_TITLE_LATIN1;
2382 assert(S_or_s == 'S' || S_or_s == 's');
2384 if (UVCHR_IS_INVARIANT(converted)) { /* No difference between the two for
2385 characters in this range */
2386 *p = (U8) converted;
2391 /* toUPPER_LATIN1_MOD gives the correct results except for three outliers,
2392 * which it maps to one of them, so as to only have to have one check for
2393 * it in the main case */
2394 if (UNLIKELY(converted == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
2396 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
2397 converted = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS;
2400 converted = GREEK_CAPITAL_LETTER_MU;
2402 #if UNICODE_MAJOR_VERSION > 2 \
2403 || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
2404 && UNICODE_DOT_DOT_VERSION >= 8)
2405 case LATIN_SMALL_LETTER_SHARP_S:
2412 Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect"
2413 " '%c' to map to '%c'",
2414 c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
2415 NOT_REACHED; /* NOTREACHED */
2419 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2420 *p = UTF8_TWO_BYTE_LO(converted);
2426 /* Call the function to convert a UTF-8 encoded character to the specified case.
2427 * Note that there may be more than one character in the result.
2428 * INP is a pointer to the first byte of the input character
2429 * OUTP will be set to the first byte of the string of changed characters. It
2430 * needs to have space for UTF8_MAXBYTES_CASE+1 bytes
2431 * LENP will be set to the length in bytes of the string of changed characters
2433 * The functions return the ordinal of the first character in the string of
2435 #define CALL_UPPER_CASE(uv, s, d, lenp) \
2436 _to_utf8_case(uv, s, d, lenp, &PL_utf8_toupper, "ToUc", "")
2437 #define CALL_TITLE_CASE(uv, s, d, lenp) \
2438 _to_utf8_case(uv, s, d, lenp, &PL_utf8_totitle, "ToTc", "")
2439 #define CALL_LOWER_CASE(uv, s, d, lenp) \
2440 _to_utf8_case(uv, s, d, lenp, &PL_utf8_tolower, "ToLc", "")
2442 /* This additionally has the input parameter 'specials', which if non-zero will
2443 * cause this to use the specials hash for folding (meaning get full case
2444 * folding); otherwise, when zero, this implies a simple case fold */
2445 #define CALL_FOLD_CASE(uv, s, d, lenp, specials) \
2446 _to_utf8_case(uv, s, d, lenp, &PL_utf8_tofold, "ToCf", (specials) ? "" : NULL)
2449 Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
2451 /* Convert the Unicode character whose ordinal is <c> to its uppercase
2452 * version and store that in UTF-8 in <p> and its length in bytes in <lenp>.
2453 * Note that the <p> needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
2454 * the changed version may be longer than the original character.
2456 * The ordinal of the first character of the changed version is returned
2457 * (but note, as explained above, that there may be more.) */
2459 PERL_ARGS_ASSERT_TO_UNI_UPPER;
2462 return _to_upper_title_latin1((U8) c, p, lenp, 'S');
2465 uvchr_to_utf8(p, c);
2466 return CALL_UPPER_CASE(c, p, p, lenp);
2470 Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
2472 PERL_ARGS_ASSERT_TO_UNI_TITLE;
2475 return _to_upper_title_latin1((U8) c, p, lenp, 's');
2478 uvchr_to_utf8(p, c);
2479 return CALL_TITLE_CASE(c, p, p, lenp);
2483 S_to_lower_latin1(const U8 c, U8* p, STRLEN *lenp, const char dummy)
2485 /* We have the latin1-range values compiled into the core, so just use
2486 * those, converting the result to UTF-8. Since the result is always just
2487 * one character, we allow <p> to be NULL */
2489 U8 converted = toLOWER_LATIN1(c);
2491 PERL_UNUSED_ARG(dummy);
2494 if (NATIVE_BYTE_IS_INVARIANT(converted)) {
2499 /* Result is known to always be < 256, so can use the EIGHT_BIT
2501 *p = UTF8_EIGHT_BIT_HI(converted);
2502 *(p+1) = UTF8_EIGHT_BIT_LO(converted);
2510 Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
2512 PERL_ARGS_ASSERT_TO_UNI_LOWER;
2515 return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
2518 uvchr_to_utf8(p, c);
2519 return CALL_LOWER_CASE(c, p, p, lenp);
2523 Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
2524 const unsigned int flags)
2526 /* Corresponds to to_lower_latin1(); <flags> bits meanings:
2527 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2528 * FOLD_FLAGS_FULL iff full folding is to be used;
2530 * Not to be used for locale folds
2535 PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
2536 PERL_UNUSED_CONTEXT;
2538 assert (! (flags & FOLD_FLAGS_LOCALE));
2540 if (UNLIKELY(c == MICRO_SIGN)) {
2541 converted = GREEK_SMALL_LETTER_MU;
2543 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
2544 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
2545 || UNICODE_DOT_DOT_VERSION > 0)
2546 else if ( (flags & FOLD_FLAGS_FULL)
2547 && UNLIKELY(c == LATIN_SMALL_LETTER_SHARP_S))
2549 /* If can't cross 127/128 boundary, can't return "ss"; instead return
2550 * two U+017F characters, as fc("\df") should eq fc("\x{17f}\x{17f}")
2551 * under those circumstances. */
2552 if (flags & FOLD_FLAGS_NOMIX_ASCII) {
2553 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
2554 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
2556 return LATIN_SMALL_LETTER_LONG_S;
2566 else { /* In this range the fold of all other characters is their lower
2568 converted = toLOWER_LATIN1(c);
2571 if (UVCHR_IS_INVARIANT(converted)) {
2572 *p = (U8) converted;
2576 *(p)++ = UTF8_TWO_BYTE_HI(converted);
2577 *p = UTF8_TWO_BYTE_LO(converted);
2585 Perl__to_uni_fold_flags(pTHX_ UV c, U8* p, STRLEN *lenp, U8 flags)
2588 /* Not currently externally documented, and subject to change
2589 * <flags> bits meanings:
2590 * FOLD_FLAGS_FULL iff full folding is to be used;
2591 * FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
2592 * locale are to be used.
2593 * FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
2596 PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
2598 if (flags & FOLD_FLAGS_LOCALE) {
2599 /* Treat a UTF-8 locale as not being in locale at all */
2600 if (IN_UTF8_CTYPE_LOCALE) {
2601 flags &= ~FOLD_FLAGS_LOCALE;
2604 _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
2605 goto needs_full_generality;
2610 return _to_fold_latin1((U8) c, p, lenp,
2611 flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
2614 /* Here, above 255. If no special needs, just use the macro */
2615 if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
2616 uvchr_to_utf8(p, c);
2617 return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
2619 else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
2620 the special flags. */
2621 U8 utf8_c[UTF8_MAXBYTES + 1];
2623 needs_full_generality:
2624 uvchr_to_utf8(utf8_c, c);
2625 return _toFOLD_utf8_flags(utf8_c, utf8_c + sizeof(utf8_c),
2630 PERL_STATIC_INLINE bool
2631 S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
2632 const char *const swashname, SV* const invlist)
2634 /* returns a boolean giving whether or not the UTF8-encoded character that
2635 * starts at <p> is in the swash indicated by <swashname>. <swash>
2636 * contains a pointer to where the swash indicated by <swashname>
2637 * is to be stored; which this routine will do, so that future calls will
2638 * look at <*swash> and only generate a swash if it is not null. <invlist>
2639 * is NULL or an inversion list that defines the swash. If not null, it
2640 * saves time during initialization of the swash.
2642 * Note that it is assumed that the buffer length of <p> is enough to
2643 * contain all the bytes that comprise the character. Thus, <*p> should
2644 * have been checked before this call for mal-formedness enough to assure
2647 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
2649 /* The API should have included a length for the UTF-8 character in <p>,
2650 * but it doesn't. We therefore assume that p has been validated at least
2651 * as far as there being enough bytes available in it to accommodate the
2652 * character without reading beyond the end, and pass that number on to the
2653 * validating routine */
2654 if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
2655 _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
2656 _UTF8_NO_CONFIDENCE_IN_CURLEN,
2658 NOT_REACHED; /* NOTREACHED */
2662 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2663 *swash = _core_swash_init("utf8",
2665 /* Only use the name if there is no inversion
2666 * list; otherwise will go out to disk */
2667 (invlist) ? "" : swashname,
2669 &PL_sv_undef, 1, 0, invlist, &flags);
2672 return swash_fetch(*swash, p, TRUE) != 0;
2675 PERL_STATIC_INLINE bool
2676 S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
2677 SV **swash, const char *const swashname,
2680 /* returns a boolean giving whether or not the UTF8-encoded character that
2681 * starts at <p>, and extending no further than <e - 1> is in the swash
2682 * indicated by <swashname>. <swash> contains a pointer to where the swash
2683 * indicated by <swashname> is to be stored; which this routine will do, so
2684 * that future calls will look at <*swash> and only generate a swash if it
2685 * is not null. <invlist> is NULL or an inversion list that defines the
2686 * swash. If not null, it saves time during initialization of the swash.
2689 PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
2691 if (! isUTF8_CHAR(p, e)) {
2692 _force_out_malformed_utf8_message(p, e, 0, 1);
2693 NOT_REACHED; /* NOTREACHED */
2697 U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
2698 *swash = _core_swash_init("utf8",
2700 /* Only use the name if there is no inversion
2701 * list; otherwise will go out to disk */
2702 (invlist) ? "" : swashname,
2704 &PL_sv_undef, 1, 0, invlist, &flags);
2707 return swash_fetch(*swash, p, TRUE) != 0;
2711 S_warn_on_first_deprecated_use(pTHX_ const char * const name,
2712 const char * const alternative,
2713 const bool use_locale,
2714 const char * const file,
2715 const unsigned line)
2719 PERL_ARGS_ASSERT_WARN_ON_FIRST_DEPRECATED_USE;
2721 if (ckWARN_d(WARN_DEPRECATED)) {
2723 key = Perl_form(aTHX_ "%s;%d;%s;%d", name, use_locale, file, line);
2724 if (! hv_fetch(PL_seen_deprecated_macro, key, strlen(key), 0)) {
2725 if (! PL_seen_deprecated_macro) {
2726 PL_seen_deprecated_macro = newHV();
2728 if (! hv_store(PL_seen_deprecated_macro, key,
2729 strlen(key), &PL_sv_undef, 0))
2731 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
2734 if (instr(file, "mathoms.c")) {
2735 Perl_warner(aTHX_ WARN_DEPRECATED,
2736 "In %s, line %d, starting in Perl v5.30, %s()"
2737 " will be removed. Avoid this message by"
2738 " converting to use %s().\n",
2739 file, line, name, alternative);
2742 Perl_warner(aTHX_ WARN_DEPRECATED,
2743 "In %s, line %d, starting in Perl v5.30, %s() will"
2744 " require an additional parameter. Avoid this"
2745 " message by converting to use %s().\n",
2746 file, line, name, alternative);
2753 Perl__is_utf8_FOO(pTHX_ U8 classnum,
2755 const char * const name,
2756 const char * const alternative,
2757 const bool use_utf8,
2758 const bool use_locale,
2759 const char * const file,
2760 const unsigned line)
2762 PERL_ARGS_ASSERT__IS_UTF8_FOO;
2764 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
2766 if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
2776 case _CC_ALPHANUMERIC:
2780 return is_utf8_common(p,
2781 &PL_utf8_swash_ptrs[classnum],
2782 swash_property_names[classnum],
2783 PL_XPosix_ptrs[classnum]);
2786 return is_XPERLSPACE_high(p);
2788 return is_HORIZWS_high(p);
2790 return is_XDIGIT_high(p);
2796 return is_VERTWS_high(p);
2798 if (! PL_utf8_perl_idstart) {
2799 PL_utf8_perl_idstart
2800 = _new_invlist_C_array(_Perl_IDStart_invlist);
2802 return is_utf8_common(p, &PL_utf8_perl_idstart,
2803 "_Perl_IDStart", NULL);
2805 if (! PL_utf8_perl_idcont) {
2807 = _new_invlist_C_array(_Perl_IDCont_invlist);
2809 return is_utf8_common(p, &PL_utf8_perl_idcont,
2810 "_Perl_IDCont", NULL);
2814 /* idcont is the same as wordchar below 256 */
2815 if (classnum == _CC_IDCONT) {
2816 classnum = _CC_WORDCHAR;
2818 else if (classnum == _CC_IDFIRST) {
2822 classnum = _CC_ALPHA;
2826 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2827 return _generic_isCC(*p, classnum);
2830 return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
2833 if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
2834 return isFOO_lc(classnum, *p);
2837 return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
2840 NOT_REACHED; /* NOTREACHED */
2844 Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
2847 PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
2849 assert(classnum < _FIRST_NON_SWASH_CC);
2851 return is_utf8_common_with_len(p,
2853 &PL_utf8_swash_ptrs[classnum],
2854 swash_property_names[classnum],
2855 PL_XPosix_ptrs[classnum]);
2859 Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
2863 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
2865 if (! PL_utf8_perl_idstart) {
2866 invlist = _new_invlist_C_array(_Perl_IDStart_invlist);
2868 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idstart,
2869 "_Perl_IDStart", invlist);
2873 Perl__is_utf8_xidstart(pTHX_ const U8 *p)
2875 PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
2879 return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
2883 Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
2887 PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
2889 if (! PL_utf8_perl_idcont) {
2890 invlist = _new_invlist_C_array(_Perl_IDCont_invlist);
2892 return is_utf8_common_with_len(p, e, &PL_utf8_perl_idcont,
2893 "_Perl_IDCont", invlist);
2897 Perl__is_utf8_idcont(pTHX_ const U8 *p)
2899 PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
2901 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
2905 Perl__is_utf8_xidcont(pTHX_ const U8 *p)
2907 PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
2909 return is_utf8_common(p, &PL_utf8_idcont, "XIdContinue", NULL);
2913 Perl__is_utf8_mark(pTHX_ const U8 *p)
2915 PERL_ARGS_ASSERT__IS_UTF8_MARK;
2917 return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
2920 /* change namve uv1 to 'from' */
2922 S__to_utf8_case(pTHX_ const UV uv1, const U8 *p, U8* ustrp, STRLEN *lenp,
2923 SV **swashp, const char *normal, const char *special)
2927 PERL_ARGS_ASSERT__TO_UTF8_CASE;
2929 /* For code points that don't change case, we already know that the output
2930 * of this function is the unchanged input, so we can skip doing look-ups
2931 * for them. Unfortunately the case-changing code points are scattered
2932 * around. But there are some long consecutive ranges where there are no
2933 * case changing code points. By adding tests, we can eliminate the lookup
2934 * for all the ones in such ranges. This is currently done here only for
2935 * just a few cases where the scripts are in common use in modern commerce
2936 * (and scripts adjacent to those which can be included without additional
2939 if (uv1 >= 0x0590) {
2940 /* This keeps from needing further processing the code points most
2941 * likely to be used in the following non-cased scripts: Hebrew,
2942 * Arabic, Syriac, Thaana, NKo, Samaritan, Mandaic, Devanagari,
2943 * Bengali, Gurmukhi, Gujarati, Oriya, Tamil, Telugu, Kannada,
2944 * Malayalam, Sinhala, Thai, Lao, Tibetan, Myanmar */
2949 /* The following largish code point ranges also don't have case
2950 * changes, but khw didn't think they warranted extra tests to speed
2951 * them up (which would slightly slow down everything else above them):
2952 * 1100..139F Hangul Jamo, Ethiopic
2953 * 1400..1CFF Unified Canadian Aboriginal Syllabics, Ogham, Runic,
2954 * Tagalog, Hanunoo, Buhid, Tagbanwa, Khmer, Mongolian,
2955 * Limbu, Tai Le, New Tai Lue, Buginese, Tai Tham,
2956 * Combining Diacritical Marks Extended, Balinese,
2957 * Sundanese, Batak, Lepcha, Ol Chiki
2958 * 2000..206F General Punctuation
2961 if (uv1 >= 0x2D30) {
2963 /* This keeps the from needing further processing the code points
2964 * most likely to be used in the following non-cased major scripts:
2965 * CJK, Katakana, Hiragana, plus some less-likely scripts.
2967 * (0x2D30 above might have to be changed to 2F00 in the unlikely
2968 * event that Unicode eventually allocates the unused block as of
2969 * v8.0 2FE0..2FEF to code points that are cased. khw has verified
2970 * that the test suite will start having failures to alert you
2971 * should that happen) */
2976 if (uv1 >= 0xAC00) {
2977 if (UNLIKELY(UNICODE_IS_SURROGATE(uv1))) {
2978 if (ckWARN_d(WARN_SURROGATE)) {
2979 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
2980 Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
2981 "Operation \"%s\" returns its argument for"
2982 " UTF-16 surrogate U+%04" UVXf, desc, uv1);
2987 /* AC00..FAFF Catches Hangul syllables and private use, plus
2994 if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
2995 if ( UNLIKELY(uv1 > MAX_NON_DEPRECATED_CP)
2996 && ckWARN_d(WARN_DEPRECATED))
2998 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
2999 cp_above_legal_max, uv1, MAX_NON_DEPRECATED_CP);
3001 if (ckWARN_d(WARN_NON_UNICODE)) {
3002 const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
3003 Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
3004 "Operation \"%s\" returns its argument for"
3005 " non-Unicode code point 0x%04" UVXf, desc, uv1);
3009 #ifdef HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C
3011 > HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
3014 /* As of Unicode 10.0, this means we avoid swash creation
3015 * for anything beyond high Plane 1 (below emojis) */
3022 /* Note that non-characters are perfectly legal, so no warning should
3023 * be given. There are so few of them, that it isn't worth the extra
3024 * tests to avoid swash creation */
3027 if (!*swashp) /* load on-demand */
3028 *swashp = _core_swash_init("utf8", normal, &PL_sv_undef,
3032 /* It might be "special" (sometimes, but not always,
3033 * a multicharacter mapping) */
3037 /* If passed in the specials name, use that; otherwise use any
3038 * given in the swash */
3039 if (*special != '\0') {
3040 hv = get_hv(special, 0);
3043 svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
3045 hv = MUTABLE_HV(SvRV(*svp));
3050 && (svp = hv_fetch(hv, (const char*)p, UVCHR_SKIP(uv1), FALSE))
3055 s = SvPV_const(*svp, len);
3058 len = uvchr_to_utf8(ustrp, *(U8*)s) - ustrp;
3060 Copy(s, ustrp, len, U8);
3065 if (!len && *swashp) {
3066 const UV uv2 = swash_fetch(*swashp, p, TRUE /* => is UTF-8 */);
3069 /* It was "normal" (a single character mapping). */
3070 len = uvchr_to_utf8(ustrp, uv2) - ustrp;
3078 return valid_utf8_to_uvchr(ustrp, 0);
3081 /* Here, there was no mapping defined, which means that the code point maps
3082 * to itself. Return the inputs */
3085 if (p != ustrp) { /* Don't copy onto itself */
3086 Copy(p, ustrp, len, U8);
3097 S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
3098 U8* const ustrp, STRLEN *lenp)
3100 /* This is called when changing the case of a UTF-8-encoded character above
3101 * the Latin1 range, and the operation is in a non-UTF-8 locale. If the
3102 * result contains a character that crosses the 255/256 boundary, disallow
3103 * the change, and return the original code point. See L<perlfunc/lc> for
3106 * p points to the original string whose case was changed; assumed
3107 * by this routine to be well-formed
3108 * result the code point of the first character in the changed-case string
3109 * ustrp points to the changed-case string (<result> represents its
3111 * lenp points to the length of <ustrp> */
3113 UV original; /* To store the first code point of <p> */
3115 PERL_ARGS_ASSERT_CHECK_LOCALE_BOUNDARY_CROSSING;
3117 assert(UTF8_IS_ABOVE_LATIN1(*p));
3119 /* We know immediately if the first character in the string crosses the
3120 * boundary, so can skip */
3123 /* Look at every character in the result; if any cross the
3124 * boundary, the whole thing is disallowed */
3125 U8* s = ustrp + UTF8SKIP(ustrp);
3126 U8* e = ustrp + *lenp;
3128 if (! UTF8_IS_ABOVE_LATIN1(*s)) {
3134 /* Here, no characters crossed, result is ok as-is, but we warn. */
3135 _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
3141 /* Failed, have to return the original */
3142 original = valid_utf8_to_uvchr(p, lenp);
3144 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3145 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3146 "Can't do %s(\"\\x{%" UVXf "}\") on non-UTF-8"
3147 " locale; resolved to \"\\x{%" UVXf "}\".",
3151 Copy(p, ustrp, *lenp, char);
3156 S_check_and_deprecate(pTHX_ const U8 *p,
3158 const unsigned int type, /* See below */
3159 const bool use_locale, /* Is this a 'LC_'
3161 const char * const file,
3162 const unsigned line)
3164 /* This is a temporary function to deprecate the unsafe calls to the case
3165 * changing macros and functions. It keeps all the special stuff in just
3168 * It updates *e with the pointer to the end of the input string. If using
3169 * the old-style macros, *e is NULL on input, and so this function assumes
3170 * the input string is long enough to hold the entire UTF-8 sequence, and
3171 * sets *e accordingly, but it then returns a flag to pass the
3172 * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
3173 * using the full length if possible.
3175 * It also does the assert that *e > p when *e is not NULL. This should be
3176 * migrated to the callers when this function gets deleted.
3178 * The 'type' parameter is used for the caller to specify which case
3179 * changing function this is called from: */
3181 # define DEPRECATE_TO_UPPER 0
3182 # define DEPRECATE_TO_TITLE 1
3183 # define DEPRECATE_TO_LOWER 2
3184 # define DEPRECATE_TO_FOLD 3
3186 U32 utf8n_flags = 0;
3188 const char * alternative;
3190 PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
3193 utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
3194 *e = p + UTF8SKIP(p);
3196 /* For mathoms.c calls, we use the function name we know is stored
3197 * there. It could be part of a larger path */
3198 if (type == DEPRECATE_TO_UPPER) {
3199 name = instr(file, "mathoms.c")
3202 alternative = "toUPPER_utf8_safe";
3204 else if (type == DEPRECATE_TO_TITLE) {
3205 name = instr(file, "mathoms.c")
3208 alternative = "toTITLE_utf8_safe";
3210 else if (type == DEPRECATE_TO_LOWER) {
3211 name = instr(file, "mathoms.c")
3214 alternative = "toLOWER_utf8_safe";
3216 else if (type == DEPRECATE_TO_FOLD) {
3217 name = instr(file, "mathoms.c")
3220 alternative = "toFOLD_utf8_safe";
3222 else Perl_croak(aTHX_ "panic: Unexpected case change type");
3224 warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
3233 /* The process for changing the case is essentially the same for the four case
3234 * change types, except there are complications for folding. Otherwise the
3235 * difference is only which case to change to. To make sure that they all do
3236 * the same thing, the bodies of the functions are extracted out into the
3237 * following two macros. The functions are written with the same variable
3238 * names, and these are known and used inside these macros. It would be
3239 * better, of course, to have inline functions to do it, but since different
3240 * macros are called, depending on which case is being changed to, this is not
3241 * feasible in C (to khw's knowledge). Two macros are created so that the fold
3242 * function can start with the common start macro, then finish with its special
3243 * handling; while the other three cases can just use the common end macro.
3245 * The algorithm is to use the proper (passed in) macro or function to change
3246 * the case for code points that are below 256. The macro is used if using
3247 * locale rules for the case change; the function if not. If the code point is
3248 * above 255, it is computed from the input UTF-8, and another macro is called
3249 * to do the conversion. If necessary, the output is converted to UTF-8. If
3250 * using a locale, we have to check that the change did not cross the 255/256
3251 * boundary, see check_locale_boundary_crossing() for further details.
3253 * The macros are split with the correct case change for the below-256 case
3254 * stored into 'result', and in the middle of an else clause for the above-255
3255 * case. At that point in the 'else', 'result' is not the final result, but is
3256 * the input code point calculated from the UTF-8. The fold code needs to
3257 * realize all this and take it from there.
3259 * If you read the two macros as sequential, it's easier to understand what's
3261 #define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
3262 L1_func_extra_param) \
3264 if (flags & (locale_flags)) { \
3265 /* Treat a UTF-8 locale as not being in locale at all */ \
3266 if (IN_UTF8_CTYPE_LOCALE) { \
3267 flags &= ~(locale_flags); \
3270 _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
3274 if (UTF8_IS_INVARIANT(*p)) { \
3275 if (flags & (locale_flags)) { \
3276 result = LC_L1_change_macro(*p); \
3279 return L1_func(*p, ustrp, lenp, L1_func_extra_param); \
3282 else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
3283 if (flags & (locale_flags)) { \
3284 result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
3288 return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
3289 ustrp, lenp, L1_func_extra_param); \
3292 else { /* malformed UTF-8 or ord above 255 */ \
3293 STRLEN len_result; \
3294 result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
3295 if (len_result == (STRLEN) -1) { \
3296 _force_out_malformed_utf8_message(p, e, utf8n_flags, \
3300 #define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
3301 result = change_macro(result, p, ustrp, lenp); \
3303 if (flags & (locale_flags)) { \
3304 result = check_locale_boundary_crossing(p, result, ustrp, lenp); \
3309 /* Here, used locale rules. Convert back to UTF-8 */ \
3310 if (UTF8_IS_INVARIANT(result)) { \
3311 *ustrp = (U8) result; \
3315 *ustrp = UTF8_EIGHT_BIT_HI((U8) result); \
3316 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result); \
3323 =for apidoc to_utf8_upper
3325 Instead use L</toUPPER_utf8_safe>.
3329 /* Not currently externally documented, and subject to change:
3330 * <flags> is set iff iff the rules from the current underlying locale are to
3334 Perl__to_utf8_upper_flags(pTHX_ const U8 *p,
3339 const char * const file,
3343 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
3344 cBOOL(flags), file, line);
3346 PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
3348 /* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
3349 /* 2nd char of uc(U+DF) is 'S' */
3350 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
3351 CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
3355 =for apidoc to_utf8_title
3357 Instead use L</toTITLE_utf8_safe>.
3361 /* Not currently externally documented, and subject to change:
3362 * <flags> is set iff the rules from the current underlying locale are to be
3363 * used. Since titlecase is not defined in POSIX, for other than a
3364 * UTF-8 locale, uppercase is used instead for code points < 256.
3368 Perl__to_utf8_title_flags(pTHX_ const U8 *p,
3373 const char * const file,
3377 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
3378 cBOOL(flags), file, line);
3380 PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
3382 /* 2nd char of ucfirst(U+DF) is 's' */
3383 CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
3384 CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
3388 =for apidoc to_utf8_lower
3390 Instead use L</toLOWER_utf8_safe>.
3394 /* Not currently externally documented, and subject to change:
3395 * <flags> is set iff iff the rules from the current underlying locale are to
3400 Perl__to_utf8_lower_flags(pTHX_ const U8 *p,
3405 const char * const file,
3409 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
3410 cBOOL(flags), file, line);
3412 PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
3414 CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
3415 CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
3419 =for apidoc to_utf8_fold
3421 Instead use L</toFOLD_utf8_safe>.
3425 /* Not currently externally documented, and subject to change,
3427 * bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
3428 * locale are to be used.
3429 * bit FOLD_FLAGS_FULL is set iff full case folds are to be used;
3430 * otherwise simple folds
3431 * bit FOLD_FLAGS_NOMIX_ASCII is set iff folds of non-ASCII to ASCII are
3436 Perl__to_utf8_fold_flags(pTHX_ const U8 *p,
3441 const char * const file,
3445 const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
3446 cBOOL(flags), file, line);
3448 PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
3450 /* These are mutually exclusive */
3451 assert (! ((flags & FOLD_FLAGS_LOCALE) && (flags & FOLD_FLAGS_NOMIX_ASCII)));
3453 assert(p != ustrp); /* Otherwise overwrites */
3455 CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
3456 ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
3458 result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
3460 if (flags & FOLD_FLAGS_LOCALE) {
3462 # define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
3463 const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
3465 # ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3466 # define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
3468 const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
3470 /* Special case these two characters, as what normally gets
3471 * returned under locale doesn't work */
3472 if (UTF8SKIP(p) == cap_sharp_s_len
3473 && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
3475 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3476 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3477 "Can't do fc(\"\\x{1E9E}\") on non-UTF-8 locale; "
3478 "resolved to \"\\x{17F}\\x{17F}\".");
3483 if (UTF8SKIP(p) == long_s_t_len
3484 && memEQ((char *) p, LONG_S_T, long_s_t_len))
3486 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3487 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3488 "Can't do fc(\"\\x{FB05}\") on non-UTF-8 locale; "
3489 "resolved to \"\\x{FB06}\".");
3490 goto return_ligature_st;
3493 #if UNICODE_MAJOR_VERSION == 3 \
3494 && UNICODE_DOT_VERSION == 0 \
3495 && UNICODE_DOT_DOT_VERSION == 1
3496 # define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
3498 /* And special case this on this Unicode version only, for the same
3499 * reaons the other two are special cased. They would cross the
3500 * 255/256 boundary which is forbidden under /l, and so the code
3501 * wouldn't catch that they are equivalent (which they are only in
3503 else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
3504 && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
3506 /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
3507 Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
3508 "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
3509 "resolved to \"\\x{0131}\".");
3510 goto return_dotless_i;
3514 return check_locale_boundary_crossing(p, result, ustrp, lenp);
3516 else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
3520 /* This is called when changing the case of a UTF-8-encoded
3521 * character above the ASCII range, and the result should not
3522 * contain an ASCII character. */
3524 UV original; /* To store the first code point of <p> */
3526 /* Look at every character in the result; if any cross the
3527 * boundary, the whole thing is disallowed */
3529 U8* e = ustrp + *lenp;
3532 /* Crossed, have to return the original */
3533 original = valid_utf8_to_uvchr(p, lenp);
3535 /* But in these instances, there is an alternative we can
3536 * return that is valid */
3537 if (original == LATIN_SMALL_LETTER_SHARP_S
3538 #ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
3539 || original == LATIN_CAPITAL_LETTER_SHARP_S
3544 else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
3545 goto return_ligature_st;
3547 #if UNICODE_MAJOR_VERSION == 3 \
3548 && UNICODE_DOT_VERSION == 0 \
3549 && UNICODE_DOT_DOT_VERSION == 1
3551 else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
3552 goto return_dotless_i;
3555 Copy(p, ustrp, *lenp, char);
3561 /* Here, no characters crossed, result is ok as-is */
3566 /* Here, used locale rules. Convert back to UTF-8 */
3567 if (UTF8_IS_INVARIANT(result)) {
3568 *ustrp = (U8) result;
3572 *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
3573 *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
3580 /* Certain folds to 'ss' are prohibited by the options, but they do allow
3581 * folds to a string of two of these characters. By returning this
3582 * instead, then, e.g.,
3583 * fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")
3586 *lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
3587 Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
3589 return LATIN_SMALL_LETTER_LONG_S;
3592 /* Two folds to 'st' are prohibited by the options; instead we pick one and
3593 * have the other one fold to it */
3595 *lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
3596 Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
3597 return LATIN_SMALL_LIGATURE_ST;
3599 #if UNICODE_MAJOR_VERSION == 3 \
3600 && UNICODE_DOT_VERSION == 0 \
3601 && UNICODE_DOT_DOT_VERSION == 1
3604 *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
3605 Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
3606 return LATIN_SMALL_LETTER_DOTLESS_I;
3613 * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
3614 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
3615 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
3619 Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
3620 I32 minbits, I32 none)
3622 PERL_ARGS_ASSERT_SWASH_INIT;
3624 /* Returns a copy of a swash initiated by the called function. This is the
3625 * public interface, and returning a copy prevents others from doing
3626 * mischief on the original */
3628 return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
3633 Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
3634 I32 minbits, I32 none, SV* invlist,
3638 /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
3639 * use the following define */
3641 #define CORE_SWASH_INIT_RETURN(x) \
3642 PL_curpm= old_PL_curpm; \
3645 /* Initialize and return a swash, creating it if necessary. It does this
3646 * by calling utf8_heavy.pl in the general case. The returned value may be
3647 * the swash's inversion list instead if the input parameters allow it.
3648 * Which is returned should be immaterial to callers, as the only
3649 * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
3650 * and swash_to_invlist() handle both these transparently.
3652 * This interface should only be used by functions that won't destroy or
3653 * adversely change the swash, as doing so affects all other uses of the
3654 * swash in the program; the general public should use 'Perl_swash_init'
3657 * pkg is the name of the package that <name> should be in.
3658 * name is the name of the swash to find. Typically it is a Unicode
3659 * property name, including user-defined ones
3660 * listsv is a string to initialize the swash with. It must be of the form
3661 * documented as the subroutine return value in
3662 * L<perlunicode/User-Defined Character Properties>
3663 * minbits is the number of bits required to represent each data element.
3664 * It is '1' for binary properties.
3665 * none I (khw) do not understand this one, but it is used only in tr///.
3666 * invlist is an inversion list to initialize the swash with (or NULL)
3667 * flags_p if non-NULL is the address of various input and output flag bits
3668 * to the routine, as follows: ('I' means is input to the routine;
3669 * 'O' means output from the routine. Only flags marked O are
3670 * meaningful on return.)
3671 * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
3672 * came from a user-defined property. (I O)
3673 * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
3674 * when the swash cannot be located, to simply return NULL. (I)
3675 * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
3676 * return of an inversion list instead of a swash hash if this routine
3677 * thinks that would result in faster execution of swash_fetch() later
3680 * Thus there are three possible inputs to find the swash: <name>,
3681 * <listsv>, and <invlist>. At least one must be specified. The result
3682 * will be the union of the specified ones, although <listsv>'s various
3683 * actions can intersect, etc. what <name> gives. To avoid going out to
3684 * disk at all, <invlist> should specify completely what the swash should
3685 * have, and <listsv> should be &PL_sv_undef and <name> should be "".
3687 * <invlist> is only valid for binary properties */
3689 PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
3691 SV* retval = &PL_sv_undef;
3692 HV* swash_hv = NULL;
3693 const int invlist_swash_boundary =
3694 (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST)
3695 ? 512 /* Based on some benchmarking, but not extensive, see commit
3697 : -1; /* Never return just an inversion list */
3699 assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
3700 assert(! invlist || minbits == 1);
3702 PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
3703 regex that triggered the swash init and the swash init
3704 perl logic itself. See perl #122747 */
3706 /* If data was passed in to go out to utf8_heavy to find the swash of, do
3708 if (listsv != &PL_sv_undef || strNE(name, "")) {
3710 const size_t pkg_len = strlen(pkg);
3711 const size_t name_len = strlen(name);
3712 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
3716 PERL_ARGS_ASSERT__CORE_SWASH_INIT;
3718 PUSHSTACKi(PERLSI_MAGIC);
3722 /* We might get here via a subroutine signature which uses a utf8
3723 * parameter name, at which point PL_subname will have been set
3724 * but not yet used. */
3725 save_item(PL_subname);
3726 if (PL_parser && PL_parser->error_count)
3727 SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
3728 method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
3729 if (!method) { /* demand load UTF-8 */
3731 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3732 GvSV(PL_errgv) = NULL;
3733 #ifndef NO_TAINT_SUPPORT
3734 /* It is assumed that callers of this routine are not passing in
3735 * any user derived data. */
3736 /* Need to do this after save_re_context() as it will set
3737 * PL_tainted to 1 while saving $1 etc (see the code after getrx:
3738 * in Perl_magic_get). Even line to create errsv_save can turn on
3740 SAVEBOOL(TAINT_get);
3743 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
3746 /* Not ERRSV, as there is no need to vivify a scalar we are
3747 about to discard. */
3748 SV * const errsv = GvSV(PL_errgv);
3749 if (!SvTRUE(errsv)) {
3750 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3751 SvREFCNT_dec(errsv);
3759 mPUSHp(pkg, pkg_len);
3760 mPUSHp(name, name_len);
3765 if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
3766 GvSV(PL_errgv) = NULL;
3767 /* If we already have a pointer to the method, no need to use
3768 * call_method() to repeat the lookup. */
3770 ? call_sv(MUTABLE_SV(method), G_SCALAR)
3771 : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
3773 retval = *PL_stack_sp--;
3774 SvREFCNT_inc(retval);
3777 /* Not ERRSV. See above. */
3778 SV * const errsv = GvSV(PL_errgv);
3779 if (!SvTRUE(errsv)) {
3780 GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
3781 SvREFCNT_dec(errsv);
3786 if (IN_PERL_COMPILETIME) {
3787 CopHINTS_set(PL_curcop, PL_hints);
3789 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
3790 if (SvPOK(retval)) {
3792 /* If caller wants to handle missing properties, let them */
3793 if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
3794 CORE_SWASH_INIT_RETURN(NULL);
3797 "Can't find Unicode property definition \"%" SVf "\"",
3799 NOT_REACHED; /* NOTREACHED */
3802 } /* End of calling the module to find the swash */
3804 /* If this operation fetched a swash, and we will need it later, get it */
3805 if (retval != &PL_sv_undef
3806 && (minbits == 1 || (flags_p
3808 & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
3810 swash_hv = MUTABLE_HV(SvRV(retval));
3812 /* If we don't already know that there is a user-defined component to
3813 * this swash, and the user has indicated they wish to know if there is
3814 * one (by passing <flags_p>), find out */
3815 if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
3816 SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
3817 if (user_defined && SvUV(*user_defined)) {
3818 *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
3823 /* Make sure there is an inversion list for binary properties */
3825 SV** swash_invlistsvp = NULL;
3826 SV* swash_invlist = NULL;
3827 bool invlist_in_swash_is_valid = FALSE;
3828 bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
3829 an unclaimed reference count */
3831 /* If this operation fetched a swash, get its already existing
3832 * inversion list, or create one for it */
3835 swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
3836 if (swash_invlistsvp) {
3837 swash_invlist = *swash_invlistsvp;
3838 invlist_in_swash_is_valid = TRUE;
3841 swash_invlist = _swash_to_invlist(retval);
3842 swash_invlist_unclaimed = TRUE;
3846 /* If an inversion list was passed in, have to include it */
3849 /* Any fetched swash will by now have an inversion list in it;
3850 * otherwise <swash_invlist> will be NULL, indicating that we
3851 * didn't fetch a swash */
3852 if (swash_invlist) {
3854 /* Add the passed-in inversion list, which invalidates the one
3855 * already stored in the swash */
3856 invlist_in_swash_is_valid = FALSE;
3857 SvREADONLY_off(swash_invlist); /* Turned on again below */
3858 _invlist_union(invlist, swash_invlist, &swash_invlist);
3862 /* Here, there is no swash already. Set up a minimal one, if
3863 * we are going to return a swash */
3864 if ((int) _invlist_len(invlist) > invlist_swash_boundary) {
3866 retval = newRV_noinc(MUTABLE_SV(swash_hv));
3868 swash_invlist = invlist;
3872 /* Here, we have computed the union of all the passed-in data. It may
3873 * be that there was an inversion list in the swash which didn't get
3874 * touched; otherwise save the computed one */
3875 if (! invlist_in_swash_is_valid
3876 && (int) _invlist_len(swash_invlist) > invlist_swash_boundary)
3878 if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
3880 Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
3882 /* We just stole a reference count. */
3883 if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
3884 else SvREFCNT_inc_simple_void_NN(swash_invlist);
3887 /* The result is immutable. Forbid attempts to change it. */
3888 SvREADONLY_on(swash_invlist);
3890 /* Use the inversion list stand-alone if small enough */
3891 if ((int) _invlist_len(swash_invlist) <= invlist_swash_boundary) {
3892 SvREFCNT_dec(retval);
3893 if (!swash_invlist_unclaimed)
3894 SvREFCNT_inc_simple_void_NN(swash_invlist);
3895 retval = newRV_noinc(swash_invlist);
3899 CORE_SWASH_INIT_RETURN(retval);
3900 #undef CORE_SWASH_INIT_RETURN
3904 /* This API is wrong for special case conversions since we may need to
3905 * return several Unicode characters for a single Unicode character
3906 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
3907 * the lower-level routine, and it is similarly broken for returning
3908 * multiple values. --jhi
3909 * For those, you should use S__to_utf8_case() instead */
3910 /* Now SWASHGET is recasted into S_swatch_get in this file. */
3913 * Returns the value of property/mapping C<swash> for the first character
3914 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
3915 * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
3916 * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
3918 * A "swash" is a hash which contains initially the keys/values set up by
3919 * SWASHNEW. The purpose is to be able to completely represent a Unicode
3920 * property for all possible code points. Things are stored in a compact form
3921 * (see utf8_heavy.pl) so that calculation is required to find the actual
3922 * property value for a given code point. As code points are looked up, new
3923 * key/value pairs are added to the hash, so that the calculation doesn't have
3924 * to ever be re-done. Further, each calculation is done, not just for the
3925 * desired one, but for a whole block of code points adjacent to that one.
3926 * For binary properties on ASCII machines, the block is usually for 64 code
3927 * points, starting with a code point evenly divisible by 64. Thus if the
3928 * property value for code point 257 is requested, the code goes out and
3929 * calculates the property values for all 64 code points between 256 and 319,
3930 * and stores these as a single 64-bit long bit vector, called a "swatch",
3931 * under the key for code point 256. The key is the UTF-8 encoding for code
3932 * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
3933 * for a code point is 13 bytes, the key will be 12 bytes long. If the value
3934 * for code point 258 is then requested, this code realizes that it would be
3935 * stored under the key for 256, and would find that value and extract the
3936 * relevant bit, offset from 256.
3938 * Non-binary properties are stored in as many bits as necessary to represent
3939 * their values (32 currently, though the code is more general than that), not
3940 * as single bits, but the principle is the same: the value for each key is a
3941 * vector that encompasses the property values for all code points whose UTF-8
3942 * representations are represented by the key. That is, for all code points
3943 * whose UTF-8 representations are length N bytes, and the key is the first N-1
3947 Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
3949 HV *const hv = MUTABLE_HV(SvRV(swash));
3954 const U8 *tmps = NULL;
3958 PERL_ARGS_ASSERT_SWASH_FETCH;
3960 /* If it really isn't a hash, it isn't really swash; must be an inversion
3962 if (SvTYPE(hv) != SVt_PVHV) {
3963 return _invlist_contains_cp((SV*)hv,
3965 ? valid_utf8_to_uvchr(ptr, NULL)
3969 /* We store the values in a "swatch" which is a vec() value in a swash
3970 * hash. Code points 0-255 are a single vec() stored with key length
3971 * (klen) 0. All other code points have a UTF-8 representation
3972 * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
3973 * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
3974 * length for them is the length of the encoded char - 1. ptr[klen] is the
3975 * final byte in the sequence representing the character */
3976 if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
3981 else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
3984 off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
3987 klen = UTF8SKIP(ptr) - 1;
3989 /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
3990 * the vec is the final byte in the sequence. (In EBCDIC this is
3991 * converted to I8 to get consecutive values.) To help you visualize
3993 * Straight 1047 After final byte
3994 * UTF-8 UTF-EBCDIC I8 transform
3995 * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
3996 * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
3998 * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
3999 * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
4001 * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
4002 * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
4004 * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
4005 * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
4007 * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
4008 * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
4010 * (There are no discontinuities in the elided (...) entries.)
4011 * The UTF-8 key for these 33 code points is '\xD0' (which also is the
4012 * key for the next 31, up through U+043F, whose UTF-8 final byte is
4013 * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
4014 * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
4015 * index into the vec() swatch (after subtracting 0x80, which we
4016 * actually do with an '&').
4017 * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
4018 * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
4019 * dicontinuities which go away by transforming it into I8, and we
4020 * effectively subtract 0xA0 to get the index. */
4021 needents = (1 << UTF_ACCUMULATION_SHIFT);
4022 off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
4026 * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
4027 * suite. (That is, only 7-8% overall over just a hash cache. Still,
4028 * it's nothing to sniff at.) Pity we usually come through at least
4029 * two function calls to get here...
4031 * NB: this code assumes that swatches are never modified, once generated!
4034 if (hv == PL_last_swash_hv &&
4035 klen == PL_last_swash_klen &&
4036 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
4038 tmps = PL_last_swash_tmps;
4039 slen = PL_last_swash_slen;
4042 /* Try our second-level swatch cache, kept in a hash. */
4043 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
4045 /* If not cached, generate it via swatch_get */
4046 if (!svp || !SvPOK(*svp)
4047 || !(tmps = (const U8*)SvPV_const(*svp, slen)))
4050 const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
4051 swatch = swatch_get(swash,
4052 code_point & ~((UV)needents - 1),
4055 else { /* For the first 256 code points, the swatch has a key of
4057 swatch = swatch_get(swash, 0, needents);
4060 if (IN_PERL_COMPILETIME)
4061 CopHINTS_set(PL_curcop, PL_hints);
4063 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
4065 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
4066 || (slen << 3) < needents)
4067 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
4068 "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
4069 svp, tmps, (UV)slen, (UV)needents);
4072 PL_last_swash_hv = hv;
4073 assert(klen <= sizeof(PL_last_swash_key));
4074 PL_last_swash_klen = (U8)klen;
4075 /* FIXME change interpvar.h? */
4076 PL_last_swash_tmps = (U8 *) tmps;
4077 PL_last_swash_slen = slen;
4079 Copy(ptr, PL_last_swash_key, klen, U8);
4082 switch ((int)((slen << 3) / needents)) {
4084 return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
4086 return ((UV) tmps[off]);
4090 ((UV) tmps[off ] << 8) +
4091 ((UV) tmps[off + 1]);
4095 ((UV) tmps[off ] << 24) +
4096 ((UV) tmps[off + 1] << 16) +
4097 ((UV) tmps[off + 2] << 8) +
4098 ((UV) tmps[off + 3]);
4100 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
4101 "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
4102 NORETURN_FUNCTION_END;
4105 /* Read a single line of the main body of the swash input text. These are of
4108 * where each number is hex. The first two numbers form the minimum and
4109 * maximum of a range, and the third is the value associated with the range.
4110 * Not all swashes should have a third number
4112 * On input: l points to the beginning of the line to be examined; it points
4113 * to somewhere in the string of the whole input text, and is
4114 * terminated by a \n or the null string terminator.
4115 * lend points to the null terminator of that string
4116 * wants_value is non-zero if the swash expects a third number
4117 * typestr is the name of the swash's mapping, like 'ToLower'
4118 * On output: *min, *max, and *val are set to the values read from the line.
4119 * returns a pointer just beyond the line examined. If there was no
4120 * valid min number on the line, returns lend+1
4124 S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
4125 const bool wants_value, const U8* const typestr)
4127 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
4128 STRLEN numlen; /* Length of the number */
4129 I32 flags = PERL_SCAN_SILENT_ILLDIGIT
4130 | PERL_SCAN_DISALLOW_PREFIX
4131 | PERL_SCAN_SILENT_NON_PORTABLE;
4133 /* nl points to the next \n in the scan */
4134 U8* const nl = (U8*)memchr(l, '\n', lend - l);
4136 PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
4138 /* Get the first number on the line: the range minimum */
4140 *min = grok_hex((char *)l, &numlen, &flags, NULL);
4141 *max = *min; /* So can never return without setting max */
4142 if (numlen) /* If found a hex number, position past it */
4144 else if (nl) { /* Else, go handle next line, if any */
4145 return nl + 1; /* 1 is length of "\n" */
4147 else { /* Else, no next line */
4148 return lend + 1; /* to LIST's end at which \n is not found */
4151 /* The max range value follows, separated by a BLANK */
4154 flags = PERL_SCAN_SILENT_ILLDIGIT
4155 | PERL_SCAN_DISALLOW_PREFIX
4156 | PERL_SCAN_SILENT_NON_PORTABLE;
4158 *max = grok_hex((char *)l, &numlen, &flags, NULL);
4161 else /* If no value here, it is a single element range */
4164 /* Non-binary tables have a third entry: what the first element of the
4165 * range maps to. The map for those currently read here is in hex */
4169 flags = PERL_SCAN_SILENT_ILLDIGIT
4170 | PERL_SCAN_DISALLOW_PREFIX
4171 | PERL_SCAN_SILENT_NON_PORTABLE;
4173 *val = grok_hex((char *)l, &numlen, &flags, NULL);
4182 /* diag_listed_as: To%s: illegal mapping '%s' */
4183 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
4189 *val = 0; /* bits == 1, then any val should be ignored */
4191 else { /* Nothing following range min, should be single element with no
4196 /* diag_listed_as: To%s: illegal mapping '%s' */
4197 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
4201 *val = 0; /* bits == 1, then val should be ignored */
4204 /* Position to next line if any, or EOF */
4214 * Returns a swatch (a bit vector string) for a code point sequence
4215 * that starts from the value C<start> and comprises the number C<span>.
4216 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
4217 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
4220 S_swatch_get(pTHX_ SV* swash, UV start, UV span)
4223 U8 *l, *lend, *x, *xend, *s, *send;
4224 STRLEN lcur, xcur, scur;
4225 HV *const hv = MUTABLE_HV(SvRV(swash));
4226 SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
4228 SV** listsvp = NULL; /* The string containing the main body of the table */
4229 SV** extssvp = NULL;
4230 SV** invert_it_svp = NULL;
4233 STRLEN octets; /* if bits == 1, then octets == 0 */
4235 UV end = start + span;
4237 if (invlistsvp == NULL) {
4238 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
4239 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
4240 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
4241 extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
4242 listsvp = hv_fetchs(hv, "LIST", FALSE);
4243 invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
4245 bits = SvUV(*bitssvp);
4246 none = SvUV(*nonesvp);
4247 typestr = (U8*)SvPV_nolen(*typesvp);
4253 octets = bits >> 3; /* if bits == 1, then octets == 0 */
4255 PERL_ARGS_ASSERT_SWATCH_GET;
4257 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
4258 Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
4262 /* If overflowed, use the max possible */
4268 /* create and initialize $swatch */
4269 scur = octets ? (span * octets) : (span + 7) / 8;
4270 swatch = newSV(scur);
4272 s = (U8*)SvPVX(swatch);
4273 if (octets && none) {
4274 const U8* const e = s + scur;
4277 *s++ = (U8)(none & 0xff);
4278 else if (bits == 16) {
4279 *s++ = (U8)((none >> 8) & 0xff);
4280 *s++ = (U8)( none & 0xff);