X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/cbbf89328f29f925842d180c1410e324b297744d..80690093c542ba068b37608492e9976edfa9b903:/utf8.c diff --git a/utf8.c b/utf8.c index 3d9f00f..53085e6 100644 --- a/utf8.c +++ b/utf8.c @@ -1,7 +1,7 @@ /* utf8.c * - * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and - * others + * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. @@ -9,26 +9,40 @@ */ /* - * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever - * heard of that we don't want to see any closer; and that's the one place - * we're trying to get to! And that's just where we can't get, nohow.' + * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever + * heard of that we don't want to see any closer; and that's the one place + * we're trying to get to! And that's just where we can't get, nohow.' + * + * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"] * * 'Well do I understand your speech,' he answered in the same language; * 'yet few strangers do so. Why then do you not speak in the Common Tongue, - * as is the custom in the West, if you wish to be answered?' + * as is the custom in the West, if you wish to be answered?' + * --Gandalf, addressing Théoden's door wardens + * + * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] * * ...the travellers perceived that the floor was paved with stones of many * hues; branching runes and strange devices intertwined beneath their feet. + * + * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"] */ #include "EXTERN.h" #define PERL_IN_UTF8_C #include "perl.h" +#ifndef EBCDIC +/* Separate prototypes needed because in ASCII systems these are + * usually macros but they still are compiled as code, too. 
*/ +PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags); +PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv); +#endif + static const char unees[] = "Malformed UTF-8 character (unexpected end of string)"; -/* +/* =head1 Unicode Support This file contains various utility functions for manipulating UTF8-encoded @@ -37,7 +51,40 @@ Unicode characters as a variable number of bytes, in such a way that characters in the ASCII range are unmodified, and a zero byte never appears within non-zero characters. -=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags +=cut +*/ + +/* +=for apidoc is_ascii_string + +Returns true if the first C bytes of the given string are the same whether +or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That +is, if they are invariant. On ASCII-ish machines, only ASCII characters +fit this definition, hence the function's name. + +See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). + +=cut +*/ + +bool +Perl_is_ascii_string(const U8 *s, STRLEN len) +{ + const U8* const send = s + (len ? 
len : strlen((const char *)s)); + const U8* x = s; + + PERL_ARGS_ASSERT_IS_ASCII_STRING; + + for (; x < send; ++x) { + if (!UTF8_IS_INVARIANT(*x)) + break; + } + + return x == send; +} + +/* +=for apidoc uvuni_to_utf8_flags Adds the UTF-8 representation of the Unicode codepoint C to the end of the string C; C should be have at least C free @@ -64,6 +111,8 @@ is the recommended Unicode-aware way of saying U8 * Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { + PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS; + if (ckWARN(WARN_UTF8)) { if (UNICODE_IS_SURROGATE(uv) && !(flags & UNICODE_ALLOW_SURROGATE)) @@ -80,7 +129,7 @@ Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) !(flags & UNICODE_ALLOW_SUPER)) ) Perl_warner(aTHX_ packWARN(WARN_UTF8), - "Unicode character 0x%04"UVxf" is illegal", uv); + "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv); } if (UNI_IS_INVARIANT(uv)) { *d++ = (U8)UTF_TO_NATIVE(uv); @@ -186,12 +235,14 @@ five bytes or more. =cut */ STATIC STRLEN -S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) +S_is_utf8_char_slow(const U8 *s, const STRLEN len) { U8 u = *s; STRLEN slen; UV uv, ouv; + PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW; + if (UTF8_IS_INVARIANT(u)) return 1; @@ -213,7 +264,7 @@ S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) if (!UTF8_IS_CONTINUATION(*s)) return 0; uv = UTF8_ACCUMULATE(uv, *s); - if (uv < ouv) + if (uv < ouv) return 0; ouv = uv; s++; @@ -226,18 +277,20 @@ S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len) } /* -=for apidoc A|STRLEN|is_utf8_char|const U8 *s +=for apidoc is_utf8_char Tests if some arbitrary number of bytes begins in a valid UTF-8 -character. Note that an INVARIANT (i.e. ASCII) character is a valid -UTF-8 character. The actual number of bytes in the UTF-8 character -will be returned if it is valid, otherwise 0. +character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines) +character is a valid UTF-8 character. 
The actual number of bytes in the UTF-8 +character will be returned if it is valid, otherwise 0. =cut */ STRLEN -Perl_is_utf8_char(pTHX_ const U8 *s) +Perl_is_utf8_char(const U8 *s) { const STRLEN len = UTF8SKIP(s); + + PERL_ARGS_ASSERT_IS_UTF8_CHAR; #ifdef IS_UTF8_CHAR if (IS_UTF8_CHAR_FAST(len)) return IS_UTF8_CHAR(s, len) ? len : 0; @@ -245,28 +298,27 @@ Perl_is_utf8_char(pTHX_ const U8 *s) return is_utf8_char_slow(s, len); } + /* -=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len +=for apidoc is_utf8_string Returns true if first C bytes of the given string form a valid UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does not mean 'a string that contains code points above 0x7F encoded in UTF-8' because a valid ASCII string is a valid UTF-8 string. -See also is_utf8_string_loclen() and is_utf8_string_loc(). +See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc(). =cut */ bool -Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) +Perl_is_utf8_string(const U8 *s, STRLEN len) { + const U8* const send = s + (len ? 
len : strlen((const char *)s)); const U8* x = s; - const U8* send; - if (!len) - len = strlen((const char *)s); - send = s + len; + PERL_ARGS_ASSERT_IS_UTF8_STRING; while (x < send) { STRLEN c; @@ -281,9 +333,10 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) c = UTF8SKIP(x); if (IS_UTF8_CHAR_FAST(c)) { if (!IS_UTF8_CHAR(x, c)) - goto out; - } else if (!is_utf8_char_slow(x, c)) - goto out; + c = 0; + } + else + c = is_utf8_char_slow(x, c); #else c = is_utf8_char(x); #endif /* #ifdef IS_UTF8_CHAR */ @@ -303,7 +356,7 @@ Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len) /* Implemented as a macro in utf8.h -=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep +=for apidoc is_utf8_string_loc Like is_utf8_string() but stores the location of the failure (in the case of "utf8ness failure") or the location s+len (in the case of @@ -311,7 +364,7 @@ case of "utf8ness failure") or the location s+len (in the case of See also is_utf8_string_loclen() and is_utf8_string(). -=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el +=for apidoc is_utf8_string_loclen Like is_utf8_string() but stores the location of the failure (in the case of "utf8ness failure") or the location s+len (in the case of @@ -324,17 +377,14 @@ See also is_utf8_string_loc() and is_utf8_string(). */ bool -Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) +Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el) { + const U8* const send = s + (len ? len : strlen((const char *)s)); const U8* x = s; - const U8* send; STRLEN c; + STRLEN outlen = 0; - if (!len) - len = strlen((const char *)s); - send = s + len; - if (el) - *el = 0; + PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN; while (x < send) { /* Inline the easy bits of is_utf8_char() here for speed... 
*/ @@ -358,25 +408,24 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN goto out; } x += c; - if (el) - (*el)++; + outlen++; } out: + if (el) + *el = outlen; + if (ep) *ep = x; - if (x != send) - return FALSE; - - return TRUE; + return (x == send); } /* -=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags +=for apidoc utf8n_to_uvuni Bottom level UTF-8 decode routine. -Returns the unicode code point value of the first character in the string C +Returns the Unicode code point value of the first character in the string C which is assumed to be in UTF-8 encoding and no longer than C; C will be set to the length, in bytes, of that character. @@ -399,15 +448,19 @@ Most code should use utf8_to_uvchr() rather than call this directly. UV Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { - const U8 *s0 = s; + dVAR; + const U8 * const s0 = s; UV uv = *s, ouv = 0; STRLEN len = 1; const bool dowarn = ckWARN_d(WARN_UTF8); const UV startbyte = *s; STRLEN expectlen = 0; U32 warning = 0; + SV* sv; -/* This list is a superset of the UTF8_ALLOW_XXX. */ + PERL_ARGS_ASSERT_UTF8N_TO_UVUNI; + +/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */ #define UTF8_WARN_EMPTY 1 #define UTF8_WARN_CONTINUATION 2 @@ -528,57 +581,60 @@ malformed: if (flags & UTF8_CHECK_ONLY) { if (retlen) - *retlen = -1; + *retlen = ((STRLEN) -1); return 0; } if (dowarn) { - SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0)); + if (warning == UTF8_WARN_FFFF) { + sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP); + Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv); + } + else { + sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP); + + switch (warning) { + case 0: /* Intentionally empty. 
*/ break; + case UTF8_WARN_EMPTY: + sv_catpvs(sv, "(empty string)"); + break; + case UTF8_WARN_CONTINUATION: + Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); + break; + case UTF8_WARN_NON_CONTINUATION: + if (s == s0) + Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", + (UV)s[1], startbyte); + else { + const int len = (int)(s-s0); + Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", + (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); + } - switch (warning) { - case 0: /* Intentionally empty. */ break; - case UTF8_WARN_EMPTY: - Perl_sv_catpv(aTHX_ sv, "(empty string)"); - break; - case UTF8_WARN_CONTINUATION: - Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv); - break; - case UTF8_WARN_NON_CONTINUATION: - if (s == s0) - Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")", - (UV)s[1], startbyte); - else { - const int len = (int)(s-s0); - Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)", - (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen); + break; + case UTF8_WARN_FE_FF: + Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); + break; + case UTF8_WARN_SHORT: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", + (int)curlen, curlen == 1 ? 
"" : "s", (int)expectlen, startbyte); + expectlen = curlen; /* distance for caller to skip */ + break; + case UTF8_WARN_OVERFLOW: + Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", + ouv, *s, startbyte); + break; + case UTF8_WARN_SURROGATE: + Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); + break; + case UTF8_WARN_LONG: + Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", + (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); + break; + default: + sv_catpvs(sv, "(unknown reason)"); + break; } - - break; - case UTF8_WARN_FE_FF: - Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv); - break; - case UTF8_WARN_SHORT: - Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", - (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte); - expectlen = curlen; /* distance for caller to skip */ - break; - case UTF8_WARN_OVERFLOW: - Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")", - ouv, *s, startbyte); - break; - case UTF8_WARN_SURROGATE: - Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv); - break; - case UTF8_WARN_LONG: - Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")", - (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte); - break; - case UTF8_WARN_FFFF: - Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv); - break; - default: - Perl_sv_catpv(aTHX_ sv, "(unknown reason)"); - break; } if (warning) { @@ -599,7 +655,7 @@ malformed: } /* -=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen +=for apidoc utf8_to_uvchr Returns the native character value of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the @@ -614,18 +670,20 @@ returned and retlen is set, if possible, to -1. 
UV Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVCHR; + return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* -=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen +=for apidoc utf8_to_uvuni Returns the Unicode code point of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. -This function should only be used when returned UV is considered +This function should only be used when the returned UV is considered an index into the Unicode semantic tables (e.g. swashes). If C does not point to a well-formed UTF-8 character, zero is @@ -637,13 +695,15 @@ returned and retlen is set, if possible, to -1. UV Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen) { + PERL_ARGS_ASSERT_UTF8_TO_UVUNI; + /* Call the low level routine asking for checks */ return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen, ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY); } /* -=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e +=for apidoc utf8_length Return the length of the UTF-8 char encoded string C in characters. Stops at C (inclusive). If C s> or if the scan would end @@ -655,8 +715,11 @@ up past C, croaks. STRLEN Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) { + dVAR; STRLEN len = 0; + PERL_ARGS_ASSERT_UTF8_LENGTH; + /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. * the bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. 
*/ @@ -664,27 +727,28 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e) if (e < s) goto warn_and_return; while (s < e) { - const U8 t = UTF8SKIP(s); - if (e - s < t) { - warn_and_return: - if (ckWARN_d(WARN_UTF8)) { - if (PL_op) - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "%s in %s", unees, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); - } - return len; - } - s += t; + if (!UTF8_IS_INVARIANT(*s)) + s += UTF8SKIP(s); + else + s++; len++; } + if (e != s) { + len--; + warn_and_return: + if (PL_op) + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), + "%s in %s", unees, OP_DESC(PL_op)); + else + Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees); + } + return len; } /* -=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b +=for apidoc utf8_distance Returns the number of UTF-8 characters between the UTF-8 pointers C and C. @@ -698,46 +762,13 @@ same UTF-8 buffer. IV Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b) { - IV off = 0; - - /* Note: cannot use UTF8_IS_...() too eagerly here since e.g. - * the bitops (especially ~) can create illegal UTF-8. - * In other words: in Perl UTF-8 is not just for Unicode. */ + PERL_ARGS_ASSERT_UTF8_DISTANCE; - if (a < b) { - while (a < b) { - const U8 c = UTF8SKIP(a); - if (b - a < c) - goto warn_and_return; - a += c; - off--; - } - } - else { - while (b < a) { - const U8 c = UTF8SKIP(b); - - if (a - b < c) { - warn_and_return: - if (ckWARN_d(WARN_UTF8)) { - if (PL_op) - Perl_warner(aTHX_ packWARN(WARN_UTF8), - "%s in %s", unees, OP_DESC(PL_op)); - else - Perl_warner(aTHX_ packWARN(WARN_UTF8), unees); - } - return off; - } - b += c; - off++; - } - } - - return off; + return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a); } /* -=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off +=for apidoc utf8_hop Return the UTF-8 pointer C displaced by C characters, either forward or backward. @@ -752,6 +783,9 @@ on the first byte of character or just after the last byte of a character. 
U8 * Perl_utf8_hop(pTHX_ const U8 *s, I32 off) { + PERL_ARGS_ASSERT_UTF8_HOP; + + PERL_UNUSED_CONTEXT; /* Note: cannot use UTF8_IS_...() too eagerly here since e.g * the bitops (especially ~) can create illegal UTF-8. * In other words: in Perl UTF-8 is not just for Unicode. */ @@ -771,31 +805,35 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off) } /* -=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len +=for apidoc utf8_to_bytes -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C, this over-writes the original string, and updates len to contain the new length. Returns zero on failure, setting C to -1. +If you need a copy of the string, see C. + =cut */ U8 * Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) { - U8 *send; + U8 * const save = s; + U8 * const send = s + *len; U8 *d; - U8 *save = s; + + PERL_ARGS_ASSERT_UTF8_TO_BYTES; /* ensure valid UTF-8 and chars < 256 before updating string */ - for (send = s + *len; s < send; ) { + while (s < send) { U8 c = *s++; if (!UTF8_IS_INVARIANT(c) && (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send) || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) { - *len = -1; + *len = ((STRLEN) -1); return 0; } } @@ -812,14 +850,15 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len) } /* -=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8 +=for apidoc bytes_from_utf8 -Converts a string C of length C from UTF-8 into byte encoding. +Converts a string C of length C from UTF-8 into native byte encoding. Unlike C but like C, returns a pointer to the newly-created string, and updates C to contain the new length. Returns the original string if no conversion occurs, C is unchanged. Do nothing if C points to 0. Sets C to -0 if C is converted or contains all 7bit characters. +0 if C is converted or consisted entirely of characters that are invariant +in utf8 (i.e., US-ASCII on non-EBCDIC machines). 
=cut */ @@ -832,6 +871,9 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) const U8 *send; I32 count = 0; + PERL_ARGS_ASSERT_BYTES_FROM_UTF8; + + PERL_UNUSED_CONTEXT; if (!*is_utf8) return (U8 *)start; @@ -847,9 +889,9 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) } } - *is_utf8 = 0; + *is_utf8 = FALSE; - Newxz(d, (*len) - count + 1, U8); + Newx(d, (*len) - count + 1, U8); s = start; start = d; while (s < send) { U8 c = *s++; @@ -866,13 +908,16 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8) } /* -=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len +=for apidoc bytes_to_utf8 -Converts a string C of length C from ASCII into UTF-8 encoding. +Converts a string C of length C from the native encoding into UTF-8. Returns a pointer to the newly-created string, and sets C to reflect the new length. -If you want to convert to UTF-8 from other encodings than ASCII, +A NUL character will be written after the end of the string. + +If you want to convert to UTF-8 from encodings other than +the native (Latin1 or EBCDIC), see sv_recode_to_utf8(). =cut @@ -885,7 +930,10 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len) U8 *d; U8 *dst; - Newxz(d, (*len) * 2 + 1, U8); + PERL_ARGS_ASSERT_BYTES_TO_UTF8; + PERL_UNUSED_CONTEXT; + + Newx(d, (*len) * 2 + 1, U8); dst = d; while (s < send) { @@ -914,14 +962,10 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) U8* pend; U8* dstart = d; - if (bytelen == 1 && p[0] == 0) { /* Be understanding. 
*/ - d[0] = 0; - *newlen = 1; - return d; - } + PERL_ARGS_ASSERT_UTF16_TO_UTF8; if (bytelen & 1) - Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen); + Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen); pend = p + bytelen; @@ -929,7 +973,11 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */ p += 2; if (uv < 0x80) { +#ifdef EBCDIC + *d++ = UNI_TO_NATIVE(uv); +#else *d++ = (U8)uv; +#endif continue; } if (uv < 0x800) { @@ -937,12 +985,18 @@ Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) *d++ = (U8)(( uv & 0x3f) | 0x80); continue; } - if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */ - UV low = (p[0] << 8) + p[1]; - p += 2; - if (low < 0xdc00 || low >= 0xdfff) + if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */ + if (p >= pend) { Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); - uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } else { + UV low = (p[0] << 8) + p[1]; + p += 2; + if (low < 0xdc00 || low > 0xdfff) + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); + uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000; + } + } else if (uv >= 0xdc00 && uv <= 0xdfff) { + Perl_croak(aTHX_ "Malformed UTF-16 surrogate"); } if (uv < 0x10000) { *d++ = (U8)(( uv >> 12) | 0xe0); @@ -968,9 +1022,16 @@ U8* Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen) { U8* s = (U8*)p; - U8* send = s + bytelen; + U8* const send = s + bytelen; + + PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED; + + if (bytelen & 1) + Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf, + (UV)bytelen); + while (s < send) { - U8 tmp = s[0]; + const U8 tmp = s[0]; s[0] = s[1]; s[1] = tmp; s += 2; @@ -989,14 +1050,6 @@ Perl_is_uni_alnum(pTHX_ UV c) } bool -Perl_is_uni_alnumc(pTHX_ UV c) -{ - U8 tmpbuf[UTF8_MAXBYTES+1]; - uvchr_to_utf8(tmpbuf, c); - return is_utf8_alnumc(tmpbuf); -} - -bool Perl_is_uni_idfirst(pTHX_ UV c) { U8 tmpbuf[UTF8_MAXBYTES+1]; 
@@ -1095,6 +1148,8 @@ Perl_is_uni_xdigit(pTHX_ UV c) UV Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_UPPER; + uvchr_to_utf8(p, c); return to_utf8_upper(p, p, lenp); } @@ -1102,6 +1157,8 @@ Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_TITLE; + uvchr_to_utf8(p, c); return to_utf8_title(p, p, lenp); } @@ -1109,6 +1166,8 @@ Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_LOWER; + uvchr_to_utf8(p, c); return to_utf8_lower(p, p, lenp); } @@ -1116,6 +1175,8 @@ Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp) UV Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp) { + PERL_ARGS_ASSERT_TO_UNI_FOLD; + uvchr_to_utf8(p, c); return to_utf8_fold(p, p, lenp); } @@ -1129,12 +1190,6 @@ Perl_is_uni_alnum_lc(pTHX_ UV c) } bool -Perl_is_uni_alnumc_lc(pTHX_ UV c) -{ - return is_uni_alnumc(c); /* XXX no locale support yet */ -} - -bool Perl_is_uni_idfirst_lc(pTHX_ UV c) { return is_uni_idfirst(c); /* XXX no locale support yet */ @@ -1240,6 +1295,10 @@ static bool S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, const char *const swashname) { + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_COMMON; + if (!is_utf8_char(p)) return FALSE; if (!*swash) @@ -1250,109 +1309,293 @@ S_is_utf8_common(pTHX_ const U8 *const p, SV **swash, bool Perl_is_utf8_alnum(pTHX_ const U8 *p) { + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_ALNUM; + /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true * descendant of isalnum(3), in other words, it doesn't * contain the '_'. --jhi */ - return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord"); -} - -bool -Perl_is_utf8_alnumc(pTHX_ const U8 *p) -{ - return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC"); + return is_utf8_common(p, &PL_utf8_alnum, "IsWord"); } bool Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. 
*/ { + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDFIRST; + if (*p == '_') return TRUE; /* is_utf8_idstart would be more logical. */ - return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart"); + return is_utf8_common(p, &PL_utf8_idstart, "IdStart"); } bool Perl_is_utf8_idcont(pTHX_ const U8 *p) { + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_IDCONT; + if (*p == '_') return TRUE; - return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue"); + return is_utf8_common(p, &PL_utf8_idcont, "IdContinue"); } bool Perl_is_utf8_alpha(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_ALPHA; + + return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha"); } bool Perl_is_utf8_ascii(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_ASCII; + + return is_utf8_common(p, &PL_utf8_ascii, "IsAscii"); } bool Perl_is_utf8_space(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_SPACE; + + return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl"); +} + +bool +Perl_is_utf8_perl_space(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE; + + return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace"); +} + +bool +Perl_is_utf8_perl_word(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD; + + return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord"); } bool Perl_is_utf8_digit(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_DIGIT; + + return is_utf8_common(p, &PL_utf8_digit, "IsDigit"); +} + +bool +Perl_is_utf8_posix_digit(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT; + + return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit"); } bool Perl_is_utf8_upper(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase"); + dVAR; + + 
PERL_ARGS_ASSERT_IS_UTF8_UPPER; + + return is_utf8_common(p, &PL_utf8_upper, "IsUppercase"); } bool Perl_is_utf8_lower(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_LOWER; + + return is_utf8_common(p, &PL_utf8_lower, "IsLowercase"); } bool Perl_is_utf8_cntrl(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_CNTRL; + + return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl"); } bool Perl_is_utf8_graph(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_GRAPH; + + return is_utf8_common(p, &PL_utf8_graph, "IsGraph"); } bool Perl_is_utf8_print(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PRINT; + + return is_utf8_common(p, &PL_utf8_print, "IsPrint"); } bool Perl_is_utf8_punct(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_PUNCT; + + return is_utf8_common(p, &PL_utf8_punct, "IsPunct"); } bool Perl_is_utf8_xdigit(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_XDIGIT; + + return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit"); } bool Perl_is_utf8_mark(pTHX_ const U8 *p) { - return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM"); + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_MARK; + + return is_utf8_common(p, &PL_utf8_mark, "IsM"); +} + +bool +Perl_is_utf8_X_begin(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN; + + return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin"); +} + +bool +Perl_is_utf8_X_extend(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND; + + return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend"); +} + +bool +Perl_is_utf8_X_prepend(pTHX_ const U8 *p) +{ + dVAR; + + 
PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND; + + return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend"); +} + +bool +Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL; + + return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable"); +} + +bool +Perl_is_utf8_X_L(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_L; + + return is_utf8_common(p, &PL_utf8_X_L, "GCB=L"); +} + +bool +Perl_is_utf8_X_LV(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV; + + return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV"); +} + +bool +Perl_is_utf8_X_LVT(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LVT; + + return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT"); +} + +bool +Perl_is_utf8_X_T(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_T; + + return is_utf8_common(p, &PL_utf8_X_T, "GCB=T"); +} + +bool +Perl_is_utf8_X_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_V; + + return is_utf8_common(p, &PL_utf8_X_V, "GCB=V"); +} + +bool +Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p) +{ + dVAR; + + PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V; + + return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V"); } /* -=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special +=for apidoc to_utf8_case The "p" contains the pointer to the UTF-8 string encoding the character that is being converted. @@ -1380,27 +1623,45 @@ UV Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special) { + dVAR; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; STRLEN len = 0; - const UV uv0 = utf8_to_uvchr(p, NULL); /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings * are necessary in EBCDIC, they are redundant no-ops * in ASCII-ish platforms, and hopefully optimized away. 
*/ const UV uv1 = NATIVE_TO_UNI(uv0); + + PERL_ARGS_ASSERT_TO_UTF8_CASE; + uvuni_to_utf8(tmpbuf, uv1); if (!*swashp) /* load on-demand */ *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0); + /* This is the beginnings of a skeleton of code to read the info section + * that is in all the swashes in case we ever want to do that, so one can + * read things whose maps aren't code points, and whose default if missing + * is not to the code point itself. This was just to see if it actually + * worked. Details on what the possibilities are are in perluniprops.pod + HV * const hv = get_hv("utf8::SwashInfo", 0); + if (hv) { + SV **svp; + svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE); + const char *s; + + HV * const this_hash = SvRV(*svp); + svp = hv_fetch(this_hash, "type", strlen("type"), FALSE); + s = SvPV_const(*svp, len); + } + }*/ - /* The 0xDF is the only special casing Unicode code point below 0x100. */ - if (special && (uv1 == 0xDF || uv1 > 0xFF)) { + if (special) { /* It might be "special" (sometimes, but not always, * a multicharacter mapping) */ - HV *hv; + HV * const hv = get_hv(special, 0); SV **svp; - if ((hv = get_hv(special, FALSE)) && + if (hv && (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) && (*svp)) { const char *s; @@ -1420,7 +1681,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, STRLEN tlen = 0; while (t < tend) { - UV c = utf8_to_uvchr(t, &tlen); + const UV c = utf8_to_uvchr(t, &tlen); if (tlen > 0) { d = uvchr_to_utf8(d, UNI_TO_NATIVE(c)); t += tlen; @@ -1445,17 +1706,17 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } if (!len && *swashp) { - UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); - + const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE); + if (uv2) { /* It was "normal" (a single character mapping). */ - UV uv3 = UNI_TO_NATIVE(uv2); - + const UV uv3 = UNI_TO_NATIVE(uv2); len = uvchr_to_utf8(ustrp, uv3) - ustrp; } } - if (!len) /* Neither: just copy. 
*/ + if (!len) /* Neither: just copy. In other words, there was no mapping + defined, which means that the code point maps to itself */ len = uvchr_to_utf8(ustrp, uv0) - ustrp; if (lenp) @@ -1465,7 +1726,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, } /* -=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp +=for apidoc to_utf8_upper Convert the UTF-8 encoded character at p to its uppercase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note @@ -1480,12 +1741,16 @@ The first character of the uppercased version is returned UV Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { + dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_UPPER; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper"); } /* -=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp +=for apidoc to_utf8_title Convert the UTF-8 encoded character at p to its titlecase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note @@ -1500,12 +1765,16 @@ The first character of the titlecased version is returned UV Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { + dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_TITLE; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle"); } /* -=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp +=for apidoc to_utf8_lower Convert the UTF-8 encoded character at p to its lowercase version and store that in UTF-8 in ustrp and its length in bytes in lenp. 
Note @@ -1520,12 +1789,16 @@ The first character of the lowercased version is returned UV Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { + dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_LOWER; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower"); } /* -=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp +=for apidoc to_utf8_fold Convert the UTF-8 encoded character at p to its foldcase version and store that in UTF-8 in ustrp and its length in bytes in lenp. Note @@ -1541,6 +1814,10 @@ The first character of the foldcased version is returned UV Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp) { + dVAR; + + PERL_ARGS_ASSERT_TO_UTF8_FOLD; + return Perl_to_utf8_case(aTHX_ p, ustrp, lenp, &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold"); } @@ -1556,23 +1833,30 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits { dVAR; SV* retval; - SV* const tokenbufsv = sv_newmortal(); dSP; const size_t pkg_len = strlen(pkg); const size_t name_len = strlen(name); - HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE); + HV * const stash = gv_stashpvn(pkg, pkg_len, 0); SV* errsv_save; + PERL_ARGS_ASSERT_SWASH_INIT; + PUSHSTACKi(PERLSI_MAGIC); ENTER; - SAVEI32(PL_hints); - PL_hints = 0; + SAVEHINTS(); save_re_context(); if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */ ENTER; errsv_save = newSVsv(ERRSV); + /* It is assumed that callers of this routine are not passing in any + user derived data. */ + /* Need to do this after save_re_context() as it will set PL_tainted to + 1 while saving $1 etc (see the code after getrx: in Perl_magic_get). + Even line to create errsv_save can turn on PL_tainted. 
*/ + SAVEBOOL(PL_tainted); + PL_tainted = 0; Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len), - Nullsv); + NULL); if (!SvTRUE(ERRSV)) sv_setsv(ERRSV, errsv_save); SvREFCNT_dec(errsv_save); @@ -1581,18 +1865,12 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits SPAGAIN; PUSHMARK(SP); EXTEND(SP,5); - PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len))); - PUSHs(sv_2mortal(newSVpvn(name, name_len))); + mPUSHp(pkg, pkg_len); + mPUSHp(name, name_len); PUSHs(listsv); - PUSHs(sv_2mortal(newSViv(minbits))); - PUSHs(sv_2mortal(newSViv(none))); + mPUSHi(minbits); + mPUSHi(none); PUTBACK; - if (IN_PERL_COMPILETIME) { - /* XXX ought to be handled by lex_start */ - SAVEI32(PL_in_my); - PL_in_my = 0; - sv_setpv(tokenbufsv, PL_tokenbuf); - } errsv_save = newSVsv(ERRSV); if (call_method("SWASHNEW", G_SCALAR)) retval = newSVsv(*PL_stack_sp--); @@ -1604,16 +1882,12 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits LEAVE; POPSTACK; if (IN_PERL_COMPILETIME) { - STRLEN len; - const char* const pv = SvPV_const(tokenbufsv, len); - - Copy(pv, PL_tokenbuf, len+1, char); - PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK); + CopHINTS_set(PL_curcop, PL_hints); } if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) { if (SvPOK(retval)) Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"", - retval); + SVfARG(retval)); Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref"); } return retval; @@ -1637,7 +1911,7 @@ UV Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) { dVAR; - HV* const hv = (HV*)SvRV(swash); + HV *const hv = MUTABLE_HV(SvRV(swash)); U32 klen; U32 off; STRLEN slen; @@ -1646,7 +1920,9 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) U32 bit; SV *swatch; U8 tmputf8[2]; - UV c = NATIVE_TO_ASCII(*ptr); + const UV c = NATIVE_TO_ASCII(*ptr); + + PERL_ARGS_ASSERT_SWASH_FETCH; if (!do_utf8 && !UNI_IS_INVARIANT(c)) { tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c); @@ 
-1654,7 +1930,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) ptr = tmputf8; } /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ - * then the "swatch" is a vec() for al the chars which start + * then the "swatch" is a vec() for all the chars which start * with 0xAA..0xYY * So the key in the hash (klen) is length of encoded char -1 */ @@ -1662,7 +1938,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) off = ptr[klen]; if (klen == 0) { - /* If char in invariant then swatch is for all the invariant chars + /* If char is invariant then swatch is for all the invariant chars * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK */ needents = UTF_CONTINUATION_MARK; @@ -1709,7 +1985,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) needents); if (IN_PERL_COMPILETIME) - PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK); + CopHINTS_set(PL_curcop, PL_hints); svp = hv_store(hv, (const char *)ptr, klen, swatch, 0); @@ -1719,7 +1995,8 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) } PL_last_swash_hv = hv; - PL_last_swash_klen = klen; + assert(klen <= sizeof(PL_last_swash_key)); + PL_last_swash_klen = (U8)klen; /* FIXME change interpvar.h? 
*/ PL_last_swash_tmps = (U8 *) tmps; PL_last_swash_slen = slen; @@ -1742,7 +2019,7 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8) return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ; } Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width"); - return 0; + NORETURN_FUNCTION_END; } /* Note: @@ -1757,13 +2034,12 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) SV *swatch; U8 *l, *lend, *x, *xend, *s; STRLEN lcur, xcur, scur; - - HV* const hv = (HV*)SvRV(swash); - SV** const listsvp = hv_fetch(hv, "LIST", 4, FALSE); - SV** const typesvp = hv_fetch(hv, "TYPE", 4, FALSE); - SV** const bitssvp = hv_fetch(hv, "BITS", 4, FALSE); - SV** const nonesvp = hv_fetch(hv, "NONE", 4, FALSE); - SV** const extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE); + HV *const hv = MUTABLE_HV(SvRV(swash)); + SV** const listsvp = hv_fetchs(hv, "LIST", FALSE); + SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE); + SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE); + SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE); + SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE); const U8* const typestr = (U8*)SvPV_nolen(*typesvp); const int typeto = typestr[0] == 'T' && typestr[1] == 'o'; const STRLEN bits = SvUV(*bitssvp); @@ -1771,15 +2047,17 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) const UV none = SvUV(*nonesvp); const UV end = start + span; + PERL_ARGS_ASSERT_SWASH_GET; + if (bits != 1 && bits != 8 && bits != 16 && bits != 32) { Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf, (UV)bits); } /* create and initialize $swatch */ - swatch = newSVpvn("",0); scur = octets ? 
(span * octets) : (span + 7) / 8; - SvGROW(swatch, scur + 1); + swatch = newSV(scur); + SvPOK_on(swatch); s = (U8*)SvPVX(swatch); if (octets && none) { const U8* const e = s + scur; @@ -1809,7 +2087,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) l = (U8*)SvPV(*listsvp, lcur); lend = l + lcur; while (l < lend) { - UV min, max, val, key; + UV min, max, val; STRLEN numlen; I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX; @@ -1882,6 +2160,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) continue; if (octets) { + UV key; if (min < start) { if (!none || val < none) { val += start - min; @@ -1912,6 +2191,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } } else { /* bits == 1, then val should be ignored */ + UV key; if (min < start) min = start; for (key = min; key <= max; key++) { @@ -1937,7 +2217,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) U8 *s, *o, *nl; STRLEN slen, olen; - U8 opc = *x++; + const U8 opc = *x++; if (opc == '\n') continue; @@ -1965,8 +2245,8 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE); - otherhv = (HV*)SvRV(*othersvp); - otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE); + otherhv = MUTABLE_HV(SvRV(*othersvp)); + otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE); otherbits = (STRLEN)SvUV(*otherbitssvp); if (bits < otherbits) Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch"); @@ -2007,7 +2287,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) else { STRLEN otheroctets = otherbits >> 3; STRLEN offset = 0; - U8* send = s + slen; + U8* const send = s + slen; while (s < send) { UV otherval = 0; @@ -2026,7 +2306,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } if (opc == '+' && otherval) - ; /* replace with otherval */ + NOOP; /* replace with otherval */ else if (opc == '!' 
&& !otherval) otherval = 1; else if (opc == '-' && otherval) @@ -2058,7 +2338,7 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span) } /* -=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv +=for apidoc uvchr_to_utf8 Adds the UTF-8 representation of the Native codepoint C to the end of the string C; C should be have at least C free @@ -2080,20 +2360,24 @@ is the recommended wide native character-aware way of saying U8 * Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv) { + PERL_ARGS_ASSERT_UVCHR_TO_UTF8; + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0); } U8 * Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags) { + PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS; + return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags); } /* -=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 +=for apidoc utf8n_to_uvchr flags -Returns the native character value of the first character in the string +Returns the native character value of the first character in the string C which is assumed to be in UTF-8 encoding; C will be set to the length, in bytes, of that character. @@ -2106,15 +2390,18 @@ Allows length and flags to be passed to low level routine. 
a real function in case XS code wants it */ UV -Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, +Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags) { const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags); + + PERL_ARGS_ASSERT_UTF8N_TO_UVCHR; + return UNI_TO_NATIVE(uv); } /* -=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags +=for apidoc pv_uni_display Build to the scalar dsv a displayable version of the string spv, length len, the displayable version being at most pvlim bytes long @@ -2136,7 +2423,10 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f int truncated = 0; const char *s, *e; - sv_setpvn(dsv, "", 0); + PERL_ARGS_ASSERT_PV_UNI_DISPLAY; + + sv_setpvs(dsv, ""); + SvUTF8_off(dsv); for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) { UV u; /* This serves double duty as a flag and a character to print after @@ -2168,12 +2458,15 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f default: break; } if (ok) { - Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok); + const char string = ok; + sv_catpvs(dsv, "\\"); + sv_catpvn(dsv, &string, 1); } } /* isPRINT() is the locale-blind version. */ if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) { - Perl_sv_catpvf(aTHX_ dsv, "%c", c); + const char string = c; + sv_catpvn(dsv, &string, 1); ok = 1; } } @@ -2181,13 +2474,13 @@ Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV f Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u); } if (truncated) - sv_catpvn(dsv, "...", 3); - + sv_catpvs(dsv, "..."); + return SvPVX(dsv); } /* -=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags +=for apidoc sv_uni_display Build to the scalar dsv a displayable version of the scalar sv, the displayable version being at most pvlim bytes long @@ -2197,33 +2490,50 @@ The flags argument is as in pv_uni_display(). 
The pointer to the PV of the dsv is returned. -=cut */ +=cut +*/ char * Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags) { + PERL_ARGS_ASSERT_SV_UNI_DISPLAY; + return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv), SvCUR(ssv), pvlim, flags); } /* -=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2 - -Return true if the strings s1 and s2 differ case-insensitively, false -if not (if they are equal case-insensitively). If u1 is true, the -string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true, -the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2 -are false, the respective string is assumed to be in native 8-bit -encoding. - -If the pe1 and pe2 are non-NULL, the scanning pointers will be copied -in there (they will point at the beginning of the I character). -If the pointers behind pe1 or pe2 are non-NULL, they are the end -pointers beyond which scanning will not continue under any -circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and -s2+l2 will be used as goal end pointers that will also stop the scan, -and which qualify towards defining a successful match: all the scans -that define an explicit length must reach their goal pointers for -a match to succeed). +=for apidoc foldEQ_utf8 + +Returns true if the leading portions of the strings s1 and s2 (either or both +of which may be in UTF-8) are the same case-insensitively; false otherwise. +How far into the strings to compare is determined by other input parameters. + +If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode; +otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2 +with respect to s2. + +If the byte length l1 is non-zero, it says how far into s1 to check for fold +equality. In other words, s1+l1 will be used as a goal to reach. 
The +scan will not be considered to be a match unless the goal is reached, and +scanning won't continue past that goal. Correspondingly for l2 with respect to +s2. + +If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is +considered an end pointer beyond which scanning of s1 will not continue under +any circumstances. This means that if both l1 and pe1 are specified, and pe1 +is less than s1+l1, the match will never be successful because it can never +get as far as its goal (and in fact is asserted against). Correspondingly for +pe2 with respect to s2. + +At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be +non-zero), and if both do, both have to be +reached for a successful match. Also, if the fold of a character is multiple +characters, all of them must be matched (see tr21 reference below for +'folding'). + +Upon a successful match, if pe1 is non-NULL, +it will be set to point to the beginning of the I<next> character of s1 beyond +what was matched. Correspondingly for pe2 and s2. For case-insensitiveness, the "casefolding" of Unicode is used instead of upper/lowercasing both the characters, see @@ -2231,91 +2541,138 @@ http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
=cut */ I32 -Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) -{ - register const U8 *p1 = (const U8*)s1; - register const U8 *p2 = (const U8*)s2; - register const U8 *f1 = NULL; - register U8 *f2 = NULL; - register U8 *e1 = NULL; - register U8 *q1 = NULL; - register U8 *e2 = NULL; - register U8 *q2 = NULL; - STRLEN n1 = 0, n2 = 0; - U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; - U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; - U8 natbuf[1+1]; - STRLEN foldlen1, foldlen2; - bool match; - - if (pe1) - e1 = *(U8**)pe1; - if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1))) - f1 = (const U8*)s1 + l1; - if (pe2) - e2 = *(U8**)pe2; - if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2))) - f2 = (const U8*)s2 + l2; - - if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0)) - return 1; /* mismatch; possible infinite loop or false positive */ - - if (!u1 || !u2) - natbuf[1] = 0; /* Need to terminate the buffer. */ - - while ((e1 == 0 || p1 < e1) && - (f1 == 0 || p1 < f1) && - (e2 == 0 || p2 < e2) && - (f2 == 0 || p2 < f2)) { - if (n1 == 0) { - if (u1) - to_utf8_fold(p1, foldbuf1, &foldlen1); - else { - uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); - to_utf8_fold(natbuf, foldbuf1, &foldlen1); - } - q1 = foldbuf1; - n1 = foldlen1; - } - if (n2 == 0) { - if (u2) - to_utf8_fold(p2, foldbuf2, &foldlen2); - else { - uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); - to_utf8_fold(natbuf, foldbuf2, &foldlen2); - } - q2 = foldbuf2; - n2 = foldlen2; - } - while (n1 && n2) { - if ( UTF8SKIP(q1) != UTF8SKIP(q2) || - (UTF8SKIP(q1) == 1 && *q1 != *q2) || - memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) ) - return 1; /* mismatch */ - n1 -= UTF8SKIP(q1); - q1 += UTF8SKIP(q1); - n2 -= UTF8SKIP(q2); - q2 += UTF8SKIP(q2); - } - if (n1 == 0) - p1 += u1 ? UTF8SKIP(p1) : 1; - if (n2 == 0) - p2 += u2 ? 
UTF8SKIP(p2) : 1; - - } - - /* A match is defined by all the scans that specified - * an explicit length reaching their final goals. */ - match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2); - - if (match) { - if (pe1) - *pe1 = (char*)p1; - if (pe2) - *pe2 = (char*)p2; - } - - return match ? 0 : 1; /* 0 match, 1 mismatch */ +Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2) +{ + dVAR; + register const U8 *p1 = (const U8*)s1; /* Point to current char */ + register const U8 *p2 = (const U8*)s2; + register const U8 *g1 = NULL; /* goal for s1 */ + register const U8 *g2 = NULL; + register const U8 *e1 = NULL; /* Don't scan s1 past this */ + register U8 *f1 = NULL; /* Point to current folded */ + register const U8 *e2 = NULL; + register U8 *f2 = NULL; + STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */ + U8 foldbuf1[UTF8_MAXBYTES_CASE+1]; + U8 foldbuf2[UTF8_MAXBYTES_CASE+1]; + U8 natbuf[2]; /* Holds native 8-bit char converted to utf8; + these always fit in 2 bytes */ + + PERL_ARGS_ASSERT_FOLDEQ_UTF8; + + if (pe1) { + e1 = *(U8**)pe1; + } + + if (l1) { + g1 = (const U8*)s1 + l1; + } + + if (pe2) { + e2 = *(U8**)pe2; + } + + if (l2) { + g2 = (const U8*)s2 + l2; + } + + /* Must have at least one goal */ + assert(g1 || g2); + + if (g1) { + + /* Will never match if goal is out-of-bounds */ + assert(! e1 || e1 >= g1); + + /* Here, there isn't an end pointer, or it is beyond the goal. We + * only go as far as the goal */ + e1 = g1; + } + else { + assert(e1); /* Must have an end for looking at s1 */ + } + + /* Same for goal for s2 */ + if (g2) { + assert(! 
e2 || e2 >= g2); + e2 = g2; + } + else { + assert(e2); + } + + /* Look through both strings, a character at a time */ + while (p1 < e1 && p2 < e2) { + + /* If at the beginning of a new character in s1, get its fold to use + * and the length of the fold */ + if (n1 == 0) { + if (u1) { + to_utf8_fold(p1, foldbuf1, &n1); + } + else { /* Not utf8, convert to it first and then get fold */ + uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1))); + to_utf8_fold(natbuf, foldbuf1, &n1); + } + f1 = foldbuf1; + } + + if (n2 == 0) { /* Same for s2 */ + if (u2) { + to_utf8_fold(p2, foldbuf2, &n2); + } + else { + uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2))); + to_utf8_fold(natbuf, foldbuf2, &n2); + } + f2 = foldbuf2; + } + + /* While there is more to look for in both folds, see if they + * continue to match */ + while (n1 && n2) { + U8 fold_length = UTF8SKIP(f1); + if (fold_length != UTF8SKIP(f2) + || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE + function call for single + character */ + || memNE((char*)f1, (char*)f2, fold_length)) + { + return 0; /* mismatch */ + } + + /* Here, they matched, advance past them */ + n1 -= fold_length; + f1 += fold_length; + n2 -= fold_length; + f2 += fold_length; + } + + /* When reach the end of any fold, advance the input past it */ + if (n1 == 0) { + p1 += u1 ? UTF8SKIP(p1) : 1; + } + if (n2 == 0) { + p2 += u2 ? UTF8SKIP(p2) : 1; + } + } /* End of loop through both strings */ + + /* A match is defined by each scan that specified an explicit length + * reaching its final goal, and the other not having matched a partial + * character (which can happen when the fold of a character is more than one + * character). */ + if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) { + return 0; + } + + /* Successful match. Set output pointers */ + if (pe1) { + *pe1 = (char*)p1; + } + if (pe2) { + *pe2 = (char*)p2; + } + return 1; } /*