=head1 Unicode Support
This file contains various utility functions for manipulating UTF8-encoded
-strings. For the uninitiated, this is a method of representing arbitrary
+strings. For the uninitiated, this is a method of representing arbitrary
Unicode characters as a variable number of bytes, in such a way that
characters in the ASCII range are unmodified, and a zero byte never appears
within non-zero characters.
/* The first problematic code point is the first surrogate */
if (uv >= UNICODE_SURROGATE_FIRST
- && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
+ && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
{
if (UNICODE_IS_SURROGATE(uv)) {
if (flags & UNICODE_WARN_SURROGATE) {
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
d = uvchr_to_utf8(d, uv);
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
d = uvchr_to_utf8_flags(d, uv, flags);
UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
If both flags are set, the function will both warn and return NULL.
-The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags
affect how the function handles a Unicode non-character. And likewise, the
-UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, affect the handling of
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags affect the handling of
code points that are
above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
even less portable) can be warned and/or disallowed even if other above-Unicode
byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
be treated as malformations, while allowing smaller above-Unicode code points.
(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
-including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
+including these, as malformations.)
+Similarly, UTF8_WARN_FE_FF acts just like
the other WARN flags, but applies just to these code points.
All other code points corresponding to Unicode characters, including private
}
}
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
- if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
- && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
- {
- /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
- * generation of the sv, since no warnings are raised under CHECK */
- if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
- && ckWARN_d(WARN_UTF8))
- {
- /* This message is deliberately not of the same syntax as the other
- * messages for malformations, for backwards compatibility in the
- * unlikely event that code is relying on its precise earlier text
- */
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
- pack_warn = packWARN(WARN_UTF8);
- }
- if (flags & UTF8_DISALLOW_FE_FF) {
- goto malformed;
- }
- }
+#ifndef EBCDIC /* EBCDIC can't overflow */
if (UNLIKELY(overflowed)) {
-
- /* If the first byte is FF, it will overflow a 32-bit word. If the
- * first byte is FE, it will overflow a signed 32-bit word. The
- * above preserves backward compatibility, since its message was used
- * in earlier versions of this code in preference to overflow */
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
goto malformed;
}
goto malformed;
}
- /* Here, the input is considered to be well-formed , but could be a
+ /* Here, the input is considered to be well-formed, but it still could be a
* problematic code point that is not allowed by the input parameters. */
if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_WARN_ILLEGAL_INTERCHANGE)))
{
if (UNICODE_IS_SURROGATE(uv)) {
+
+ /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+ * generation of the sv, since no warnings are raised under CHECK */
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
- && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
+ && ckWARN_d(WARN_SURROGATE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
+ pack_warn = packWARN(WARN_SURROGATE);
}
if (flags & UTF8_DISALLOW_SURROGATE) {
goto disallowed;
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
- && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
+ && ckWARN_d(WARN_NON_UNICODE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
+ pack_warn = packWARN(WARN_NON_UNICODE);
}
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+ /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+ * points. We test for these after the regular SUPER ones, and
+ * before possibly bailing out, so that the more dire warning
+ * overrides the regular one, if applicable */
+ if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
+ && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+ {
+ if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+ == UTF8_WARN_FE_FF
+ && ckWARN_d(WARN_UTF8))
+ {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ if (flags & UTF8_DISALLOW_FE_FF) {
+ goto disallowed;
+ }
+ }
+#endif
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
}
}
else if (UNICODE_IS_NONCHAR(uv)) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
- && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
+ && ckWARN_d(WARN_NONCHAR))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
+ pack_warn = packWARN(WARN_NONCHAR);
}
if (flags & UTF8_DISALLOW_NONCHAR) {
goto disallowed;
=for apidoc bytes_cmp_utf8
Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
-sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
+sequence of characters (stored as UTF-8)
+in C<u>, C<ulen>. Returns 0 if they are
equal, -1 or -2 if the first string is less than the second string, +1 or +2
if the first string is greater than the second string.
-1 or +1 is returned if the shorter string was identical to the start of the
-longer string. -2 or +2 is returned if the was a difference between characters
+longer string. -2 or +2 is returned if
+there was a difference between characters
within the strings.
=cut
Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
the newly-created string, and updates C<len> to contain the new
length. Returns the original string if no conversion occurs, C<len>
-is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
+is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
0 if C<s> is converted or consisted entirely of characters that are invariant
in utf8 (i.e., US-ASCII on non-EBCDIC machines).
* LENP will be set to the length in bytes of the string of changed characters
*
* The functions return the ordinal of the first character in the string of OUTP */
-#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
-#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
-#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
+#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "")
+#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "")
+#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "")
/* This additionally has the input parameter SPECIALS, which if non-zero will
* cause this to use the SPECIALS hash for folding (meaning get full case
* folding); otherwise, when zero, this implies a simple case fold */
-#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
+#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "" : NULL)
UV
Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
/*
=for apidoc to_utf8_case
-The C<p> contains the pointer to the UTF-8 string encoding
+C<p> contains the pointer to the UTF-8 string encoding
the character that is being converted. This routine assumes that the character
at C<p> is well-formed.
-The C<ustrp> is a pointer to the character buffer to put the
-conversion result to. The C<lenp> is a pointer to the length
+C<ustrp> is a pointer to the character buffer to put the
+conversion result to. C<lenp> is a pointer to the length
of the result.
-The C<swashp> is a pointer to the swash to use.
+C<swashp> is a pointer to the swash to use.
Both the special and normal mappings are stored in F<lib/unicore/To/Foo.pl>,
-and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. The C<special> (usually,
+and loaded by SWASHNEW, using F<lib/utf8_heavy.pl>. C<special> (usually,
but not always, a multicharacter mapping), is tried first.
-The C<special> is a string like "utf8::ToSpecLower", which means the
-hash %utf8::ToSpecLower. The access to the hash is through
-Perl_to_utf8_case().
+C<special> is a string, normally C<NULL> or C<"">. C<NULL> means to not use
+any special mappings; C<""> means to use the special mappings. Values other
+than these two are treated as the name of the hash containing the special
+mappings, like C<"utf8::ToSpecLower">.
-The C<normal> is a string like "ToLower" which means the swash
+C<normal> is a string like "ToLower" which means the swash
%utf8::ToLower.
=cut */
if (special) {
/* It might be "special" (sometimes, but not always,
* a multicharacter mapping) */
- HV * const hv = get_hv(special, 0);
+ HV *hv = NULL;
SV **svp;
- if (hv &&
- (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
- (*svp)) {
+ /* If passed in the specials name, use that; otherwise use any
+ * given in the swash */
+ if (*special != '\0') {
+ hv = get_hv(special, 0);
+ }
+ else {
+ svp = hv_fetchs(MUTABLE_HV(SvRV(*swashp)), "SPECIALS", 0);
+ if (svp) {
+ hv = MUTABLE_HV(SvRV(*svp));
+ }
+ }
+
+ if (hv
+ && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
+ && (*svp))
+ {
const char *s;
s = SvPV_const(*svp, len);
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+ U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ result = toUPPER_LC(c);
}
else {
return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
*lenp = 1;
}
else {
- *ustrp = UTF8_EIGHT_BIT_HI(result);
- *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+ *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+ *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
*lenp = 2;
}
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- result = toUPPER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+ U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ result = toUPPER_LC(c);
}
else {
return _to_upper_title_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
*lenp = 1;
}
else {
- *ustrp = UTF8_EIGHT_BIT_HI(result);
- *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+ *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+ *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
*lenp = 2;
}
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags) {
- result = toLOWER_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+ U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ result = toLOWER_LC(c);
}
else {
return to_lower_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
*lenp = 1;
}
else {
- *ustrp = UTF8_EIGHT_BIT_HI(result);
- *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+ *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+ *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
*lenp = 2;
}
}
else if UTF8_IS_DOWNGRADEABLE_START(*p) {
if (flags & FOLD_FLAGS_LOCALE) {
- result = toFOLD_LC(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)));
+ U8 c = TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1));
+ result = toFOLD_LC(c);
}
else {
return _to_fold_latin1(TWO_BYTE_UTF8_TO_NATIVE(*p, *(p+1)),
*lenp = 1;
}
else {
- *ustrp = UTF8_EIGHT_BIT_HI(result);
- *(ustrp + 1) = UTF8_EIGHT_BIT_LO(result);
+ *ustrp = UTF8_EIGHT_BIT_HI((U8) result);
+ *(ustrp + 1) = UTF8_EIGHT_BIT_LO((U8) result);
*lenp = 2;
}
*max = *min;
/* Non-binary tables have a third entry: what the first element of the
- * range maps to */
+ * range maps to. The map for those currently read here is in hex */
if (wants_value) {
if (isBLANK(*l)) {
++l;
-
- /* The ToLc, etc table mappings are not in hex, and must be
- * corrected by adding the code point to them */
- if (typeto) {
- char *after_strtol = (char *) lend;
- *val = Strtol((char *)l, &after_strtol, 10);
- l = (U8 *) after_strtol;
- }
- else { /* Other tables are in hex, and are the correct result
- without tweaking */
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *val = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- *val = 0;
- }
+ flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_DISALLOW_PREFIX
+ | PERL_SCAN_SILENT_NON_PORTABLE;
+ numlen = lend - l;
+ *val = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else
+ *val = 0;
}
else {
*val = 0;
loc = (char *) l;
lend = l + lcur;
- /* Scan the input to count the number of lines to preallocate array size
- * based on worst possible case, which is each line in the input creates 2
- * elements in the inversion list: 1) the beginning of a range in the list;
- * 2) the beginning of a range not in the list. */
- while ((loc = (strchr(loc, '\n'))) != NULL) {
- elements += 2;
- loc++;
+ if (*l == 'V') { /* Inversion list format */
+ char *after_strtol = (char *) lend;
+ UV element0;
+ UV* other_elements_ptr;
+
+ /* The first number is a count of the rest */
+ l++;
+ elements = Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+
+ /* Get the 0th element, which is needed to setup the inversion list */
+ element0 = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
+ elements--;
+
+ /* Then just populate the rest of the input */
+ while (elements-- > 0) {
+ if (l > lend) {
+ Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
+ }
+ *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ }
}
+ else {
- /* If the ending is somehow corrupt and isn't a new line, add another
- * element for the final range that isn't in the inversion list */
- if (! (*lend == '\n'
- || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
- {
- elements++;
- }
+ /* Scan the input to count the number of lines to preallocate array
+ * size based on worst possible case, which is each line in the input
+ * creates 2 elements in the inversion list: 1) the beginning of a
+ * range in the list; 2) the beginning of a range not in the list. */
+ while ((loc = (strchr(loc, '\n'))) != NULL) {
+ elements += 2;
+ loc++;
+ }
- invlist = _new_invlist(elements);
+ /* If the ending is somehow corrupt and isn't a new line, add another
+ * element for the final range that isn't in the inversion list */
+ if (! (*lend == '\n'
+ || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
+ {
+ elements++;
+ }
- /* Now go through the input again, adding each range to the list */
- while (l < lend) {
- UV start, end;
- UV val; /* Not used by this function */
+ invlist = _new_invlist(elements);
- l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
- cBOOL(octets), typestr);
+ /* Now go through the input again, adding each range to the list */
+ while (l < lend) {
+ UV start, end;
+ UV val; /* Not used by this function */
- if (l > lend) {
- break;
- }
+ l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+ cBOOL(octets), typestr);
+
+ if (l > lend) {
+ break;
+ }
- invlist = _add_range_to_invlist(invlist, start, end);
+ invlist = _add_range_to_invlist(invlist, start, end);
+ }
}
/* Invert if the data says it should be */
if (invert_it_svp && SvUV(*invert_it_svp)) {
- _invlist_invert_prop(invlist);
+ _invlist_invert(invlist);
}
/* This code is copied from swatch_get()