=head1 Unicode Support
This file contains various utility functions for manipulating UTF8-encoded
-strings. For the uninitiated, this is a method of representing arbitrary
+strings. For the uninitiated, this is a method of representing arbitrary
Unicode characters as a variable number of bytes, in such a way that
characters in the ASCII range are unmodified, and a zero byte never appears
within non-zero characters.
/* The first problematic code point is the first surrogate */
if (uv >= UNICODE_SURROGATE_FIRST
- && ckWARN4_d(WARN_UTF8, WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
+ && ckWARN3_d(WARN_SURROGATE, WARN_NON_UNICODE, WARN_NONCHAR))
{
if (UNICODE_IS_SURROGATE(uv)) {
if (flags & UNICODE_WARN_SURROGATE) {
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
d = uvchr_to_utf8(d, uv);
Adds the UTF-8 representation of the native code point C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
-bytes available. The return value is the pointer to the byte after the
-end of the new character. In other words,
+bytes available. The return value is the pointer to the byte after the
+end of the new character. In other words,
d = uvchr_to_utf8_flags(d, uv, flags);
byte containing 0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to
be treated as malformations, while allowing smaller above-Unicode code points.
(Of course UTF8_DISALLOW_SUPER will treat all above-Unicode code points,
-including these, as malformations.) Similarly, UTF8_WARN_FE_FF acts just like
+including these, as malformations.)
+Similarly, UTF8_WARN_FE_FF acts just like
the other WARN flags, but applies just to these code points.
All other code points corresponding to Unicode characters, including private
}
}
-#ifndef EBCDIC /* EBCDIC allows FE, FF, can't overflow */
- if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
- && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
- {
- /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
- * generation of the sv, since no warnings are raised under CHECK */
- if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY)) == UTF8_WARN_FE_FF
- && ckWARN_d(WARN_UTF8))
- {
- /* This message is deliberately not of the same syntax as the other
- * messages for malformations, for backwards compatibility in the
- * unlikely event that code is relying on its precise earlier text
- */
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s Code point beginning with byte 0x%02X is not Unicode, and not portable", malformed_text, *s0));
- pack_warn = packWARN(WARN_UTF8);
- }
- if (flags & UTF8_DISALLOW_FE_FF) {
- goto malformed;
- }
- }
+#ifndef EBCDIC /* EBCDIC can't overflow */
if (UNLIKELY(overflowed)) {
-
- /* If the first byte is FF, it will overflow a 32-bit word. If the
- * first byte is FE, it will overflow a signed 32-bit word. The
- * above preserves backward compatibility, since its message was used
- * in earlier versions of this code in preference to overflow */
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (overflow at byte 0x%02x, after start byte 0x%02x)", malformed_text, overflow_byte, *s0));
goto malformed;
}
goto malformed;
}
- /* Here, the input is considered to be well-formed , but could be a
+ /* Here, the input is considered to be well-formed, but it still could be a
* problematic code point that is not allowed by the input parameters. */
if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
&& (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_WARN_ILLEGAL_INTERCHANGE)))
{
if (UNICODE_IS_SURROGATE(uv)) {
+
+ /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+ * generation of the sv, since no warnings are raised under CHECK */
if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
- && ckWARN2_d(WARN_UTF8, WARN_SURROGATE))
+ && ckWARN_d(WARN_SURROGATE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_SURROGATE);
+ pack_warn = packWARN(WARN_SURROGATE);
}
if (flags & UTF8_DISALLOW_SURROGATE) {
goto disallowed;
}
else if ((uv > PERL_UNICODE_MAX)) {
if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
- && ckWARN2_d(WARN_UTF8, WARN_NON_UNICODE))
+ && ckWARN_d(WARN_NON_UNICODE))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_NON_UNICODE);
+ pack_warn = packWARN(WARN_NON_UNICODE);
}
+#ifndef EBCDIC /* EBCDIC always allows FE, FF */
+
+ /* The first byte being 0xFE or 0xFF is a subset of the SUPER code
+ * points. We test for these after the regular SUPER ones, and
+ * before possibly bailing out, so that the more dire warning
+ * overrides the regular one, if applicable */
+ if ((*s0 & 0xFE) == 0xFE /* matches both FE, FF */
+ && (flags & (UTF8_WARN_FE_FF|UTF8_DISALLOW_FE_FF)))
+ {
+ if ((flags & (UTF8_WARN_FE_FF|UTF8_CHECK_ONLY))
+ == UTF8_WARN_FE_FF
+ && ckWARN_d(WARN_UTF8))
+ {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%"UVXf" is not Unicode, and not portable", uv));
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ if (flags & UTF8_DISALLOW_FE_FF) {
+ goto disallowed;
+ }
+ }
+#endif
if (flags & UTF8_DISALLOW_SUPER) {
goto disallowed;
}
}
else if (UNICODE_IS_NONCHAR(uv)) {
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
- && ckWARN2_d(WARN_UTF8, WARN_NONCHAR))
+ && ckWARN_d(WARN_NONCHAR))
{
sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
- pack_warn = packWARN2(WARN_UTF8, WARN_NONCHAR);
+ pack_warn = packWARN(WARN_NONCHAR);
}
if (flags & UTF8_DISALLOW_NONCHAR) {
goto disallowed;
=for apidoc bytes_cmp_utf8
Compares the sequence of characters (stored as octets) in C<b>, C<blen> with the
-sequence of characters (stored as UTF-8) in C<u>, C<ulen>. Returns 0 if they are
+sequence of characters (stored as UTF-8)
+in C<u>, C<ulen>. Returns 0 if they are
equal, -1 or -2 if the first string is less than the second string, +1 or +2
if the first string is greater than the second string.
-1 or +1 is returned if the shorter string was identical to the start of the
-longer string. -2 or +2 is returned if the was a difference between characters
+longer string. -2 or +2 is returned if
+there was a difference between characters
within the strings.
=cut
Unlike L</utf8_to_bytes> but like L</bytes_to_utf8>, returns a pointer to
the newly-created string, and updates C<len> to contain the new
length. Returns the original string if no conversion occurs, C<len>
-is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
+is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
0 if C<s> is converted or consisted entirely of characters that are invariant
in utf8 (i.e., US-ASCII on non-EBCDIC machines).
}
}
- if (hv &&
- (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE)) &&
- (*svp)) {
+ if (hv
+ && (svp = hv_fetch(hv, (const char*)p, UNISKIP(uv1), FALSE))
+ && (*svp))
+ {
const char *s;
s = SvPV_const(*svp, len);
loc = (char *) l;
lend = l + lcur;
- /* Scan the input to count the number of lines to preallocate array size
- * based on worst possible case, which is each line in the input creates 2
- * elements in the inversion list: 1) the beginning of a range in the list;
- * 2) the beginning of a range not in the list. */
- while ((loc = (strchr(loc, '\n'))) != NULL) {
- elements += 2;
- loc++;
+ if (*l == 'V') { /* Inversion list format */
+ char *after_strtol = (char *) lend;
+ UV element0;
+ UV* other_elements_ptr;
+
+ /* The first number is a count of the rest */
+ l++;
+ elements = Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+
+ /* Get the 0th element, which is needed to setup the inversion list */
+ element0 = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
+ elements--;
+
+ /* Then just populate the rest of the input */
+ while (elements-- > 0) {
+ if (l > lend) {
+ Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
+ }
+ *other_elements_ptr++ = (UV) Strtoul((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ }
}
+ else {
- /* If the ending is somehow corrupt and isn't a new line, add another
- * element for the final range that isn't in the inversion list */
- if (! (*lend == '\n'
- || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
- {
- elements++;
- }
+ /* Scan the input to count the number of lines to preallocate array
+ * size based on worst possible case, which is each line in the input
+ * creates 2 elements in the inversion list: 1) the beginning of a
+ * range in the list; 2) the beginning of a range not in the list. */
+ while ((loc = (strchr(loc, '\n'))) != NULL) {
+ elements += 2;
+ loc++;
+ }
- invlist = _new_invlist(elements);
+ /* If the ending is somehow corrupt and isn't a new line, add another
+ * element for the final range that isn't in the inversion list */
+ if (! (*lend == '\n'
+ || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
+ {
+ elements++;
+ }
- /* Now go through the input again, adding each range to the list */
- while (l < lend) {
- UV start, end;
- UV val; /* Not used by this function */
+ invlist = _new_invlist(elements);
- l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
- cBOOL(octets), typestr);
+ /* Now go through the input again, adding each range to the list */
+ while (l < lend) {
+ UV start, end;
+ UV val; /* Not used by this function */
- if (l > lend) {
- break;
- }
+ l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+ cBOOL(octets), typestr);
- invlist = _add_range_to_invlist(invlist, start, end);
+ if (l > lend) {
+ break;
+ }
+
+ invlist = _add_range_to_invlist(invlist, start, end);
+ }
}
/* Invert if the data says it should be */
if (invert_it_svp && SvUV(*invert_it_svp)) {
- _invlist_invert_prop(invlist);
+ _invlist_invert(invlist);
}
/* This code is copied from swatch_get()