static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/*
+/*
=head1 Unicode Support
This file contains various utility functions for manipulating UTF8-encoded
is, if they are invariant. On ASCII-ish machines, only ASCII characters
fit this definition, hence the function's name.
+If C<len> is 0, it will be calculated using C<strlen(s)>.
+
See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
=cut
/*
=for apidoc uvuni_to_utf8_flags
-Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
+Adds the UTF-8 representation of the code point C<uv> to the end
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
d = uvuni_to_utf8_flags(d, uv, 0);
-is the recommended Unicode-aware way of saying
+This is the recommended Unicode-aware way of saying
*(d++) = uv;
+This function will convert to UTF-8 (and not warn) even code points that aren't
+legal Unicode or are problematic, unless C<flags> contains one or more of the
+following flags.
+If C<uv> is a Unicode surrogate code point and UNICODE_WARN_SURROGATE is set,
+the function will raise a warning, provided UTF8 warnings are enabled. If instead
+UNICODE_DISALLOW_SURROGATE is set, the function will fail and return NULL.
+If both flags are set, the function will both warn and return NULL.
+
+The UNICODE_WARN_NONCHAR and UNICODE_DISALLOW_NONCHAR flags correspondingly
+affect how the function handles a Unicode non-character. And, likewise for the
+UNICODE_WARN_SUPER and UNICODE_DISALLOW_SUPER flags, and code points that are
+above the Unicode maximum of 0x10FFFF. Code points above 0x7FFF_FFFF (which are
+even less portable) can be warned and/or disallowed even if other above-Unicode
+code points are accepted by the UNICODE_WARN_FE_FF and UNICODE_DISALLOW_FE_FF
+flags.
+
+And finally, the flag UNICODE_WARN_ILLEGAL_INTERCHANGE selects all four of the
+above WARN flags; and UNICODE_DISALLOW_ILLEGAL_INTERCHANGE selects all four
+DISALLOW flags.
+
+
=cut
*/
{
PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
- if (ckWARN(WARN_UTF8)) {
- if (UNICODE_IS_SURROGATE(uv) &&
- !(flags & UNICODE_ALLOW_SURROGATE))
- Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
- else if (
- ((uv >= 0xFDD0 && uv <= 0xFDEF &&
- !(flags & UNICODE_ALLOW_FDD0))
- ||
- ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
- !(flags & UNICODE_ALLOW_FFFF))) &&
- /* UNICODE_ALLOW_SUPER includes
- * FFFEs and FFFFs beyond 0x10FFFF. */
- ((uv <= PERL_UNICODE_MAX) ||
- !(flags & UNICODE_ALLOW_SUPER))
- )
- Perl_warner(aTHX_ packWARN(WARN_UTF8),
- "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
+ if (ckWARN_d(WARN_UTF8)) {
+ if (UNICODE_IS_SURROGATE(uv)) {
+ if (flags & UNICODE_WARN_SURROGATE) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "UTF-16 surrogate U+%04"UVXf, uv);
+ }
+ if (flags & UNICODE_DISALLOW_SURROGATE) {
+ return NULL;
+ }
+ }
+ else if (UNICODE_IS_SUPER(uv)) {
+ if (flags & UNICODE_WARN_SUPER
+ || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_WARN_FE_FF)))
+ {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+ }
+ if (flags & UNICODE_DISALLOW_SUPER
+ || (UNICODE_IS_FE_FF(uv) && (flags & UNICODE_DISALLOW_FE_FF)))
+ {
+ return NULL;
+ }
+ }
+ else if (UNICODE_IS_NONCHAR(uv)) {
+ if (flags & UNICODE_WARN_NONCHAR) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Unicode non-character U+%04"UVXf" is illegal for open interchange",
+ uv);
+ }
+ if (flags & UNICODE_DISALLOW_NONCHAR) {
+ return NULL;
+ }
+ }
}
if (UNI_IS_INVARIANT(uv)) {
*d++ = (U8)UTF_TO_NATIVE(uv);
if (!UTF8_IS_CONTINUATION(*s))
return 0;
uv = UTF8_ACCUMULATE(uv, *s);
- if (uv < ouv)
+ if (uv < ouv)
return 0;
ouv = uv;
s++;
=for apidoc is_utf8_string
Returns true if first C<len> bytes of the given string form a valid
-UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
-not mean 'a string that contains code points above 0x7F encoded in UTF-8'
-because a valid ASCII string is a valid UTF-8 string.
+UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
+using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
+string that contains code points above 0x7F encoded in UTF-8' because a
+valid ASCII string is a valid UTF-8 string.
See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
=for apidoc utf8n_to_uvuni
Bottom level UTF-8 decode routine.
-Returns the Unicode code point value of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
-C<retlen> will be set to the length, in bytes, of that character.
-
-If C<s> does not point to a well-formed UTF-8 character, the behaviour
-is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
-it is assumed that the caller will raise a warning, and this function
-will silently just set C<retlen> to C<-1> and return zero. If the
-C<flags> does not contain UTF8_CHECK_ONLY, warnings about
-malformations will be given, C<retlen> will be set to the expected
-length of the UTF-8 character in bytes, and zero will be returned.
-
-The C<flags> can also contain various flags to allow deviations from
-the strict UTF-8 encoding (see F<utf8.h>).
+Returns the code point value of the first character in the string C<s>
+which is assumed to be in UTF-8 (or UTF-EBCDIC) encoding and no longer than
+C<curlen> bytes; C<retlen> will be set to the length, in bytes, of that
+character.
+
+The value of C<flags> determines the behavior when C<s> does not point to a
+well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
+C<retlen> is set to the expected length of the UTF-8 character in bytes, zero
+is returned, and if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.
+
+Various ALLOW flags can be set in C<flags> to allow (and not warn on)
+individual types of malformations, such as the sequence being overlong (that
+is, when there is a shorter sequence that can express the same code point;
+overlong sequences are expressly forbidden in the UTF-8 standard due to
+potential security issues). Another malformation example is the first byte of
+a character not being a legal first byte. See F<utf8.h> for the list of such
+flags. Of course, the value returned by this function under such conditions is
+not reliable.
+
+The UTF8_CHECK_ONLY flag overrides the behavior when a non-allowed (by other
+flags) malformation is found. If this flag is set, the routine assumes that
+the caller will raise a warning, and this function will silently just set
+C<retlen> to C<-1> and return zero.
+
+Certain code points are considered problematic. These are Unicode surrogates,
+Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
+By default these are considered regular code points, but certain situations
+warrant special handling for them. If C<flags> contains
+UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
+malformations and handled as such. The flags UTF8_DISALLOW_SURROGATE,
+UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
+maximum) can be set to disallow these categories individually.
+
+The flags UTF8_WARN_ILLEGAL_INTERCHANGE, UTF8_WARN_SURROGATE,
+UTF8_WARN_NONCHAR, and UTF8_WARN_SUPER will cause warning messages to be raised
+for their respective categories, but otherwise the code points are considered
+valid (not malformations). To get a category to both be treated as a
+malformation and raise a warning, specify both the WARN and DISALLOW flags.
+(But note that warnings are not raised if lexically disabled nor if
+UTF8_CHECK_ONLY is also specified.)
+
+Very large code points (above 0x7FFF_FFFF) are considered more problematic than
+the others that are above the Unicode legal maximum. There are several
+reasons, one of which is that the original UTF-8 specification never went above
+this number (the current 0x10FFFF limit was imposed later). The UTF-8 encoding
+on ASCII platforms for these large code points begins with a byte containing
+0xFE or 0xFF. The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
+malformations, while allowing smaller above-Unicode code points. (Of course
+UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
+as malformations.) Similarly, UTF8_WARN_FE_FF acts just like the other WARN
+flags, but applies just to these code points.
+
+All other code points corresponding to Unicode characters, including private
+use and those yet to be assigned, are never considered malformed and never
+warn.
Most code should use utf8_to_uvchr() rather than call this directly.
const U8 * const s0 = s;
UV uv = *s, ouv = 0;
STRLEN len = 1;
- const bool dowarn = ckWARN_d(WARN_UTF8);
+ bool dowarn = ckWARN_d(WARN_UTF8);
const UV startbyte = *s;
STRLEN expectlen = 0;
U32 warning = 0;
- SV* sv;
+ SV* sv = NULL;
PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-/* This list is a superset of the UTF8_ALLOW_XXX. BUT it isn't, eg SUPER missing XXX */
+/* This list is a superset of the UTF8_ALLOW_XXX. */
#define UTF8_WARN_EMPTY 1
#define UTF8_WARN_CONTINUATION 2
#define UTF8_WARN_NON_CONTINUATION 3
-#define UTF8_WARN_FE_FF 4
-#define UTF8_WARN_SHORT 5
-#define UTF8_WARN_OVERFLOW 6
-#define UTF8_WARN_SURROGATE 7
-#define UTF8_WARN_LONG 8
-#define UTF8_WARN_FFFF 9 /* Also FFFE. */
+#define UTF8_WARN_SHORT 4
+#define UTF8_WARN_OVERFLOW 5
+#define UTF8_WARN_LONG 6
if (curlen == 0 &&
!(flags & UTF8_ALLOW_EMPTY)) {
#ifdef EBCDIC
uv = NATIVE_TO_UTF(uv);
#else
- if ((uv == 0xfe || uv == 0xff) &&
- !(flags & UTF8_ALLOW_FE_FF)) {
- warning = UTF8_WARN_FE_FF;
- goto malformed;
+ if (uv == 0xfe || uv == 0xff) {
+ if (flags & (UTF8_WARN_SUPER|UTF8_WARN_FE_FF)) {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point beginning with byte 0x%02"UVXf" is not Unicode, and not portable", uv));
+ flags &= ~UTF8_WARN_SUPER; /* Only warn once on this problem */
+ }
+ if (flags & (UTF8_DISALLOW_SUPER|UTF8_DISALLOW_FE_FF)) {
+ goto malformed;
+ }
}
#endif
len--;
s++;
- ouv = uv;
+ ouv = uv; /* ouv is the value from the previous iteration */
while (len--) {
if (!UTF8_IS_CONTINUATION(*s) &&
}
else
uv = UTF8_ACCUMULATE(uv, *s);
- if (!(uv > ouv)) {
+ if (!(uv > ouv)) { /* If the value didn't grow from the previous
+ iteration, something is horribly wrong */
/* These cannot be allowed. */
if (uv == ouv) {
if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
ouv = uv;
}
- if (UNICODE_IS_SURROGATE(uv) &&
- !(flags & UTF8_ALLOW_SURROGATE)) {
- warning = UTF8_WARN_SURROGATE;
- goto malformed;
- } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
- !(flags & UTF8_ALLOW_LONG)) {
+ if ((expectlen > (STRLEN)UNISKIP(uv)) && !(flags & UTF8_ALLOW_LONG)) {
warning = UTF8_WARN_LONG;
goto malformed;
- } else if (UNICODE_IS_ILLEGAL(uv) &&
- !(flags & UTF8_ALLOW_FFFF)) {
- warning = UTF8_WARN_FFFF;
- goto malformed;
+ } else if (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE|UTF8_WARN_ILLEGAL_INTERCHANGE)) {
+ if (UNICODE_IS_SURROGATE(uv)) {
+ if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE) {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
+ }
+ if (flags & UTF8_DISALLOW_SURROGATE) {
+ goto disallowed;
+ }
+ }
+ else if (UNICODE_IS_NONCHAR(uv)) {
+ if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR ) {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+ }
+ if (flags & UTF8_DISALLOW_NONCHAR) {
+ goto disallowed;
+ }
+ }
+ else if ((uv > PERL_UNICODE_MAX)) {
+ if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER) {
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv));
+ }
+ if (flags & UTF8_DISALLOW_SUPER) {
+ goto disallowed;
+ }
+ }
+
+ /* Here, this is not considered a malformed character, so drop through
+ * to return it */
}
return uv;
+disallowed: /* Is disallowed, but otherwise not malformed. 'sv' will have been
+ set if there is to be a warning. */
+ if (!sv) {
+ dowarn = 0;
+ }
+
malformed:
if (flags & UTF8_CHECK_ONLY) {
}
if (dowarn) {
- if (warning == UTF8_WARN_FFFF) {
- sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
- Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
- }
- else {
+ if (! sv) {
sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);
+ }
- switch (warning) {
- case 0: /* Intentionally empty. */ break;
- case UTF8_WARN_EMPTY:
- sv_catpvs(sv, "(empty string)");
- break;
- case UTF8_WARN_CONTINUATION:
- Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
- break;
- case UTF8_WARN_NON_CONTINUATION:
- if (s == s0)
- Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
- (UV)s[1], startbyte);
- else {
- const int len = (int)(s-s0);
- Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
- (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
- }
+ switch (warning) {
+ case 0: /* Intentionally empty. */ break;
+ case UTF8_WARN_EMPTY:
+ sv_catpvs(sv, "(empty string)");
+ break;
+ case UTF8_WARN_CONTINUATION:
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
+ break;
+ case UTF8_WARN_NON_CONTINUATION:
+ if (s == s0)
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
+ (UV)s[1], startbyte);
+ else {
+ const int len = (int)(s-s0);
+ Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
+ (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
+ }
- break;
- case UTF8_WARN_FE_FF:
- Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
- break;
- case UTF8_WARN_SHORT:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
- (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
- expectlen = curlen; /* distance for caller to skip */
- break;
- case UTF8_WARN_OVERFLOW:
- Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
- ouv, *s, startbyte);
- break;
- case UTF8_WARN_SURROGATE:
- Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
- break;
- case UTF8_WARN_LONG:
- Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
- (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
- break;
- default:
- sv_catpvs(sv, "(unknown reason)");
- break;
- }
+ break;
+ case UTF8_WARN_SHORT:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
+ expectlen = curlen; /* distance for caller to skip */
+ break;
+ case UTF8_WARN_OVERFLOW:
+ Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
+ ouv, *s, startbyte);
+ break;
+ case UTF8_WARN_LONG:
+ Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
+ (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
+ break;
+ default:
+ sv_catpvs(sv, "(unknown reason)");
+ break;
}
- if (warning) {
+ if (sv) {
const char * const s = SvPVX_const(sv);
if (PL_op)
/*
=for apidoc utf8_to_uvchr
-Returns the native character value of the first character in the string C<s>
+Returns the native code point of the first character in the string C<s>
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
=cut
*/
+
UV
Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
/* Call the low level routine asking for checks */
return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
- ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
}
/*
}
/*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if there was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+    /* Compare the native bytes b[0..blen) against the UTF-8 encoded string
+     * u[0..ulen), one character at a time.
+     *
+     * Returns  0 if both strings are exhausted together without a mismatch,
+     *         -1 / +1 if one string ran out first (b shorter / b longer),
+     *         -2 / +2 if a character differed (b's smaller / b's larger).
+     * -2 is also returned, after a WARN_UTF8 warning where applicable, when
+     * u holds a character that is not a well-formed one- or two-byte
+     * sequence accepted by UTF8_IS_DOWNGRADEABLE_START. */
+
+    const U8 *const b_stop = b + blen;     /* one past the last byte of b */
+    const U8 *const u_stop = u + ulen;     /* one past the last byte of u */
+
+    PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+    PERL_UNUSED_CONTEXT;
+
+    for (; b < b_stop && u < u_stop; ++b) {
+        U8 ch = *u++;
+
+        if (!UTF8_IS_INVARIANT(ch)) {
+            U8 trail;
+
+            /* A start byte this macro rejects cannot be matched against a
+             * single byte of b, so the strings differ */
+            if (!UTF8_IS_DOWNGRADEABLE_START(ch)) {
+                return -2;
+            }
+
+            /* The start byte was the last byte of u: truncated character */
+            if (u >= u_stop) {
+                if (PL_op)
+                    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                     "%s in %s", unees, OP_DESC(PL_op));
+                else
+                    Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+                return -2; /* Really want to return undef :-) */
+            }
+
+            trail = *u++;
+            if (!UTF8_IS_CONTINUATION(trail)) {
+                Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                                 "Malformed UTF-8 character "
+                                 "(unexpected non-continuation byte 0x%02x"
+                                 ", immediately after start byte 0x%02x)"
+                                 /* Dear diag.t, it's in the pod. */
+                                 "%s%s", trail, ch,
+                                 PL_op ? " in " : "",
+                                 PL_op ? OP_DESC(PL_op) : "");
+                return -2;
+            }
+
+            /* Fold the two-byte sequence back into a single native byte */
+            ch = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(ch, trail));
+        }
+
+        if (*b != ch) {
+            return *b < ch ? -2 : +2;
+        }
+    }
+
+    /* No differing character was found; decide on the leftover lengths */
+    if (b == b_stop && u == u_stop)
+        return 0;
+
+    return b < b_stop ? +1 : -1;
+}
+
+/*
=for apidoc utf8_to_bytes
Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c)) {
/* Then it is two-byte encoded */
- c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
- c = ASCII_TO_NATIVE(c);
+ c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
}
*d++ = c;
}
/*
=for apidoc bytes_to_utf8
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
A NUL character will be written after the end of the string.
PERL_ARGS_ASSERT_TO_UTF8_CASE;
+ /* Note that swash_fetch() doesn't output warnings for these because it
+ * assumes we will */
+ if (uv1 >= UNICODE_SURROGATE_FIRST && ckWARN_d(WARN_UTF8)) {
+ if (uv1 <= UNICODE_SURROGATE_LAST) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Operation \"%s\" returns its argument for UTF-16 surrogate U+%04"UVXf"", desc, uv1);
+ }
+ else if (UNICODE_IS_SUPER(uv1)) {
+ const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Operation \"%s\" returns its argument for non-Unicode code point 0x%04"UVXf"", desc, uv1);
+ }
+
+ /* Note that non-characters are perfectly legal, so no warning should
+ * be given */
+ }
+
uvuni_to_utf8(tmpbuf, uv1);
if (!*swashp) /* load on-demand */
const size_t name_len = strlen(name);
HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
SV* errsv_save;
+ GV *method;
PERL_ARGS_ASSERT_SWASH_INIT;
ENTER;
SAVEHINTS();
save_re_context();
- if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
+ method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
+ if (!method) { /* demand load utf8 */
ENTER;
errsv_save = newSVsv(ERRSV);
/* It is assumed that callers of this routine are not passing in any
mPUSHi(none);
PUTBACK;
errsv_save = newSVsv(ERRSV);
- if (call_method("SWASHNEW", G_SCALAR))
+ /* If we already have a pointer to the method, no need to use call_method()
+ to repeat the lookup. */
+ if (method ? call_sv(MUTABLE_SV(method), G_SCALAR)
+ : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
retval = newSVsv(*PL_stack_sp--);
else
retval = &PL_sv_undef;
* return several Unicode characters for a single Unicode character
* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
* the lower-level routine, and it is similarly broken for returning
- * multiple values. --jhi */
+ * multiple values. --jhi
+ * For those, you should use to_utf8_case() instead */
/* Now SWASHGET is recasted into S_swash_get in this file. */
/* Note:
/* If char is encoded then swatch is for the prefix */
needents = (1 << UTF_ACCUMULATION_SHIFT);
off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
+ if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_UTF8)) {
+ const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+
+ /* This outputs warnings for binary properties only, assuming that
+ * to_utf8_case() will output any. Also, surrogates aren't checked
+ * for, as that would warn on things like /\p{Gc=Cs}/ */
+ SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+ if (SvUV(*bitssvp) == 1) {
+ Perl_warner(aTHX_ packWARN(WARN_UTF8),
+ "Code point 0x%04"UVXf" is not Unicode, no properties match it; all inverse properties do", code_point);
+ }
+ }
}
/*
NORETURN_FUNCTION_END;
}
+/* Read a single line of the main body of the swash input text.  These are of
+ * the form:
+ * 0053 0056 0073
+ * where each number is hex.  The first two numbers form the minimum and
+ * maximum of a range, and the third is the value associated with the range.
+ * Not all swashes should have a third number
+ *
+ * On input: l points to the beginning of the line to be examined; it points
+ *	      to somewhere in the string of the whole input text, and is
+ *	      terminated by a \n or the null string terminator.
+ *	     lend points to the null terminator of that string
+ *	     wants_value is non-zero if the swash expects a third number
+ *	     typestr is the name of the swash's mapping, like 'ToLower'
+ * On output: *min, *max, and *val are set to the values read from the line.
+ *	      returns a pointer just beyond the line examined.  Lines that do
+ *	      not begin with a hex number are skipped; if no line with a valid
+ *	      min number remains, returns lend+1 and *max and *val are not
+ *	      set.  Callers must therefore stop when the result is > lend.
+ *
+ * Croaks with "illegal mapping" when a value is wanted, the line has none,
+ * and typestr begins with "To".
+ */
+
+STATIC U8*
+S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
+                       const bool wants_value, const U8* const typestr)
+{
+    const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
+    STRLEN numlen;          /* Length of the number */
+    I32 flags;              /* Passed by address to grok_hex(), so it is
+                               re-initialized before every call */
+    U8* nl;                 /* The next \n in the scan, or NULL if none */
+
+    /* Get the first number on the line: the range minimum.  A line without
+     * one is skipped here rather than handed back to the caller, because
+     * the caller would otherwise use *max and *val without their having
+     * been set. */
+    for (;;) {
+        nl = (U8*)memchr(l, '\n', lend - l);
+
+        flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+        numlen = lend - l;
+        *min = grok_hex((char *)l, &numlen, &flags, NULL);
+        if (numlen) {       /* Found a hex number: position past it */
+            l += numlen;
+            break;
+        }
+
+        /* No number here.  If there is no further line, tell the caller
+         * that the input is exhausted */
+        if (! nl || nl + 1 >= lend) {
+            return lend + 1;
+        }
+
+        l = nl + 1;         /* 1 is length of "\n" */
+    }
+
+    /* The max range value follows, separated by a BLANK */
+    if (isBLANK(*l)) {
+        ++l;
+        flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
+        numlen = lend - l;
+        *max = grok_hex((char *)l, &numlen, &flags, NULL);
+        if (numlen)
+            l += numlen;
+        else    /* If no value here, it is a single element range */
+            *max = *min;
+
+        /* Non-binary tables have a third entry: what the first element of the
+         * range maps to */
+        if (wants_value) {
+            if (isBLANK(*l)) {
+                ++l;
+                flags = PERL_SCAN_SILENT_ILLDIGIT |
+                        PERL_SCAN_DISALLOW_PREFIX;
+                numlen = lend - l;
+                *val = grok_hex((char *)l, &numlen, &flags, NULL);
+                if (numlen)
+                    l += numlen;
+                else
+                    *val = 0;
+            }
+            else {
+                *val = 0;
+                if (typeto) {
+                    Perl_croak(aTHX_ "%s: illegal mapping '%s'",
+                                     typestr, l);
+                }
+            }
+        }
+        else
+            *val = 0; /* bits == 1, then any val should be ignored */
+    }
+    else { /* Nothing following range min, should be single element with no
+              mapping expected */
+        *max = *min;
+        *val = 0;   /* if bits == 1, val should be ignored anyway */
+        if (wants_value && typeto) {
+            Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
+        }
+    }
+
+    /* Position to next line if any, or EOF */
+    return nl ? nl + 1 : lend;
+}
+
+
/* Note:
* Returns a swatch (a bit vector string) for a code point sequence
* that starts from the value C<start> and comprises the number C<span>.
U8 *l, *lend, *x, *xend, *s;
STRLEN lcur, xcur, scur;
HV *const hv = MUTABLE_HV(SvRV(swash));
+
+ /* The string containing the main body of the table */
SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
- const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
const STRLEN bits = SvUV(*bitssvp);
const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
const UV none = SvUV(*nonesvp);
lend = l + lcur;
while (l < lend) {
UV min, max, val;
- STRLEN numlen;
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
-
- U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
- numlen = lend - l;
- min = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else if (nl) {
- l = nl + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- l = lend; /* to LIST's end at which \n is not found */
+ l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+ cBOOL(octets), typestr);
+ if (l > lend) {
break;
}
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
- numlen = lend - l;
- max = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- max = min;
-
- if (octets) {
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT |
- PERL_SCAN_DISALLOW_PREFIX;
- numlen = lend - l;
- val = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- val = 0;
- }
- else {
- val = 0;
- if (typeto) {
- Perl_croak(aTHX_ "%s: illegal mapping '%s'",
- typestr, l);
- }
- }
- }
- else
- val = 0; /* bits == 1, then val should be ignored */
- }
- else {
- max = min;
- if (octets) {
- val = 0;
- if (typeto) {
- Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
- }
- }
- else
- val = 0; /* bits == 1, then val should be ignored */
- }
-
- if (nl)
- l = nl + 1;
- else
- l = lend;
-
+ /* If looking for something beyond this range, go try the next one */
if (max < start)
continue;
return swatch;
}
+HV*
+Perl__swash_inversion_hash(pTHX_ SV* const swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regexec.c
+    *
+    * Returns a hash which is the inversion and closure of a swash mapping.
+    * For example, consider the input lines:
+    * 004B		006B
+    * 004C		006C
+    * 212A		006B
+    *
+    * The returned hash would have two keys, the utf8 for 006B and the utf8 for
+    * 006C.  The value for each key is an array.  For 006C, the array would
+    * have two elements, the utf8 for itself, and for 004C.  For 006B, there
+    * would be three elements in its array, the utf8 for 006B, 004B and 212A.
+    *
+    * Essentially, for any code point, it gives all the code points that map to
+    * it, or the list of 'froms' for that point.
+    *
+    * Currently it only looks at the main body of the swash, and ignores any
+    * additions or deletions from other swashes
+    *
+    * Croaks if the swash's BITS is not 8, 16 or 32, or if a hash or array
+    * operation unexpectedly fails. */
+
+    U8 *l, *lend;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+    SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
+    /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+    const UV     none  = SvUV(*nonesvp);
+
+    HV* ret;
+
+    PERL_ARGS_ASSERT__SWASH_INVERSION_HASH;
+
+    /* Must have at least 8 bits to get the mappings */
+    if (bits != 8 && bits != 16 && bits != 32) {
+        Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"UVuf,
+                                                   (UV)bits);
+    }
+
+    /* Create the result only after the input has been validated, so that
+     * the croak above does not abandon a freshly allocated hash */
+    ret = newHV();
+
+    /* read $swash->{LIST} */
+    l = (U8*)SvPV(*listsvp, lcur);
+    lend = l + lcur;
+
+    /* Go through each input line */
+    while (l < lend) {
+        UV min, max, val;
+        UV inverse;
+        l = S_swash_scan_list_line(aTHX_ l, lend, &min, &max, &val,
+                                         cBOOL(octets), typestr);
+        if (l > lend) {
+            break;
+        }
+
+        /* Each element in the range is to be inverted */
+        for (inverse = min; inverse <= max; inverse++) {
+            AV* list;
+            SV* element;
+            SV** listp;
+            IV i;
+            bool found_key = FALSE;
+
+            /* The key is the inverse mapping */
+            char key[UTF8_MAXBYTES+1];
+            char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
+            STRLEN key_len = key_end - key;
+
+            /* Get the list for the map */
+            if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
+                list = (AV*) *listp;
+            }
+            else { /* No entry yet for it: create one */
+                list = newAV();
+                if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
+                    Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
+                }
+            }
+
+            /* Look for the self-mapping anywhere on the list.  av_len()
+             * returns the highest index, not the element count, so the
+             * comparison must be inclusive to examine the final element */
+            for (i = 0; i <= av_len(list); i++) {
+                SV** entryp = av_fetch(list, i, FALSE);
+                SV* entry;
+                if (entryp == NULL) {
+                    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
+                }
+                entry = *entryp;
+                if (SvUV(entry) == val) {
+                    found_key = TRUE;
+                    break;
+                }
+            }
+
+            /* Make sure there is a mapping to itself on the list */
+            if (! found_key) {
+                element = newSVuv(val);
+                av_push(list, element);
+            }
+
+
+            /* Simply add the value to the list */
+            element = newSVuv(inverse);
+            av_push(list, element);
+
+            /* swash_get() increments the value of val for each element in the
+             * range.  That makes more compact tables possible.  You can
+             * express the capitalization, for example, of all consecutive
+             * letters with a single line: 0061\t007A\t0041 This maps 0061 to
+             * 0041, 0062 to 0042, etc.  I (khw) have never understood 'none',
+             * and it's not documented, and perhaps not even currently used,
+             * but I copied the semantics from swash_get(), just in case */
+            if (!none || val < none) {
+                ++val;
+            }
+        }
+    }
+
+    return ret;
+}
+
+
+HV*
+Perl__swash_to_invlist(pTHX_ SV* const swash)
+{
+
+   /* Subject to change or removal.  For use only in one place in regcomp.c
+    *
+    * Builds an inversion list from the main body (LIST) of the swash: each
+    * input line contributes the range [start, end] read from it.  Returns
+    * the list created by _new_invlist().  A LIST that is not a string yields
+    * a list with no ranges appended. */
+
+    U8 *l, *lend;
+    char *loc;
+    STRLEN lcur;
+    HV *const hv = MUTABLE_HV(SvRV(swash));
+    UV elements = 0;    /* Number of elements in the inversion list */
+    U8 empty[] = "";
+
+    /* The string containing the main body of the table */
+    SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
+    SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
+    SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+    const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
+    const STRLEN bits  = SvUV(*bitssvp);
+    const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
+
+    HV* invlist;
+
+    PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
+
+    /* read $swash->{LIST} */
+    if (SvPOK(*listsvp)) {
+        l = (U8*)SvPV(*listsvp, lcur);
+    }
+    else {
+        /* LIST legitimately doesn't contain a string during compilation phases
+         * of Perl itself, before the Unicode tables are generated.  In this
+         * case, just fake things up by creating an empty list */
+        l = empty;
+        lcur = 0;
+    }
+    loc = (char *) l;
+    lend = l + lcur;
+
+    /* Scan the input to count the number of lines to preallocate array size
+     * based on worst possible case, which is each line in the input creates 2
+     * elements in the inversion list: 1) the beginning of a range in the list;
+     * 2) the beginning of a range not in the list.  */
+    while ((loc = (strchr(loc, '\n'))) != NULL) {
+        elements += 2;
+        loc++;
+    }
+
+    /* If the ending is somehow corrupt and isn't a new line, add another
+     * element for the final range that isn't in the inversion list.  The
+     * byte before lend is only examined when the input is non-empty;
+     * otherwise lend - 1 would point before the start of the buffer. */
+    if (! (*lend == '\n'
+           || (*lend == '\0' && lcur > 0 && *(lend - 1) == '\n')))
+    {
+        elements++;
+    }
+
+    invlist = _new_invlist(elements);
+
+    /* Now go through the input again, adding each range to the list */
+    while (l < lend) {
+        UV start, end;
+        UV val;         /* Not used by this function */
+
+        l = S_swash_scan_list_line(aTHX_ l, lend, &start, &end, &val,
+                                         cBOOL(octets), typestr);
+
+        if (l > lend) {
+            break;
+        }
+
+        _append_range_to_invlist(invlist, start, end);
+    }
+
+    return invlist;
+}
+
+
/*
=for apidoc uvchr_to_utf8
-Adds the UTF-8 representation of the Native codepoint C<uv> to the end
+Adds the UTF-8 representation of the Native code point C<uv> to the end
of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,
/*
=for apidoc utf8n_to_uvchr
-flags
-Returns the native character value of the first character in the string
+Returns the native character value of the first character in the string
C<s>
which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
length, in bytes, of that character.
-Allows length and flags to be passed to low level routine.
+The C<curlen> and C<flags> parameters are the same as for utf8n_to_uvuni().
=cut
*/
a real function in case XS code wants it
*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
U32 flags)
{
const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
return UNI_TO_NATIVE(uv);
}
+bool
+Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
+{
+    /* May change: warns if surrogates, non-character code points, or
+     * non-Unicode code points are in s which has length len.  Returns TRUE if
+     * none found; FALSE otherwise.  The only other validity check is to make
+     * sure that no character runs past the end of the string: if one does, a
+     * "malformed" warning is raised and FALSE is returned immediately,
+     * without examining anything further. */
+
+    const U8* const e = s + len;
+    bool ok = TRUE;
+
+    PERL_ARGS_ASSERT_CHECK_UTF8_PRINT;
+
+    while (s < e) {
+
+        /* Compare the length this character claims against the bytes that
+         * actually remain from here to the end, not against the total
+         * length: otherwise a truncated final character in a longer string
+         * goes unnoticed and the code below reads past e.  (e - s) is
+         * positive here because of the loop condition. */
+        if (UTF8SKIP(s) > (STRLEN) (e - s)) {
+            Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+                           "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
+            return FALSE;
+        }
+
+        /* Bytes below this threshold can't start any of the code points
+         * checked for below, so skip the detailed tests for them */
+        if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+            STRLEN char_len;
+            if (UTF8_IS_SUPER(s)) {
+                UV uv = utf8_to_uvchr(s, &char_len);
+                Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                    "Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
+                ok = FALSE;
+            }
+            else if (UTF8_IS_SURROGATE(s)) {
+                UV uv = utf8_to_uvchr(s, &char_len);
+                Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                    "Unicode surrogate U+%04"UVXf" is illegal in UTF-8", uv);
+                ok = FALSE;
+            }
+            else if
+                (UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
+            {
+                UV uv = utf8_to_uvchr(s, &char_len);
+                Perl_warner(aTHX_ packWARN(WARN_UTF8),
+                    "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+                ok = FALSE;
+            }
+        }
+
+        /* Keep scanning after a problematic code point so that every one in
+         * the string gets its own warning */
+        s += UTF8SKIP(s);
+    }
+
+    return ok;
+}
+
+
/*
=for apidoc pv_uni_display
}
if (truncated)
sv_catpvs(dsv, "...");
-
+
return SvPVX(dsv);
}
}
/*
-=for apidoc ibcmp_utf8
+=for apidoc foldEQ_utf8
-Returns true if the strings s1 and s2 differ case-insensitively, false
-if they are equal case-insensitively. Note that this is the complement of what
-you might expect (perhaps it would have been better to name it C<ibncmp_utf8>).
+Returns true if the leading portions of the strings s1 and s2 (either or both
+of which may be in UTF-8) are the same case-insensitively; false otherwise.
+How far into the strings to compare is determined by other input parameters.
If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
with respect to s2.
-If the byte length l1 is non-zero, s1+l1 will be used as a goal to reach. The
+If the byte length l1 is non-zero, it says how far into s1 to check for fold
+equality. In other words, s1+l1 will be used as a goal to reach. The
scan will not be considered to be a match unless the goal is reached, and
scanning won't continue past that goal. Correspondingly for l2 with respect to
s2.
considered an end pointer beyond which scanning of s1 will not continue under
any circumstances. This means that if both l1 and pe1 are specified, and pe1
is less than s1+l1, the match will never be successful because it can never
-get as far as its goal. Correspondingly for pe2 with respect to s2.
+get as far as its goal (in fact, this combination is asserted against).
+Correspondingly for pe2 with respect to s2.
-At least one of s1 and s2 must have a goal, and if both do, both have to be
+At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
+non-zero), and if both do, both have to be
reached for a successful match. Also, if the fold of a character is multiple
characters, all of them must be matched (see tr21 reference below for
'folding').
-Upon a successful match (when the routine returns false), if pe1 is non-NULL,
+Upon a successful match, if pe1 is non-NULL,
it will be set to point to the beginning of the I<next> character of s1 beyond
what was matched. Correspondingly for pe2 and s2.
http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
=cut */
+
+/* A flags parameter has been added which may change, and hence isn't
+ * externally documented. Currently it is:
+ * 0 for as-documented above
+ * FOLDEQ_UTF8_NOMIX_ASCII meaning that an ASCII character never matches a
+ non-ASCII one, even if the non-ASCII one folds to ASCII
+ */
I32
-Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
+Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2, U32 flags)
{
dVAR;
register const U8 *p1 = (const U8*)s1; /* Point to current char */
register const U8 *p2 = (const U8*)s2;
- register const U8 *g1 = NULL; /* goal for s1 */
+ register const U8 *g1 = NULL; /* goal for s1 */
register const U8 *g2 = NULL;
- register const U8 *e1 = NULL; /* Don't scan s1 past this */
- register U8 *f1 = NULL; /* Point to current folded */
+ register const U8 *e1 = NULL; /* Don't scan s1 past this */
+ register U8 *f1 = NULL; /* Point to current folded */
register const U8 *e2 = NULL;
register U8 *f2 = NULL;
- STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
+ STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
- U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
- these always fit in 2 bytes */
+ U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
+ these always fit in 2 bytes */
- PERL_ARGS_ASSERT_IBCMP_UTF8;
+ PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
if (pe1) {
- e1 = *(U8**)pe1;
+ e1 = *(U8**)pe1;
}
if (l1) {
- g1 = (const U8*)s1 + l1;
+ g1 = (const U8*)s1 + l1;
}
if (pe2) {
- e2 = *(U8**)pe2;
+ e2 = *(U8**)pe2;
}
if (l2) {
- g2 = (const U8*)s2 + l2;
+ g2 = (const U8*)s2 + l2;
}
/* Must have at least one goal */
if (g1) {
- /* Will never match if goal is out-of-bounds */
- assert(! e1 || e1 >= g1);
+ /* Will never match if goal is out-of-bounds */
+ assert(! e1 || e1 >= g1);
- /* Here, there isn't an end pointer, or it is beyond the goal. We
- * only go as far as the goal */
- e1 = g1;
+ /* Here, there isn't an end pointer, or it is beyond the goal. We
+ * only go as far as the goal */
+ e1 = g1;
+ }
+ else {
+ assert(e1); /* Must have an end for looking at s1 */
}
- else assert(e1); /* Must have an end for looking at s1 */
/* Same for goal for s2 */
if (g2) {
- assert(! e2 || e2 >= g2);
- e2 = g2;
+ assert(! e2 || e2 >= g2);
+ e2 = g2;
+ }
+ else {
+ assert(e2);
}
- else assert(e2);
/* Look through both strings, a character at a time */
while (p1 < e1 && p2 < e2) {
- /* If at the beginning of a new character in s1, get its fold to use */
- if (n1 == 0) {
- if (u1) {
- to_utf8_fold(p1, foldbuf1, &n1);
- }
- else { /* Not utf8, convert to it first and then get fold */
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
- to_utf8_fold(natbuf, foldbuf1, &n1);
- }
- f1 = foldbuf1;
- }
+ /* If at the beginning of a new character in s1, get its fold to use
+ * and the length of the fold */
+ if (n1 == 0) {
+ if (isASCII(*p1)) {
- if (n2 == 0) { /* Same for s2 */
- if (u2) {
- to_utf8_fold(p2, foldbuf2, &n2);
- }
- else {
- uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
- to_utf8_fold(natbuf, foldbuf2, &n2);
+ /* But if not to mix non- with ASCII, fail */
+ if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
+ return 0;
+ }
+ n1 = 1;
+ *foldbuf1 = toLOWER(*p1); /* ASCII range fold is lowercase */
}
- f2 = foldbuf2;
- }
+ else if (u1) {
+ to_utf8_fold(p1, foldbuf1, &n1);
+ }
+ else { /* Not utf8, convert to it first and then get fold */
+ uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
+ to_utf8_fold(natbuf, foldbuf1, &n1);
+ }
+ f1 = foldbuf1;
+ }
- /* While there is more to look for in both folds, see if they
- * continue to match */
- while (n1 && n2) {
- U8 fold_length = UTF8SKIP(f1);
- if (fold_length != UTF8SKIP(f2)
- || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
- function call for single
- character */
- || memNE((char*)f1, (char*)f2, fold_length))
- {
- return 1; /* mismatch */
+        if (n2 == 0) {    /* Same for s2 */
+            if (isASCII(*p2)) {
+
+                /* Test the specific flag bit, exactly as the s1 case does,
+                 * rather than "any flag set", so that this keeps working if
+                 * further flag bits are ever defined */
+                if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
+                    return 0;
+                }
+                n2 = 1;
+                *foldbuf2 = toLOWER(*p2);   /* ASCII range fold is lowercase */
+            }
+            else if (u2) {
+                to_utf8_fold(p2, foldbuf2, &n2);
+            }
+            else {  /* Not utf8, convert to it first and then get fold */
+                uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
+                to_utf8_fold(natbuf, foldbuf2, &n2);
+            }
+            f2 = foldbuf2;
+        }
- /* Here, they matched, advance past them */
- n1 -= fold_length;
- f1 += fold_length;
- n2 -= fold_length;
- f2 += fold_length;
- }
+ /* While there is more to look for in both folds, see if they
+ * continue to match */
+ while (n1 && n2) {
+ U8 fold_length = UTF8SKIP(f1);
+ if (fold_length != UTF8SKIP(f2)
+ || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
+ function call for single
+ character */
+ || memNE((char*)f1, (char*)f2, fold_length))
+ {
+ return 0; /* mismatch */
+ }
+
+ /* Here, they matched, advance past them */
+ n1 -= fold_length;
+ f1 += fold_length;
+ n2 -= fold_length;
+ f2 += fold_length;
+ }
- /* When reach the end of any fold, advance the input past it */
- if (n1 == 0) {
- p1 += u1 ? UTF8SKIP(p1) : 1;
- }
- if (n2 == 0) {
- p2 += u2 ? UTF8SKIP(p2) : 1;
- }
+ /* When reach the end of any fold, advance the input past it */
+ if (n1 == 0) {
+ p1 += u1 ? UTF8SKIP(p1) : 1;
+ }
+ if (n2 == 0) {
+ p2 += u2 ? UTF8SKIP(p2) : 1;
+ }
} /* End of loop through both strings */
/* A match is defined by each scan that specified an explicit length
* character (which can happen when the fold of a character is more than one
* character). */
if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
- return 1;
+ return 0;
}
/* Successful match. Set output pointers */
if (pe1) {
- *pe1 = (char*)p1;
+ *pe1 = (char*)p1;
}
if (pe2) {
- *pe2 = (char*)p2;
+ *pe2 = (char*)p2;
}
- return 0;
+ return 1;
}
/*