is, if they are invariant. On ASCII-ish machines, only ASCII characters
fit this definition, hence the function's name.
+If C<len> is 0, it will be calculated using C<strlen(s)>.
+
See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
=cut
=for apidoc is_utf8_string
Returns true if first C<len> bytes of the given string form a valid
-UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
-not mean 'a string that contains code points above 0x7F encoded in UTF-8'
-because a valid ASCII string is a valid UTF-8 string.
+UTF-8 string, false otherwise. If C<len> is 0, it will be calculated
+using C<strlen(s)>. Note that 'a valid UTF-8 string' does not mean 'a
+string that contains code points above 0x7F encoded in UTF-8' because a
+valid ASCII string is a valid UTF-8 string.
See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
}
/*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if the was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+ const U8 *const bend = b + blen;
+ const U8 *const uend = u + ulen;
+
+ PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+ PERL_UNUSED_CONTEXT;
+
+ while (b < bend && u < uend) {
+ U8 c = *u++;
+ if (!UTF8_IS_INVARIANT(c)) {
+ if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+ if (u < uend) {
+ U8 c1 = *u++;
+ if (UTF8_IS_CONTINUATION(c1)) {
+ c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
+ } else {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "Malformed UTF-8 character "
+ "(unexpected non-continuation byte 0x%02x"
+ ", immediately after start byte 0x%02x)"
+ /* Dear diag.t, it's in the pod. */
+ "%s%s", c1, c,
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
+ return -2;
+ }
+ } else {
+ if (PL_op)
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "%s in %s", unees, OP_DESC(PL_op));
+ else
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+ return -2; /* Really want to return undef :-) */
+ }
+ } else {
+ return -2;
+ }
+ }
+ if (*b != c) {
+ return *b < c ? -2 : +2;
+ }
+ ++b;
+ }
+
+ if (b == bend && u == uend)
+ return 0;
+
+ return b < bend ? +1 : -1;
+}
+
+/*
=for apidoc utf8_to_bytes
Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
U8 c = *s++;
if (!UTF8_IS_INVARIANT(c)) {
/* Then it is two-byte encoded */
- c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
- c = ASCII_TO_NATIVE(c);
+ c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
}
*d++ = c;
}
/*
=for apidoc bytes_to_utf8
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
A NUL character will be written after the end of the string.
* return several Unicode characters for a single Unicode character
* (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
* the lower-level routine, and it is similarly broken for returning
- * multiple values. --jhi */
+ * multiple values. --jhi
+ * For those, you should use to_utf8_case() instead */
/* Now SWASHGET is recasted into S_swash_get in this file. */
/* Note: