Convert sv_eq_flags() and sv_cmp_flags() to use it.
Previously, to compare two strings of characters, where was was in UTF-8, and
one was not, you had to either:
1: Upgrade the second to UTF-8
2: Compare the resulting octet sequence
3: Free the temporary UTF-8 string
or:
1: Attempt to downgrade the first to bytes. If it can't be, they aren't equal
2: Else compare the resulting octet sequence
3: Free the temporary byte string
Which for the general case involves a malloc()/free() and at least two O(n)
scans per comparison.
Whereas this approach has no allocation, a single O(n) scan, which terminates
as early as the best case for the second approach.
ext/XS-APItest/t/swaptwostmts.t test recursive descent statement parsing
ext/XS-APItest/t/temp_lv_sub.t XS::APItest: tests for lvalue subs returning temps
ext/XS-APItest/t/utf16_to_utf8.t Test behaviour of utf16_to_utf8{,reversed}
+ext/XS-APItest/t/utf8.t Tests for code in utf8.c
ext/XS-APItest/t/xs_special_subs_require.t for require too
ext/XS-APItest/t/xs_special_subs.t Test that XS BEGIN/CHECK/INIT/END work
ext/XS-APItest/t/xsub_h.t Tests for XSUB.h
ApdPR |IV |utf8_distance |NN const U8 *a|NN const U8 *b
ApdPR |U8* |utf8_hop |NN const U8 *s|I32 off
ApMd |U8* |utf8_to_bytes |NN U8 *s|NN STRLEN *len
+Apd |int |bytes_cmp_utf8 |NN const U8 *b|STRLEN blen|NN const U8 *u \
+ |STRLEN ulen
ApMd |U8* |bytes_from_utf8|NN const U8 *s|NN STRLEN *len|NULLOK bool *is_utf8
ApMd |U8* |bytes_to_utf8 |NN const U8 *s|NN STRLEN *len
Apd |UV |utf8_to_uvchr |NN const U8 *s|NULLOK STRLEN *retlen
#define av_undef(a) Perl_av_undef(aTHX_ a)
#define av_unshift(a,b) Perl_av_unshift(aTHX_ a,b)
#define block_gimme() Perl_block_gimme(aTHX)
+#define bytes_cmp_utf8(a,b,c,d) Perl_bytes_cmp_utf8(aTHX_ a,b,c,d)
#define bytes_from_utf8(a,b,c) Perl_bytes_from_utf8(aTHX_ a,b,c)
#define bytes_to_utf8(a,b) Perl_bytes_to_utf8(aTHX_ a,b)
#define call_argv(a,b,c) Perl_call_argv(aTHX_ a,b,c)
INCLUDE: numeric.xs
+MODULE = XS::APItest::utf8 PACKAGE = XS::APItest::utf8
+
+int
+bytes_cmp_utf8(bytes, utf8)
+ SV *bytes
+ SV *utf8
+ PREINIT:
+ const U8 *b;
+ STRLEN blen;
+ const U8 *u;
+ STRLEN ulen;
+ CODE:
+ b = (const U8 *)SvPVbyte(bytes, blen);
+ u = (const U8 *)SvPVbyte(utf8, ulen);
+ RETVAL = bytes_cmp_utf8(b, blen, u, ulen);
+ OUTPUT:
+ RETVAL
+
MODULE = XS::APItest:Overload PACKAGE = XS::APItest::Overload
SV *
--- /dev/null
+#!perl -w
+
+use strict;
+use Test::More;
+
+use XS::APItest;
+
+foreach ([0, '', '', 'empty'],
+ [0, 'N', 'N', '1 char'],
+ [1, 'NN', 'N', '1 char substring'],
+ [-2, 'Perl', 'Rules', 'different'],
+ [0, chr 163, chr 163, 'pound sign'],
+ [1, chr (163) . 10, chr (163) . 1, '10 pounds is more than 1 pound'],
+ [1, chr(163) . chr(163), chr 163, '2 pound signs are more than 1'],
+ [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
+ [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
+ ) {
+ my ($expect, $left, $right, $desc) = @$_;
+ my $copy = $right;
+ utf8::encode($copy);
+ is(bytes_cmp_utf8($left, $copy), $expect, $desc);
+ next if $right =~ tr/\0-\377//c;
+ utf8::encode($left);
+ is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
+}
+
+done_testing;
Perl_av_unshift
Perl_block_gimme
Perl_blockhook_register
+Perl_bytes_cmp_utf8
Perl_bytes_from_utf8
Perl_bytes_to_utf8
Perl_call_argv
PERL_CALLCONV void Perl_boot_core_PerlIO(pTHX);
PERL_CALLCONV void Perl_boot_core_UNIVERSAL(pTHX);
PERL_CALLCONV void Perl_boot_core_mro(pTHX);
+PERL_CALLCONV int Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+ __attribute__nonnull__(pTHX_1)
+ __attribute__nonnull__(pTHX_3);
+#define PERL_ARGS_ASSERT_BYTES_CMP_UTF8 \
+ assert(b); assert(u)
+
PERL_CALLCONV U8* Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
__attribute__nonnull__(pTHX_1)
__attribute__nonnull__(pTHX_2);
}
}
else {
- bool is_utf8 = TRUE;
-
if (SvUTF8(sv1)) {
- /* sv1 is the UTF-8 one,
- * if is equal it must be downgrade-able */
- char * const pv = (char*)bytes_from_utf8((const U8*)pv1,
- &cur1, &is_utf8);
- if (pv != pv1)
- pv1 = tpv = pv;
+ /* sv1 is the UTF-8 one */
+ return bytes_cmp_utf8((const U8*)pv2, cur2,
+ (const U8*)pv1, cur1) == 0;
}
else {
- /* sv2 is the UTF-8 one,
- * if is equal it must be downgrade-able */
- char * const pv = (char *)bytes_from_utf8((const U8*)pv2,
- &cur2, &is_utf8);
- if (pv != pv2)
- pv2 = tpv = pv;
- }
- if (is_utf8) {
- /* Downgrade not possible - cannot be eq */
- assert (tpv == 0);
- return FALSE;
+ /* sv2 is the UTF-8 one */
+ return bytes_cmp_utf8((const U8*)pv1, cur1,
+ (const U8*)pv2, cur2) == 0;
}
}
}
pv2 = SvPV_const(svrecode, cur2);
}
else {
- pv2 = tpv = (char*)bytes_to_utf8((const U8*)pv2, &cur2);
+ const int retval = -bytes_cmp_utf8((const U8*)pv2, cur2,
+ (const U8*)pv1, cur1);
+ return retval ? retval < 0 ? -1 : +1 : 0;
}
}
else {
pv1 = SvPV_const(svrecode, cur1);
}
else {
- pv1 = tpv = (char*)bytes_to_utf8((const U8*)pv1, &cur1);
+ const int retval = bytes_cmp_utf8((const U8*)pv1, cur1,
+ (const U8*)pv2, cur2);
+ return retval ? retval < 0 ? -1 : +1 : 0;
}
}
}
#
# PLEASE DO NOT ADD TO THIS LIST. Instead, write an entry in
# pod/perldiag.pod for your new (warning|error).
+
+# Also FIXME this test, as the first entry in TODO *is* covered by the
+# description: Malformed UTF-8 character (%s)
__DATA__
+Malformed UTF-8 character (unexpected non-continuation byte 0x%02x, immediately after start byte 0x%02x)
+
%s (%d) does not match %s (%d),
%s (%d) smaller than %s (%d),
Argument "%s" isn't numeric
}
/*
+=for apidoc bytes_cmp_utf8
+
+Compares the sequence of characters (stored as octets) in b, blen with the
+sequence of characters (stored as UTF-8) in u, ulen. Returns 0 if they are
+equal, -1 or -2 if the first string is less than the second string, +1 or +2
+if the first string is greater than the second string.
+
+-1 or +1 is returned if the shorter string was identical to the start of the
+longer string. -2 or +2 is returned if the was a difference between characters
+within the strings.
+
+=cut
+*/
+
+int
+Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
+{
+ const U8 *const bend = b + blen;
+ const U8 *const uend = u + ulen;
+
+ PERL_ARGS_ASSERT_BYTES_CMP_UTF8;
+
+ PERL_UNUSED_CONTEXT;
+
+ while (b < bend && u < uend) {
+ U8 c = *u++;
+ if (!UTF8_IS_INVARIANT(c)) {
+ if (UTF8_IS_DOWNGRADEABLE_START(c)) {
+ if (u < uend) {
+ U8 c1 = *u++;
+ if (UTF8_IS_CONTINUATION(c1)) {
+ c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1);
+ c = ASCII_TO_NATIVE(c);
+ } else {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "Malformed UTF-8 character "
+ "(unexpected non-continuation byte 0x%02x"
+ ", immediately after start byte 0x%02x)"
+ /* Dear diag.t, it's in the pod. */
+ "%s%s", c1, c,
+ PL_op ? " in " : "",
+ PL_op ? OP_DESC(PL_op) : "");
+ return -2;
+ }
+ } else {
+ if (PL_op)
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
+ "%s in %s", unees, OP_DESC(PL_op));
+ else
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
+ return -2; /* Really want to return undef :-) */
+ }
+ } else {
+ return -2;
+ }
+ }
+ if (*b != c) {
+ return *b < c ? -2 : +2;
+ }
+ ++b;
+ }
+
+ if (b == bend && u == uend)
+ return 0;
+
+ return b < bend ? +1 : -1;
+}
+
+/*
=for apidoc utf8_to_bytes
Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.