ADMpR |bool |is_uni_punct_lc|UV c
ADMpPR |bool |is_uni_xdigit_lc|UV c
AndmoR |bool |is_utf8_invariant_string|NN const U8* const s \
- |STRLEN const len
+ |STRLEN len
AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
- |STRLEN const len \
+ |STRLEN len \
|NULLOK const U8 ** ep
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|const STRLEN len
use warnings;
use Carp;
-our $VERSION = '0.93';
+our $VERSION = '0.94';
require XSLoader;
RETVAL
AV *
+test_is_utf8_invariant_string_loc(char *s, STRLEN offset, STRLEN len)
+ PREINIT:
+ AV *av;
+ const U8 * ep;
+ CODE:
+ av = newAV();
+ av_push(av, newSViv(is_utf8_invariant_string_loc((U8 *) s + offset, len, &ep)));
+ av_push(av, newSViv(ep - ((U8 *) s + offset)));
+ RETVAL = av;
+ OUTPUT:
+ RETVAL
+
+AV *
test_is_utf8_string_loc(char *s, STRLEN len)
PREINIT:
AV *av;
use XS::APItest;
+
+my $s = "A" x 100 ;
+my $ret_ref = test_is_utf8_invariant_string_loc($s, 0, length $s);
+is($ret_ref->[0], 1, "is_utf8_invariant_string_loc returns TRUE for invariant");
+
+my $above_word_length = 9;
+for my $initial (0 .. $above_word_length) {
+ for my $offset (0 .. $above_word_length) {
+ for my $trailing (0 .. $above_word_length) {
+ if ($initial >= $offset) {
+ my $variant_pos = $initial - $offset;
+ $s = "A" x $initial . "\x80" . "A" x $trailing;
+ my $ret_ref = test_is_utf8_invariant_string_loc($s, $offset,
+ length $s);
+ is($ret_ref->[0], 0, "is_utf8_invariant_string_loc returns"
+ . " FALSE for variant at $variant_pos,"
+ . " first $offset ignored)");
+ is($ret_ref->[1], $variant_pos,
+ " And returns the correct position");
+ }
+ }
+ }
+}
+
my $pound_sign = chr utf8::unicode_to_native(163);
# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
=cut
-XXX On ASCII machines this could be sped up by doing word-at-a-time operations
-
*/
PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string_loc(const U8* const s, const STRLEN len, const U8 ** ep)
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* send;
const U8* x = s;
PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
- while (x < send) {
- if (UTF8_IS_INVARIANT(*x)) {
- x++;
- continue;
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
+
+ send = s + len;
+
+#ifndef EBCDIC
+ /* Try to get the widest word on this platform */
+# ifdef HAS_LONG_LONG
+# define PERL_WORDCAST unsigned long long
+# define PERL_WORDSIZE LONGLONGSIZE
+# else
+# define PERL_WORDCAST UV
+# define PERL_WORDSIZE UVSIZE
+# endif
+
+# if PERL_WORDSIZE == 4
+# define PERL_VARIANTS_WORD_MASK 0x80808080
+# define PERL_WORD_BOUNDARY_MASK 0x3
+# elif PERL_WORDSIZE == 8
+# define PERL_VARIANTS_WORD_MASK 0x8080808080808080
+# define PERL_WORD_BOUNDARY_MASK 0x7
+# else
+# error Unexpected word size
+# endif
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
}
+ x++;
+ }
+
+ /* Process per-word as long as we have at least a full word left */
+ while (x + PERL_WORDSIZE <= send) {
+ if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
+
+ /* Found a variant. Just return if caller doesn't want its exact
+ * position */
+ if (! ep) {
+ return FALSE;
+ }
- if (ep) {
- *ep = x;
+ /* Otherwise fall into final loop to find which byte it is */
+ break;
}
+ x += PERL_WORDSIZE;
+ }
- return FALSE;
+# undef PERL_WORDCAST
+# undef PERL_WORDSIZE
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
+
+ /* Process per-byte */
+ while (x < send) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
+ }
+
+ x++;
}
return TRUE;
#define PERL_ARGS_ASSERT_IS_UTF8_IDFIRST \
assert(p)
-/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN const len)
+/* PERL_CALLCONV bool is_utf8_invariant_string(const U8* const s, STRLEN len)
__attribute__warn_unused_result__; */
#ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN const len, const U8 ** ep)
+PERL_STATIC_INLINE bool S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
__attribute__warn_unused_result__;
#define PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC \
assert(s)