AnidR |bool |is_utf8_invariant_string_loc|NN const U8* const s \
|STRLEN len \
|NULLOK const U8 ** ep
+#if defined(PERL_CORE) || defined(PERL_EXT)
+EinR |Size_t |variant_under_utf8_count|NN const U8* const s \
+ |NN const U8* const e
+#endif
AmnpdRP |bool |is_ascii_string|NN const U8* const s|const STRLEN len
AmnpdRP |bool |is_invariant_string|NN const U8* const s|STRLEN len
#if defined(PERL_CORE) || defined (PERL_EXT)
#define is_utf8_non_invariant_string S_is_utf8_non_invariant_string
#define sv_or_pv_pos_u2b(a,b,c,d) S_sv_or_pv_pos_u2b(aTHX_ a,b,c,d)
# endif
+# if defined(PERL_CORE) || defined(PERL_EXT)
+#define variant_under_utf8_count S_variant_under_utf8_count
+# endif
# if defined(PERL_IN_REGCOMP_C)
#define _make_exactf_invlist(a,b) S__make_exactf_invlist(aTHX_ a,b)
#define add_above_Latin1_folds(a,b,c) S_add_above_Latin1_folds(aTHX_ a,b,c)
RETVAL
STRLEN
+test_variant_under_utf8_count(unsigned char *s, STRLEN offset, STRLEN len)
+ PREINIT:
+ PERL_UINTMAX_T * copy;
+ CODE:
+ Newx(copy, 1 + ((len + WORDSIZE - 1) / WORDSIZE), PERL_UINTMAX_T);
+ Copy(s, (U8 *) copy + offset, len, U8);
+ RETVAL = variant_under_utf8_count((U8 *) copy + offset, (U8 *) copy + offset + len);
+ Safefree(copy);
+ OUTPUT:
+ RETVAL
+
+STRLEN
test_utf8_length(unsigned char *s, STRLEN offset, STRLEN len)
CODE:
RETVAL = utf8_length(s + offset, s + len);
}
}
+# Now work on variant_under_utf8_count().
+pass("The tests below are for variant_under_utf8_count() with string"
+ . " starting $offset bytes after a word boundary");
+is(test_variant_under_utf8_count($all_invariants, $offset,
+ length $all_invariants),
+ 0,
+ "$display_all_invariants contains 0 variants");
+
+# First, put a variant in each possible position in the flanking partial words
+for my $pos (0 .. $word_length - $offset,
+ 2 * $word_length .. length($all_invariants) - 1)
+{
+ my $test_string = $all_invariants;
+ my $test_display = $display_all_invariants;
+
+ substr($test_string, $pos, 1) = $variant;
+ substr($test_display, $pos * 2, 2) = $display_variant;
+ is(test_variant_under_utf8_count($test_string, $offset, length $test_string),
+ 1,
+ "$test_display contains 1 variant");
+}
+
+# Then try all possible combinations of variant/invariant in the full word in
+# the middle (We've already tested the case with 0 variants, so start at 1.)
+for my $bit_pattern (1 .. (1 << $word_length) - 1) {
+ my $bits = $bit_pattern;
+ my $display_word = "";
+ my $test_word = "";
+ my $count = 0;
+
+ # Every 1 bit gets the variant for this particular $bit_pattern.
+ for my $bit (0 .. 7) {
+ if ($bits & 1) {
+ $count++;
+ $test_word .= $variant;
+ $display_word .= $display_variant;
+ }
+ else {
+ $test_word .= $invariant;
+ $display_word .= $display_invariant;
+ }
+ $bits >>= 1;
+ }
+
+ my $test_string = $variant x ($word_length - 1)
+ . $test_word
+ . $variant x ($word_length - 1);
+ my $display_string = $display_variant x ($word_length - 1)
+ . $display_word
+ . $display_variant x ($word_length - 1);
+ my $expected_count = $count + 2 * $word_length - 2;
+ is(test_variant_under_utf8_count($test_string, $offset,
+ length $test_string), $expected_count,
+ "$display_string contains $expected_count variants");
+}
+
+
my $pound_sign = chr utf8::unicode_to_native(163);
# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
} while (x + PERL_WORDSIZE <= send);
}
-# undef PERL_WORDSIZE
-# undef PERL_WORD_BOUNDARY_MASK
-# undef PERL_VARIANTS_WORD_MASK
#endif
/* Process per-byte */
return TRUE;
}
+#if defined(PERL_CORE) || defined(PERL_EXT)
+
+/*
+=for apidoc variant_under_utf8_count
+
+This function looks at the sequence of bytes between C<s> and C<e>, which are
+assumed to be encoded in ASCII/Latin1, and returns how many of them would
+change should the string be translated into UTF-8. Due to the nature of UTF-8,
+each of these would occupy two bytes instead of the single one in the input
+string. Thus, this function returns the precise number of bytes the string
+would expand by when translated to UTF-8.
+
+Unlike most of the other functions that have C<utf8> in their name, the input
+to this function is NOT a UTF-8-encoded string. The function name is slightly
+I<odd> to emphasize this.
+
+This function is internal to Perl because khw thinks that any XS code that
+would want this is probably operating too close to the internals. Presenting a
+valid use case could change that.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>
+and
+C<L<perlapi/is_utf8_invariant_string_loc>>,
+
+=cut
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_variant_under_utf8_count(const U8* const s, const U8* const e)
+{
+ const U8* x = s;
+ Size_t count = 0;
+
+ PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT;
+
+# ifndef EBCDIC
+
+ if ((STRLEN) (e - x) >= PERL_WORDSIZE
+ + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
+ - (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
+ {
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK) {
+ count += ! UTF8_IS_INVARIANT(*x++);
+ }
+
+ /* Process per-word as long as we have at least a full word left */
+ do {
+
+ /* It's easier to look at a 16-bit word size to see how this works.
+ * The expression would be:
+ *
+ * (((*x & 0x8080) >> 7) * 0x0101) >> 8;
+ *
+ * Suppose the value of *x is the 16 bits
+ *
+ * 0by_______z_______
+ *
+ * where the 14 bits represented by '_' could be any combination of
+ * 0's or 1's (we don't care), and 'y' is the high bit of one byte,
+ * and 'z' is the high bit for the other (endianness doesn't
+ * matter). On ASCII platforms a byte is variant if the high bit
+ * is set; invariant otherwise. Thus, our goal, the count of
+ * variants in this 2-byte word is
+ *
+ * y + z
+ *
+ * To turn 0by_______z_______ into (y + z) we mask the intial value
+ * with 0x8080 to turn it into
+ *
+ * 0by0000000z0000000
+ *
+ * Then right shifting by 7 yields
+ *
+ * 0by0000000z
+ *
+ * Viewed as a number, this is
+ *
+ * 2**8 * y + z
+ *
+ * We then multiply by 0x0101 (which is = 2**8 + 1), so
+ *
+ * (2**8 * y + z) * (2**8 + 1)
+ * = (2**8 * y * 2**8) + (z * 2**8) + (2**8 * y * 1) + (z * 1)
+ * = (2**16 * y) + (2**8 * (y + z)) + z
+ *
+ * However (2**16 * y) doesn't fit in a 16-bit word (unless 'y' is
+ * zero in which case it is 0), and since this is unsigned
+ * multiplication, the C standard says that this component just
+ * gets ignored, so we are left with
+ *
+ * = 2**8 * (y + z) + z
+ *
+ * We then shift right by 8 bits, which divides by 2**8, and gets
+ * rid of the lone 'z', leaving us with
+ *
+ * = y + z
+ *
+ * The same principles apply for longer word sizes. For 32 bit
+ * words we end up with
+ *
+ * = 2**24 * (w + x + y + z) + (lots of other expressions
+ * below 2**24)
+ *
+ * with anything above 2**24 having overflowed and been chopped
+ * off. Shifting right by 24 yields (w + x + y + z)
+ */
+
+ count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+ * PERL_COUNT_MULTIPLIER)
+ >> ((PERL_WORDSIZE - 1) * CHARBITS);
+ x += PERL_WORDSIZE;
+ } while (x + PERL_WORDSIZE <= e);
+ }
+
+# endif
+
+ /* Process per-byte */
+ while (x < e) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ count++;
+ }
+
+ x++;
+ }
+
+ return count;
+}
+
+#endif
+
+#undef PERL_WORDSIZE
+#undef PERL_COUNT_MULTIPLIER
+#undef PERL_WORD_BOUNDARY_MASK
+#undef PERL_VARIANTS_WORD_MASK
+
/*
=for apidoc is_utf8_string
assert(sv); assert(pv)
#endif
#endif
+#if defined(PERL_CORE) || defined(PERL_EXT)
+#ifndef PERL_NO_INLINE_FUNCTIONS
+PERL_STATIC_INLINE Size_t S_variant_under_utf8_count(const U8* const s, const U8* const e)
+ __attribute__warn_unused_result__;
+#define PERL_ARGS_ASSERT_VARIANT_UNDER_UTF8_COUNT \
+ assert(s); assert(e)
+#endif
+
+#endif
#if defined(PERL_CR_FILTER)
# if defined(PERL_IN_TOKE_C)
STATIC I32 S_cr_textfilter(pTHX_ int idx, SV *sv, int maxlen);