PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
- Newx(d, (*lenp) * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
dst = d;
while (s < send) {
*d = '\0';
*lenp = d-dst;
- /* Trim unused space */
- Renew(dst, *lenp + 1, U8);
-
return dst;
}
if (instr(file, "mathoms.c")) {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s()"
+ "In %s, line %d, starting in Perl v5.32, %s()"
" will be removed. Avoid this message by"
" converting to use %s().\n",
file, line, name, alternative);
}
else {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s() will"
+ "In %s, line %d, starting in Perl v5.32, %s() will"
" require an additional parameter. Avoid this"
" message by converting to use %s().\n",
file, line, name, alternative);
* works. */
*lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
- Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
ustrp, *lenp, U8);
return LATIN_SMALL_LETTER_LONG_S;
* that effect. However, if the caller knows what
* it's doing, it can pass this flag to indicate that,
* and the assertion is skipped.
- * FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ * to s2, and s2 doesn't have to be UTF-8 encoded.
+ * This introduces an asymmetry to save a few branches
+ * in a loop. Currently, this is not a problem, as the
+ * two inputs are never both pre-folded. Simply call this
+ * function with the pre-folded one as the second
+ * string.
* FOLDEQ_S2_FOLDS_SANE
*/
I32
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
- assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S1_FOLDS_SANE))
- || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+ assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+ && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
flags_for_folder |= FOLD_FLAGS_LOCALE;
}
}
+ if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+ flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+ }
if (pe1) {
e1 = *(U8**)pe1;
if (n2 == 0) { /* Same for s2 */
if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
- f2 = (U8 *) p2;
- assert(u2);
- n2 = UTF8SKIP(f2);
+
+ /* Point to the already-folded character. But for non-UTF-8
+ * variants, convert to UTF-8 for the algorithm below */
+ if (UTF8_IS_INVARIANT(*p2)) {
+ f2 = (U8 *) p2;
+ n2 = 1;
+ }
+ else if (u2) {
+ f2 = (U8 *) p2;
+ n2 = UTF8SKIP(f2);
+ }
+ else {
+ foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+ foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+ f2 = foldbuf2;
+ n2 = 2;
+ }
}
else {
if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {