"Use of code point 0x%" UVXf " is not allowed; the"
" permissible max is 0x%" UVXf;
-#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
* performance hit on these high EBCDIC code points. */
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
- if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+ if (UNLIKELY(uv > MAX_LEGAL_CP)) {
+ Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
- Newx(d, (*lenp) * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
dst = d;
while (s < send) {
*d = '\0';
*lenp = d-dst;
- /* Trim unused space */
- Renew(dst, *lenp + 1, U8);
-
return dst;
}
if (instr(file, "mathoms.c")) {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s()"
+ "In %s, line %d, starting in Perl v5.32, %s()"
" will be removed. Avoid this message by"
" converting to use %s().\n",
file, line, name, alternative);
}
else {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s() will"
+ "In %s, line %d, starting in Perl v5.32, %s() will"
" require an additional parameter. Avoid this"
" message by converting to use %s().\n",
file, line, name, alternative);
}
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
- if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+ if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_EXTERNALLY_LEGAL_CP);
+ MAX_LEGAL_CP);
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
- if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
+ if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
}
else
#endif
- if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
+ if (memBEGINs((char *) p, e - p, LONG_S_T))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
* 255/256 boundary which is forbidden under /l, and so the code
* wouldn't catch that they are equivalent (which they are only in
* this release) */
- else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
+ else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
* works. */
*lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
- Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
ustrp, *lenp, U8);
return LATIN_SMALL_LETTER_LONG_S;
* that effect. However, if the caller knows what
* it's doing, it can pass this flag to indicate that,
* and the assertion is skipped.
- * FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ * to s2, and s2 doesn't have to be UTF-8 encoded.
+ * This introduces an asymmetry to save a few branches
+ * in a loop. Currently, this is not a problem, as
+ * both inputs are never pre-folded. Simply call this
+ * function with the pre-folded one as the second
+ * string.
* FOLDEQ_S2_FOLDS_SANE
*/
I32
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
- assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S1_FOLDS_SANE))
- || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+ assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+ && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
flags_for_folder |= FOLD_FLAGS_LOCALE;
}
}
+ if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+ flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+ }
if (pe1) {
e1 = *(U8**)pe1;
if (n2 == 0) { /* Same for s2 */
if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
- f2 = (U8 *) p2;
- assert(u2);
- n2 = UTF8SKIP(f2);
+
+ /* Point to the already-folded character. But for non-UTF-8
+ * variants, convert to UTF-8 for the algorithm below */
+ if (UTF8_IS_INVARIANT(*p2)) {
+ f2 = (U8 *) p2;
+ n2 = 1;
+ }
+ else if (u2) {
+ f2 = (U8 *) p2;
+ n2 = UTF8SKIP(f2);
+ }
+ else {
+ foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+ foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+ f2 = foldbuf2;
+ n2 = 2;
+ }
}
else {
if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {