#include "EXTERN.h"
#define PERL_IN_UTF8_C
#include "perl.h"
-#include "inline_invlist.c"
-#include "charclass_invlists.h"
+#include "invlist_inline.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
*/
/*
-=for apidoc is_ascii_string
+=for apidoc is_invariant_string
-Returns true if the first C<len> bytes of the string C<s> are the same whether
-or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
-is, if they are invariant. On ASCII-ish machines, only ASCII characters
-fit this definition, hence the function's name.
+Returns true iff the first C<len> bytes of the string C<s> are the same
+regardless of the UTF-8 encoding of the string (or UTF-EBCDIC encoding on
+EBCDIC machines). That is, if they are UTF-8 invariant. On ASCII-ish
+machines, all the ASCII characters and only the ASCII characters fit this
+definition. On EBCDIC machines, the ASCII-range characters are invariant, but
+so also are the C1 controls and C<\c?> (which isn't in the ASCII range on
+EBCDIC).
If C<len> is 0, it will be calculated using C<strlen(s)>, (which means if you
use this option, that C<s> can't have embedded C<NUL> characters and has to
*/
bool
-Perl_is_ascii_string(const U8 *s, STRLEN len)
+Perl_is_invariant_string(const U8 *s, STRLEN len)
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
- PERL_ARGS_ASSERT_IS_ASCII_STRING;
+ PERL_ARGS_ASSERT_IS_INVARIANT_STRING;
for (; x < send; ++x) {
if (!UTF8_IS_INVARIANT(*x))
(as opposed to native) code point. Only in very rare circumstances should code
not be using the native code point.
-For details, see the description for L</uvchr_to_utf8_flags>>.
+For details, see the description for L</uvchr_to_utf8_flags>.
=cut
*/
{
#ifdef EBCDIC
Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
- assert(0);
+ NOT_REACHED; /* NOTREACHED */
#endif
return NULL;
}
else if (UNICODE_IS_NONCHAR(uv)) {
if (flags & UNICODE_WARN_NONCHAR) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
- "Unicode non-character U+%04"UVXf" is illegal for open interchange",
+ "Unicode non-character U+%04"UVXf" is not recommended for open interchange",
uv);
}
if (flags & UNICODE_DISALLOW_NONCHAR) {
=for apidoc uvchr_to_utf8
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
=for apidoc uvchr_to_utf8_flags
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
}
/*
-=for apidoc is_utf8_char_buf
-
-This is identical to the macro L</isUTF8_CHAR>.
-
-=cut */
-
-STRLEN
-Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
-{
-
- PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
-
- return isUTF8_CHAR(buf, buf_end);
-}
-
-/*
=for apidoc is_utf8_string
Returns true if the first C<len> bytes of string C<s> form a valid
embedded C<NUL> characters and has to have a terminating C<NUL> byte). Note
that all characters being ASCII constitute 'a valid UTF-8 string'.
-See also L</is_ascii_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
+See also L</is_invariant_string>(), L</is_utf8_string_loclen>(), and L</is_utf8_string_loc>().
=cut
*/
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
pack_warn = packWARN(WARN_NONCHAR);
}
if (flags & UTF8_DISALLOW_NONCHAR) {
* is the label <malformed>.
*/
-malformed:
+ malformed:
if (sv && ckWARN_d(WARN_UTF8)) {
pack_warn = packWARN(WARN_UTF8);
}
-disallowed:
+ disallowed:
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
return 0;
}
-do_warn:
+ do_warn:
if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
if warnings are to be raised. */
#define LAST_HIGH_SURROGATE 0xDBFF
#define FIRST_LOW_SURROGATE 0xDC00
#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
- if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
- if (p >= pend) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
- } else {
+
+ /* This assumes that most uses will be in the first Unicode plane, not
+ * needing surrogates */
+ if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
+ && uv <= UNICODE_SURROGATE_LAST))
+ {
+ if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
+ Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ else {
UV low = (p[0] << 8) + p[1];
- p += 2;
- if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
+ if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
+ || UNLIKELY(low > LAST_LOW_SURROGATE))
+ {
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ p += 2;
uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
+ (low - FIRST_LOW_SURROGATE) + 0x10000;
}
- } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
#ifdef EBCDIC
d = uvoffuni_to_utf8_flags(d, uv, 0);
case MICRO_SIGN:
converted = GREEK_CAPITAL_LETTER_MU;
break;
+#if UNICODE_MAJOR_VERSION > 2 \
+ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
+ && UNICODE_DOT_DOT_VERSION >= 8)
case LATIN_SMALL_LETTER_SHARP_S:
*(p)++ = 'S';
*p = S_or_s;
*lenp = 2;
return 'S';
+#endif
default:
Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
if (c == MICRO_SIGN) {
converted = GREEK_SMALL_LETTER_MU;
}
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
/* If can't cross 127/128 boundary, can't return "ss"; instead return
return 's';
}
}
+#endif
else { /* In this range the fold of all other characters is their lower
case */
converted = toLOWER_LATIN1(c);
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
- /* Tread a UTF-8 locale as not being in locale at all */
- if (IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLD_FLAGS_LOCALE;
+ if (flags & FOLD_FLAGS_LOCALE) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags &= ~FOLD_FLAGS_LOCALE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ goto needs_full_generality;
+ }
}
if (c < 256) {
- UV result = _to_fold_latin1((U8) c, p, lenp,
+ return _to_fold_latin1((U8) c, p, lenp,
flags & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII));
- /* It is illegal for the fold to cross the 255/256 boundary under
- * locale; in this case return the original */
- return (result > 256 && flags & FOLD_FLAGS_LOCALE)
- ? c
- : result;
}
- /* If no special needs, just use the macro */
+ /* Here, above 255. If no special needs, just use the macro */
if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
uvchr_to_utf8(p, c);
return CALL_FOLD_CASE(p, p, lenp, flags & FOLD_FLAGS_FULL);
else { /* Otherwise, _to_utf8_fold_flags has the intelligence to deal with
the special flags. */
U8 utf8_c[UTF8_MAXBYTES + 1];
+
+ needs_full_generality:
uvchr_to_utf8(utf8_c, c);
return _to_utf8_fold_flags(utf8_c, p, lenp, flags);
}
}
STATIC UV
-S_check_locale_boundary_crossing(pTHX_ const char * const func_name, const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
+S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result, U8* const ustrp, STRLEN *lenp)
{
/* This is called when changing the case of a utf8-encoded character above
* the Latin1 range, and the operation is in a non-UTF-8 locale. If the
s += UTF8SKIP(s);
}
- /* Here, no characters crossed, result is ok as-is */
+ /* Here, no characters crossed, result is ok as-is, but we warn. */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
return result;
}
-bad_crossing:
+ bad_crossing:
/* Failed, have to return the original */
original = valid_utf8_to_uvchr(p, lenp);
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"Can't do %s(\"\\x{%"UVXf"}\") on non-UTF-8 locale; "
"resolved to \"\\x{%"UVXf"}\".",
- func_name,
+ OP_DESC(PL_op),
original,
original);
Copy(p, ustrp, *lenp, char);
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
result = CALL_UPPER_CASE(p, ustrp, lenp);
if (flags) {
- result = check_locale_boundary_crossing("uc", p, result, ustrp, lenp);
+ result = check_locale_boundary_crossing(p, result, ustrp, lenp);
}
return result;
}
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
result = CALL_TITLE_CASE(p, ustrp, lenp);
if (flags) {
- result = check_locale_boundary_crossing("ucfirst", p, result, ustrp, lenp);
+ result = check_locale_boundary_crossing(p, result, ustrp, lenp);
}
return result;
}
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
- if (flags && IN_UTF8_CTYPE_LOCALE) {
- flags = FALSE;
+ if (flags) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags = FALSE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
result = CALL_LOWER_CASE(p, ustrp, lenp);
if (flags) {
- result = check_locale_boundary_crossing("lc", p, result, ustrp, lenp);
+ result = check_locale_boundary_crossing(p, result, ustrp, lenp);
}
return result;
assert(p != ustrp); /* Otherwise overwrites */
- if (flags & FOLD_FLAGS_LOCALE && IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLD_FLAGS_LOCALE;
+ if (flags & FOLD_FLAGS_LOCALE) {
+ /* Treat a UTF-8 locale as not being in locale at all */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags &= ~FOLD_FLAGS_LOCALE;
+ }
+ else {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
}
if (UTF8_IS_INVARIANT(*p)) {
if (flags & FOLD_FLAGS_LOCALE) {
+# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
+ const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
+
+# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+
+ const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
+
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
- if (UTF8SKIP(p) == sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1
- && memEQ((char *) p, LATIN_CAPITAL_LETTER_SHARP_S_UTF8,
- sizeof(LATIN_CAPITAL_LETTER_SHARP_S_UTF8) - 1))
+ if (UTF8SKIP(p) == cap_sharp_s_len
+ && memEQ((char *) p, CAP_SHARP_S, cap_sharp_s_len))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"resolved to \"\\x{17F}\\x{17F}\".");
goto return_long_s;
}
- else if (UTF8SKIP(p) == sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1
- && memEQ((char *) p, LATIN_SMALL_LIGATURE_LONG_S_T_UTF8,
- sizeof(LATIN_SMALL_LIGATURE_LONG_S_T_UTF8) - 1))
+ else
+#endif
+ if (UTF8SKIP(p) == long_s_t_len
+ && memEQ((char *) p, LONG_S_T, long_s_t_len))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"resolved to \"\\x{FB06}\".");
goto return_ligature_st;
}
- return check_locale_boundary_crossing("fc", p, result, ustrp, lenp);
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
+
+ /* And special case this on this Unicode version only, for the same
+ * reaons the other two are special cased. They would cross the
+ * 255/256 boundary which is forbidden under /l, and so the code
+ * wouldn't catch that they are equivalent (which they are only in
+ * this release) */
+ else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
+ && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
+ {
+ /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
+ "resolved to \"\\x{0131}\".");
+ goto return_dotless_i;
+ }
+#endif
+
+ return check_locale_boundary_crossing(p, result, ustrp, lenp);
}
else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
return result;
/* But in these instances, there is an alternative we can
* return that is valid */
- if (original == LATIN_CAPITAL_LETTER_SHARP_S
- || original == LATIN_SMALL_LETTER_SHARP_S)
- {
+ if (original == LATIN_SMALL_LETTER_SHARP_S
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+ || original == LATIN_CAPITAL_LETTER_SHARP_S
+#endif
+ ) {
goto return_long_s;
}
else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
goto return_ligature_st;
}
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ goto return_dotless_i;
+ }
+#endif
Copy(p, ustrp, *lenp, char);
return original;
}
*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
return LATIN_SMALL_LIGATURE_ST;
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ return_dotless_i:
+ *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
+ Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
+ return LATIN_SMALL_LETTER_DOTLESS_I;
+
+#endif
+
}
/* Note:
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
SAVEHINTS();
+ save_re_context();
/* We might get here via a subroutine signature which uses a utf8
* parameter name, at which point PL_subname will have been set
* but not yet used. */
#ifndef NO_TAINT_SUPPORT
/* It is assumed that callers of this routine are not passing in
* any user derived data. */
+ /* Need to do this after save_re_context() as it will set
+ * PL_tainted to 1 while saving $1 etc (see the code after getrx:
+ * in Perl_magic_get). Even line to create errsv_save can turn on
+ * PL_tainted. */
SAVEBOOL(TAINT_get);
TAINT_NOT;
#endif
* currently handle. But it also means that FB05 and FB06 are equivalent in
* a 1-1 mapping which we should handle, and this relationship may not be in
* the main table. Therefore this function examines all the multi-char
- * sequences and adds the 1-1 mappings that come out of that. */
+ * sequences and adds the 1-1 mappings that come out of that.
+ *
+ * XXX This function was originally intended to be multipurpose, but its
+ * only use is quite likely to remain for constructing the inversion of
+ * the CaseFolding (//i) property. If it were more general purpose for
+ * regex patterns, it would have to do the FB05/FB06 game for simple folds,
+ * because certain folds are prohibited under /iaa and /il. As an example,
+ * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
+ * equivalent under /i. But under /iaa and /il, the folds to 'i' are
+ * prohibited, so we would not figure out that they fold to each other.
+ * Code could be written to automatically figure this out, similar to the
+ * code that does this for multi-character folds, but this is the only case
+ * where something like this is ever likely to happen, as all the single
+ * char folds to The 0-255 range are now quite settled. Instead there is a
+ * little special code that is compiled only for this Unicode version. This
+ * is smaller and didn't require much coding time to do. But this makes
+ * this routine strongly tied to being used just for CaseFolding. If ever
+ * it should be generalized, this would have to be fixed */
U8 *l, *lend;
STRLEN lcur;
} /* End of specials */
/* read $swash->{LIST} */
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
+ * rule so that things work under /iaa and /il */
+
+ SV * mod_listsv = sv_mortalcopy(*listsvp);
+ sv_catpv(mod_listsv, "130\t130\t131\n");
+ l = (U8*)SvPV(mod_listsv, lcur);
+
+#else
+
l = (U8*)SvPV(*listsvp, lcur);
+
+#endif
+
lend = l + lcur;
/* Go through each input line */
/* The first number is a count of the rest */
l++;
- elements = grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
+ }
if (elements == 0) {
invlist = _new_invlist(0);
}
/* Get the 0th element, which is needed to setup the inversion list */
while (isSPACE(*l)) l++;
- element0 = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
+ }
l = (U8 *) after_atou;
invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
elements--;
Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
}
while (isSPACE(*l)) l++;
- *other_elements_ptr++ = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
+ }
l = (U8 *) after_atou;
}
}
}
if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
STRLEN char_len;
- if (UTF8_IS_SUPER(s)) {
+ if (UTF8_IS_SUPER(s, e)) {
if (ckWARN_d(WARN_NON_UNICODE)) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
ok = FALSE;
}
}
- else if (UTF8_IS_SURROGATE(s)) {
+ else if (UTF8_IS_SURROGATE(s, e)) {
if (ckWARN_d(WARN_SURROGATE)) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
ok = FALSE;
}
}
- else if
- ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
- && (ckWARN_d(WARN_NONCHAR)))
- {
+ else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
- "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv);
ok = FALSE;
}
}
The pointer to the PV of the C<dsv> is returned.
+See also L</sv_uni_display>.
+
=cut */
char *
Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
* FOLDEQ_LOCALE is set iff the rules from the current underlying
* locale are to be used.
* FOLDEQ_S1_ALREADY_FOLDED s1 has already been folded before calling this
- * routine. This allows that step to be skipped.
+ * routine. This allows that step to be skipped.
+ * Currently, this requires s1 to be encoded as UTF-8
+ * (u1 must be true), which is asserted for.
+ * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
+ * cross certain boundaries. Hence, the caller should
+ * let this function do the folding instead of
+ * pre-folding. This code contains an assertion to
+ * that effect. However, if the caller knows what
+ * it's doing, it can pass this flag to indicate that,
+ * and the assertion is skipped.
* FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_FOLDS_SANE
*/
I32
Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
+ U8 flags_for_folder = FOLD_FLAGS_FULL;
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+ && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
* and /iaa matches are most likely to involve code points 0-255, and this
* function only under rare conditions gets called for 0-255. */
- if (IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLDEQ_LOCALE;
+ if (flags & FOLDEQ_LOCALE) {
+ if (IN_UTF8_CTYPE_LOCALE) {
+ flags &= ~FOLDEQ_LOCALE;
+ }
+ else {
+ flags_for_folder |= FOLD_FLAGS_LOCALE;
+ }
}
if (pe1) {
while (p1 < e1 && p2 < e2) {
/* If at the beginning of a new character in s1, get its fold to use
- * and the length of the fold. (exception: locale rules just get the
- * character to a single byte) */
+ * and the length of the fold. */
if (n1 == 0) {
if (flags & FOLDEQ_S1_ALREADY_FOLDED) {
f1 = (U8 *) p1;
+ assert(u1);
n1 = UTF8SKIP(f1);
}
else {
- /* If in locale matching, we use two sets of rules, depending
- * on if the code point is above or below 255. Here, we test
- * for and handle locale rules */
- if ((flags & FOLDEQ_LOCALE)
- && (! u1 || ! UTF8_IS_ABOVE_LATIN1(*p1)))
- {
- /* There is no mixing of code points above and below 255. */
- if (u2 && UTF8_IS_ABOVE_LATIN1(*p2)) {
- return 0;
- }
-
- /* We handle locale rules by converting, if necessary, the
- * code point to a single byte. */
- if (! u1 || UTF8_IS_INVARIANT(*p1)) {
- *foldbuf1 = *p1;
- }
- else {
- *foldbuf1 = TWO_BYTE_UTF8_TO_NATIVE(*p1, *(p1 + 1));
- }
- n1 = 1;
- }
- else if (isASCII(*p1)) { /* Note, that here won't be both
- ASCII and using locale rules */
-
- /* If trying to mix non- with ASCII, and not supposed to,
- * fail */
- if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
- return 0;
- }
- n1 = 1;
- *foldbuf1 = toFOLD(*p1);
- }
- else if (u1) {
- to_utf8_fold(p1, foldbuf1, &n1);
- }
- else { /* Not utf8, get utf8 fold */
- to_uni_fold(*p1, foldbuf1, &n1);
- }
- f1 = foldbuf1;
- }
+ if (isASCII(*p1) && ! (flags & FOLDEQ_LOCALE)) {
+
+ /* We have to forbid mixing ASCII with non-ASCII if the
+ * flags so indicate. And, we can short circuit having to
+ * call the general functions for this common ASCII case,
+ * all of whose non-locale folds are also ASCII, and hence
+ * UTF-8 invariants, so the UTF8ness of the strings is not
+ * relevant. */
+ if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p2)) {
+ return 0;
+ }
+ n1 = 1;
+ *foldbuf1 = toFOLD(*p1);
+ }
+ else if (u1) {
+ _to_utf8_fold_flags(p1, foldbuf1, &n1, flags_for_folder);
+ }
+ else { /* Not utf8, get utf8 fold */
+ _to_uni_fold_flags(*p1, foldbuf1, &n1, flags_for_folder);
+ }
+ f1 = foldbuf1;
+ }
}
if (n2 == 0) { /* Same for s2 */
if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
f2 = (U8 *) p2;
+ assert(u2);
n2 = UTF8SKIP(f2);
}
else {
- if ((flags & FOLDEQ_LOCALE)
- && (! u2 || ! UTF8_IS_ABOVE_LATIN1(*p2)))
- {
- /* Here, the next char in s2 is < 256. We've already
- * worked on s1, and if it isn't also < 256, can't match */
- if (u1 && UTF8_IS_ABOVE_LATIN1(*p1)) {
- return 0;
- }
- if (! u2 || UTF8_IS_INVARIANT(*p2)) {
- *foldbuf2 = *p2;
- }
- else {
- *foldbuf2 = TWO_BYTE_UTF8_TO_NATIVE(*p2, *(p2 + 1));
- }
-
- /* Use another function to handle locale rules. We've made
- * sure that both characters to compare are single bytes */
- if (! foldEQ_locale((char *) f1, (char *) foldbuf2, 1)) {
- return 0;
- }
- n1 = n2 = 0;
- }
- else if (isASCII(*p2)) {
- if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
- return 0;
- }
- n2 = 1;
- *foldbuf2 = toFOLD(*p2);
- }
- else if (u2) {
- to_utf8_fold(p2, foldbuf2, &n2);
- }
- else {
- to_uni_fold(*p2, foldbuf2, &n2);
- }
- f2 = foldbuf2;
+ if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
+ if ((flags & FOLDEQ_UTF8_NOMIX_ASCII) && ! isASCII(*p1)) {
+ return 0;
+ }
+ n2 = 1;
+ *foldbuf2 = toFOLD(*p2);
+ }
+ else if (u2) {
+ _to_utf8_fold_flags(p2, foldbuf2, &n2, flags_for_folder);
+ }
+ else {
+ _to_uni_fold_flags(*p2, foldbuf2, &n2, flags_for_folder);
+ }
+ f2 = foldbuf2;
}
}
=for apidoc uvuni_to_utf8_flags
Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+L</uvchr_to_utf8_flags>.
This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
which itself, while not deprecated, should be used only in isolated
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/