#include "EXTERN.h"
#define PERL_IN_UTF8_C
#include "perl.h"
-#include "inline_invlist.c"
+#include "invlist_inline.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
(as opposed to native) code point. Only in very rare circumstances should code
not be using the native code point.
-For details, see the description for L</uvchr_to_utf8_flags>>.
+For details, see the description for L</uvchr_to_utf8_flags>.
=cut
*/
else if (UNICODE_IS_NONCHAR(uv)) {
if (flags & UNICODE_WARN_NONCHAR) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
- "Unicode non-character U+%04"UVXf" is illegal for open interchange",
+ "Unicode non-character U+%04"UVXf" is not recommended for open interchange",
uv);
}
if (flags & UNICODE_DISALLOW_NONCHAR) {
=for apidoc uvchr_to_utf8
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
=for apidoc uvchr_to_utf8_flags
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
&& ckWARN_d(WARN_NONCHAR))
{
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv));
+ sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
pack_warn = packWARN(WARN_NONCHAR);
}
if (flags & UTF8_DISALLOW_NONCHAR) {
case MICRO_SIGN:
converted = GREEK_CAPITAL_LETTER_MU;
break;
+#if UNICODE_MAJOR_VERSION > 2 \
+ || (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
+ && UNICODE_DOT_DOT_VERSION >= 8)
case LATIN_SMALL_LETTER_SHARP_S:
*(p)++ = 'S';
*p = S_or_s;
*lenp = 2;
return 'S';
+#endif
default:
Perl_croak(aTHX_ "panic: to_upper_title_latin1 did not expect '%c' to map to '%c'", c, LATIN_SMALL_LETTER_Y_WITH_DIAERESIS);
NOT_REACHED; /* NOTREACHED */
if (c == MICRO_SIGN) {
converted = GREEK_SMALL_LETTER_MU;
}
+#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
+ || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
+ || UNICODE_DOT_DOT_VERSION > 0)
else if ((flags & FOLD_FLAGS_FULL) && c == LATIN_SMALL_LETTER_SHARP_S) {
/* If can't cross 127/128 boundary, can't return "ss"; instead return
return 's';
}
}
+#endif
else { /* In this range the fold of all other characters is their lower
case */
converted = toLOWER_LATIN1(c);
if (flags & FOLD_FLAGS_LOCALE) {
-# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
# define LONG_S_T LATIN_SMALL_LIGATURE_LONG_S_T_UTF8
+ const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
+
+# ifdef LATIN_CAPITAL_LETTER_SHARP_S_UTF8
+# define CAP_SHARP_S LATIN_CAPITAL_LETTER_SHARP_S_UTF8
const unsigned int cap_sharp_s_len = sizeof(CAP_SHARP_S) - 1;
- const unsigned int long_s_t_len = sizeof(LONG_S_T) - 1;
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
"resolved to \"\\x{17F}\\x{17F}\".");
goto return_long_s;
}
- else if (UTF8SKIP(p) == long_s_t_len
+ else
+#endif
+ if (UTF8SKIP(p) == long_s_t_len
&& memEQ((char *) p, LONG_S_T, long_s_t_len))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
"resolved to \"\\x{FB06}\".");
goto return_ligature_st;
}
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+# define DOTTED_I LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8
+
+ /* And special case this on this Unicode version only, for the same
+ * reaons the other two are special cased. They would cross the
+ * 255/256 boundary which is forbidden under /l, and so the code
+ * wouldn't catch that they are equivalent (which they are only in
+ * this release) */
+ else if (UTF8SKIP(p) == sizeof(DOTTED_I) - 1
+ && memEQ((char *) p, DOTTED_I, sizeof(DOTTED_I) - 1))
+ {
+ /* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ "Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
+ "resolved to \"\\x{0131}\".");
+ goto return_dotless_i;
+ }
+#endif
+
return check_locale_boundary_crossing(p, result, ustrp, lenp);
}
else if (! (flags & FOLD_FLAGS_NOMIX_ASCII)) {
/* But in these instances, there is an alternative we can
* return that is valid */
- if (original == LATIN_CAPITAL_LETTER_SHARP_S
- || original == LATIN_SMALL_LETTER_SHARP_S)
- {
+ if (original == LATIN_SMALL_LETTER_SHARP_S
+#ifdef LATIN_CAPITAL_LETTER_SHARP_S /* not defined in early Unicode releases */
+ || original == LATIN_CAPITAL_LETTER_SHARP_S
+#endif
+ ) {
goto return_long_s;
}
else if (original == LATIN_SMALL_LIGATURE_LONG_S_T) {
goto return_ligature_st;
}
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ else if (original == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
+ goto return_dotless_i;
+ }
+#endif
Copy(p, ustrp, *lenp, char);
return original;
}
*lenp = sizeof(LATIN_SMALL_LIGATURE_ST_UTF8) - 1;
Copy(LATIN_SMALL_LIGATURE_ST_UTF8, ustrp, *lenp, U8);
return LATIN_SMALL_LIGATURE_ST;
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ return_dotless_i:
+ *lenp = sizeof(LATIN_SMALL_LETTER_DOTLESS_I_UTF8) - 1;
+ Copy(LATIN_SMALL_LETTER_DOTLESS_I_UTF8, ustrp, *lenp, U8);
+ return LATIN_SMALL_LETTER_DOTLESS_I;
+
+#endif
+
}
/* Note:
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
SAVEHINTS();
+ save_re_context();
/* We might get here via a subroutine signature which uses a utf8
* parameter name, at which point PL_subname will have been set
* but not yet used. */
#ifndef NO_TAINT_SUPPORT
/* It is assumed that callers of this routine are not passing in
* any user derived data. */
+ /* Need to do this after save_re_context() as it will set
+ * PL_tainted to 1 while saving $1 etc (see the code after getrx:
+ * in Perl_magic_get). Even line to create errsv_save can turn on
+ * PL_tainted. */
SAVEBOOL(TAINT_get);
TAINT_NOT;
#endif
* currently handle. But it also means that FB05 and FB06 are equivalent in
* a 1-1 mapping which we should handle, and this relationship may not be in
* the main table. Therefore this function examines all the multi-char
- * sequences and adds the 1-1 mappings that come out of that. */
+ * sequences and adds the 1-1 mappings that come out of that.
+ *
+ * XXX This function was originally intended to be multipurpose, but its
+ * only use is quite likely to remain for constructing the inversion of
+ * the CaseFolding (//i) property. If it were more general purpose for
+ * regex patterns, it would have to do the FB05/FB06 game for simple folds,
+ * because certain folds are prohibited under /iaa and /il. As an example,
+ * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
+ * equivalent under /i. But under /iaa and /il, the folds to 'i' are
+ * prohibited, so we would not figure out that they fold to each other.
+ * Code could be written to automatically figure this out, similar to the
+ * code that does this for multi-character folds, but this is the only case
+ * where something like this is ever likely to happen, as all the single
+ * char folds to The 0-255 range are now quite settled. Instead there is a
+ * little special code that is compiled only for this Unicode version. This
+ * is smaller and didn't require much coding time to do. But this makes
+ * this routine strongly tied to being used just for CaseFolding. If ever
+ * it should be generalized, this would have to be fixed */
U8 *l, *lend;
STRLEN lcur;
} /* End of specials */
/* read $swash->{LIST} */
+
+#if UNICODE_MAJOR_VERSION == 3 \
+ && UNICODE_DOT_VERSION == 0 \
+ && UNICODE_DOT_DOT_VERSION == 1
+
+ /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
+ * rule so that things work under /iaa and /il */
+
+ SV * mod_listsv = sv_mortalcopy(*listsvp);
+ sv_catpv(mod_listsv, "130\t130\t131\n");
+ l = (U8*)SvPV(mod_listsv, lcur);
+
+#else
+
l = (U8*)SvPV(*listsvp, lcur);
+
+#endif
+
lend = l + lcur;
/* Go through each input line */
/* The first number is a count of the rest */
l++;
- elements = grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
+ }
if (elements == 0) {
invlist = _new_invlist(0);
}
/* Get the 0th element, which is needed to setup the inversion list */
while (isSPACE(*l)) l++;
- element0 = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
+ }
l = (U8 *) after_atou;
invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
elements--;
Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
}
while (isSPACE(*l)) l++;
- *other_elements_ptr++ = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
+ }
l = (U8 *) after_atou;
}
}
}
if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
STRLEN char_len;
- if (UTF8_IS_SUPER(s)) {
+ if (UTF8_IS_SUPER(s, e)) {
if (ckWARN_d(WARN_NON_UNICODE)) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
ok = FALSE;
}
}
- else if (UTF8_IS_SURROGATE(s)) {
+ else if (UTF8_IS_SURROGATE(s, e)) {
if (ckWARN_d(WARN_SURROGATE)) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_SURROGATE),
ok = FALSE;
}
}
- else if
- ((UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s))
- && (ckWARN_d(WARN_NONCHAR)))
- {
+ else if ((UTF8_IS_NONCHAR(s, e)) && (ckWARN_d(WARN_NONCHAR))) {
UV uv = utf8_to_uvchr_buf(s, e, &char_len);
Perl_warner(aTHX_ packWARN(WARN_NONCHAR),
- "Unicode non-character U+%04"UVXf" is illegal for open interchange", uv);
+ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv);
ok = FALSE;
}
}
The pointer to the PV of the C<dsv> is returned.
+See also L</sv_uni_display>.
+
=cut */
char *
Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
=for apidoc uvuni_to_utf8_flags
Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+L</uvchr_to_utf8_flags>.
This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
which itself, while not deprecated, should be used only in isolated
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/