#define PERL_IN_UTF8_C
#include "perl.h"
#include "inline_invlist.c"
-#include "charclass_invlists.h"
static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
(as opposed to native) code point. Only in very rare circumstances should code
not be using the native code point.
-For details, see the description for L</uvchr_to_utf8_flags>>.
+For details, see the description for L</uvchr_to_utf8_flags>.
=cut
*/
{
#ifdef EBCDIC
Perl_die(aTHX_ "Can't represent character for Ox%"UVXf" on this platform", uv);
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
#endif
return NULL;
}
=for apidoc uvchr_to_utf8
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
=for apidoc uvchr_to_utf8_flags
Adds the UTF-8 representation of the native code point C<uv> to the end
-of the string C<d>; C<d> should have at least C<UNISKIP(uv)+1> (up to
+of the string C<d>; C<d> should have at least C<UVCHR_SKIP(uv)+1> (up to
C<UTF8_MAXBYTES+1>) free bytes available. The return value is the pointer to
the byte after the end of the new character. In other words,
}
/*
-=for apidoc is_utf8_char_buf
-
-This is identical to the macro L</isUTF8_CHAR>.
-
-=cut */
-
-STRLEN
-Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
-{
-
- PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
-
- return isUTF8_CHAR(buf, buf_end);
-}
-
-/*
=for apidoc is_utf8_string
Returns true if the first C<len> bytes of string C<s> form a valid
* is the label <malformed>.
*/
-malformed:
+ malformed:
if (sv && ckWARN_d(WARN_UTF8)) {
pack_warn = packWARN(WARN_UTF8);
}
-disallowed:
+ disallowed:
if (flags & UTF8_CHECK_ONLY) {
if (retlen)
return 0;
}
-do_warn:
+ do_warn:
if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
if warnings are to be raised. */
#define LAST_HIGH_SURROGATE 0xDBFF
#define FIRST_LOW_SURROGATE 0xDC00
#define LAST_LOW_SURROGATE UNICODE_SURROGATE_LAST
- if (uv >= FIRST_HIGH_SURROGATE && uv <= LAST_HIGH_SURROGATE) {
- if (p >= pend) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
- } else {
+
+ /* This assumes that most uses will be in the first Unicode plane, not
+ * needing surrogates */
+ if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
+ && uv <= UNICODE_SURROGATE_LAST))
+ {
+ if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
+ Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ else {
UV low = (p[0] << 8) + p[1];
- p += 2;
- if (low < FIRST_LOW_SURROGATE || low > LAST_LOW_SURROGATE)
+ if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
+ || UNLIKELY(low > LAST_LOW_SURROGATE))
+ {
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
+ }
+ p += 2;
uv = ((uv - FIRST_HIGH_SURROGATE) << 10)
+ (low - FIRST_LOW_SURROGATE) + 0x10000;
}
- } else if (uv >= FIRST_LOW_SURROGATE && uv <= LAST_LOW_SURROGATE) {
- Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
#ifdef EBCDIC
d = uvoffuni_to_utf8_flags(d, uv, 0);
s += UTF8SKIP(s);
}
- /* Here, no characters crossed, result is ok as-is */
+ /* Here, no characters crossed, result is ok as-is, but we warn. */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(p, p + UTF8SKIP(p));
return result;
}
-bad_crossing:
+ bad_crossing:
/* Failed, have to return the original */
original = valid_utf8_to_uvchr(p, lenp);
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
SAVEHINTS();
+ save_re_context();
/* We might get here via a subroutine signature which uses a utf8
* parameter name, at which point PL_subname will have been set
* but not yet used. */
#ifndef NO_TAINT_SUPPORT
/* It is assumed that callers of this routine are not passing in
* any user derived data. */
+ /* Need to do this after save_re_context() as it will set
+ * PL_tainted to 1 while saving $1 etc (see the code after getrx:
+ * in Perl_magic_get). Even line to create errsv_save can turn on
+ * PL_tainted. */
SAVEBOOL(TAINT_get);
TAINT_NOT;
#endif
/* The first number is a count of the rest */
l++;
- elements = grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid count of elements at start of inversion list");
+ }
if (elements == 0) {
invlist = _new_invlist(0);
}
/* Get the 0th element, which is needed to setup the inversion list */
while (isSPACE(*l)) l++;
- element0 = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid 0th element for inversion list");
+ }
l = (U8 *) after_atou;
invlist = _setup_canned_invlist(elements, element0, &other_elements_ptr);
elements--;
Perl_croak(aTHX_ "panic: Expecting %"UVuf" more elements than available", elements);
}
while (isSPACE(*l)) l++;
- *other_elements_ptr++ = (UV) grok_atou((const char *)l, &after_atou);
+ if (!grok_atoUV((const char *)l, other_elements_ptr++, &after_atou)) {
+ Perl_croak(aTHX_ "panic: Expecting a valid element in inversion list");
+ }
l = (U8 *) after_atou;
}
}
The pointer to the PV of the C<dsv> is returned.
+See also L</sv_uni_display>.
+
=cut */
char *
Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
* routine. This allows that step to be skipped.
* Currently, this requires s1 to be encoded as UTF-8
* (u1 must be true), which is asserted for.
+ * FOLDEQ_S1_FOLDS_SANE With either NOMIX_ASCII or LOCALE, no folds may
+ * cross certain boundaries. Hence, the caller should
+ * let this function do the folding instead of
+ * pre-folding. This code contains an assertion to
+ * that effect. However, if the caller knows what
+ * it's doing, it can pass this flag to indicate that,
+ * and the assertion is skipped.
* FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_FOLDS_SANE
*/
I32
Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, UV l1, bool u1, const char *s2, char **pe2, UV l2, bool u2, U32 flags)
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (flags & (FOLDEQ_S1_ALREADY_FOLDED | FOLDEQ_S2_ALREADY_FOLDED))));
+ && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
=for apidoc uvuni_to_utf8_flags
Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+L</uvchr_to_utf8_flags>.
This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
which itself, while not deprecated, should be used only in isolated
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/