static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/* Be sure to synchronize this message with the similar one in regcomp.c */
-static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is not allowed; the"
- " permissible max is 0x%" UVXf;
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
static void
S_restore_cop_warnings(pTHX_ void *p)
{
- if (!specialWARN(PL_curcop->cop_warnings))
- PerlMemShared_free(PL_curcop->cop_warnings);
- PL_curcop->cop_warnings = (STRLEN*)p;
+ free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
}
if (UNLIKELY( uv > MAX_LEGAL_CP
&& ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
{
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
=item C<UTF8_GOT_CONTINUATION>
-The input sequence was malformed in that the first byte was a a UTF-8
+The input sequence was malformed in that the first byte was a UTF-8
continuation byte.
=item C<UTF8_GOT_EMPTY>
* things. For example, an input could be deliberately designed to
* overflow, and if this code bailed out immediately upon discovering that,
* returning to the caller C<*retlen> pointing to the very next byte (one
- * which is actually part of of the overflowing sequence), that could look
+ * which is actually part of the overflowing sequence), that could look
* legitimate to the caller, which could discard the initial partial
* sequence and process the rest, inappropriately.
*
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_LEGAL_CP);
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
* ustrp will contain *lenp bytes
*
* Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
- * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
* DOTLESS I */
PERL_ARGS_ASSERT_TURKIC_UC;
return result;
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used. */
UV
}
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used.
*/
return 1;
}
-/* XXX The next two functions should likely be moved to mathoms.c once all
- * occurrences of them are removed from the core; some cpan-upstream modules
- * still use them */
-
-U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
-
- return uvoffuni_to_utf8_flags(d, uv, 0);
-}
-
-/*
-=for apidoc utf8n_to_uvuni
-
-Instead use L<perlapi/utf8_to_uvchr_buf>, or rarely, L<perlapi/utf8n_to_uvchr>.
-
-This function was useful for code that wanted to handle both EBCDIC and
-ASCII platforms with Unicode properties, but starting in Perl v5.20, the
-distinctions between the platforms have mostly been made invisible to most
-code, so this function is quite unlikely to be what you want. If you do need
-this precise functionality, use instead
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>>
-or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|perlapi/utf8n_to_uvchr>>.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
-{
- PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-
- return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
-}
-
-/*
-=for apidoc uvuni_to_utf8_flags
-
-Instead you almost certainly want to use L<perlapi/uvchr_to_utf8> or
-L<perlapi/uvchr_to_utf8_flags>.
-
-This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
-which itself, while not deprecated, should be used only in isolated
-circumstances. These functions were useful for code that wanted to handle
-both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
-v5.20, the distinctions between the platforms have mostly been made invisible
-to most code, so this function is quite unlikely to be what you want.
-
-=cut
-*/
-
-U8 *
-Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
-
- return uvoffuni_to_utf8_flags(d, uv, flags);
-}
-
-/*
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
- PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
-
- /* This function is unsafe if malformed UTF-8 input is given it, which is
- * why the function is deprecated. If the first byte of the input
- * indicates that there are more bytes remaining in the sequence that forms
- * the character than there are in the input buffer, it can read past the
- * end. But we can make it safe if the input string happens to be
- * NUL-terminated, as many strings in Perl are, by refusing to read past a
- * NUL, which is what UTF8_CHK_SKIP() does. A NUL indicates the start of
- * the next character anyway. If the input isn't NUL-terminated, the
- * function remains unsafe, as it always has been. */
-
- return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
-}
-
/*
* ex: set ts=8 sts=4 sw=4 et:
*/