static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/* Be sure to synchronize this message with the similar one in regcomp.c */
-static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is not allowed; the"
- " permissible max is 0x%" UVXf;
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
static void
S_restore_cop_warnings(pTHX_ void *p)
{
- if (!specialWARN(PL_curcop->cop_warnings))
- PerlMemShared_free(PL_curcop->cop_warnings);
- PL_curcop->cop_warnings = (STRLEN*)p;
+ free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
}
" is not recommended for open interchange";
const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
" may not be portable";
-const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
- " Unicode, requires a Perl extension," \
- " and so is not portable";
#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
STMT_START { \
if (UNLIKELY( uv > MAX_LEGAL_CP
&& ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
{
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
/* Choose the more dire applicable warning */
if (UNICODE_IS_PERL_EXTENDED(uv)) {
- format = perl_extended_cp_format;
+ format = PL_extended_cp_format;
+ category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
if (flags & (UNICODE_WARN_PERL_EXTENDED
|UNICODE_DISALLOW_PERL_EXTENDED))
{
*msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
category, flag);
}
- else {
- Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+ else if ( ckWARN_d(WARN_NON_UNICODE)
+ || ( (flag & UNICODE_GOT_PERL_EXTENDED)
+ && ckWARN(WARN_PORTABLE)))
+ {
+ Perl_warner(aTHX_ category, format, uv);
}
}
if ( (flags & UNICODE_DISALLOW_SUPER)
can warn and/or disallow these extremely high code points, even if other
above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
+C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
treat all above-Unicode code points, including these, as malformations. (Note
that the Unicode standard considers anything above 0x10FFFF to be illegal, but
there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
can warn and/or disallow these extremely high code points, even if other
above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
+C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
above-Unicode code points, including these, as malformations.
(Note that the Unicode standard considers anything above 0x10FFFF to be
illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
=item C<UTF8_GOT_CONTINUATION>
-The input sequence was malformed in that the first byte was a a UTF-8
+The input sequence was malformed in that the first byte was a UTF-8
continuation byte.
=item C<UTF8_GOT_EMPTY>
The input sequence was malformed in that a non-continuation type byte was found
in a position where only a continuation type one should be. See also
-L</C<UTF8_GOT_SHORT>>.
+C<L</UTF8_GOT_SHORT>>.
=item C<UTF8_GOT_OVERFLOW>
* things. For example, an input could be deliberately designed to
* overflow, and if this code bailed out immediately upon discovering that,
* returning to the caller C<*retlen> pointing to the very next byte (one
- * which is actually part of of the overflowing sequence), that could look
+ * which is actually part of the overflowing sequence), that could look
* legitimate to the caller, which could discard the initial partial
* sequence and process the rest, inappropriately.
*
}
while (possible_problems) { /* Handle each possible problem */
- UV pack_warn = 0;
+ U32 pack_warn = 0;
char * message = NULL;
U32 this_flag_bit = 0;
* valid, avoid as much as possible reading past the
* end of the buffer */
int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
- ? s - s0
- : send - s0;
+ ? (int) (s - s0)
+ : (int) (send - s0);
pack_warn = packWARN(WARN_UTF8);
message = Perl_form(aTHX_ "%s",
unexpected_non_continuation_text(s0,
if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
if ( ! (flags & UTF8_CHECK_ONLY)
&& (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
- && (msgs || ckWARN_d(WARN_NON_UNICODE)))
+ && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
+ || ckWARN(WARN_PORTABLE))))
{
- pack_warn = packWARN(WARN_NON_UNICODE);
+ pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
/* If it is an overlong that evaluates to a code point
* that doesn't have to use the Perl extended UTF-8, it
* */
if (UNICODE_IS_PERL_EXTENDED(uv)) {
message = Perl_form(aTHX_
- perl_extended_cp_format, uv);
+ PL_extended_cp_format, uv);
}
else {
message = Perl_form(aTHX_
}
/*
-=for comment
-skip apidoc
-This is not currently externally documented because we don't want people to use
-it for now. XXX Perhaps that is too paranoid, and it should be documented?
-
=for apidoc bytes_from_utf8_loc
-Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
-to store the location of the first character in C<"s"> that cannot be
+Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
+to where to store the location of the first character in C<"s"> that cannot be
converted to non-UTF8.
If that parameter is C<NULL>, this function behaves identically to
If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
and C<*first_non_downgradable> is set to C<NULL>.
-Otherwise, C<*first_non_downgradable> set to point to the first byte of the
+Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
first character in the original string that wasn't converted. C<*is_utf8p> is
unchanged. Note that the new string may have length 0.
* Do not use in-place. We optimize for native, for obvious reasons. */
U8*
-Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
U8* pend;
U8* dstart = d;
/* Note: this one is slightly destructive of the source. */
U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
U8* s = (U8*)p;
U8* const send = s + bytelen;
bool
Perl__is_uni_FOO(pTHX_ const U8 classnum, const UV c)
{
- dVAR;
return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
}
bool
Perl__is_uni_perl_idcont(pTHX_ UV c)
{
- dVAR;
return _invlist_contains_cp(PL_utf8_perl_idcont, c);
}
bool
Perl__is_uni_perl_idstart(pTHX_ UV c)
{
- dVAR;
return _invlist_contains_cp(PL_utf8_perl_idstart, c);
}
* The ordinal of the first character of the changed version is returned
* (but note, as explained above, that there may be more.) */
- dVAR;
PERL_ARGS_ASSERT_TO_UNI_UPPER;
if (c < 256) {
UV
Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
{
- dVAR;
PERL_ARGS_ASSERT_TO_UNI_TITLE;
if (c < 256) {
UV
Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
{
- dVAR;
PERL_ARGS_ASSERT_TO_UNI_LOWER;
if (c < 256) {
* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
*/
- dVAR;
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
if (flags & FOLD_FLAGS_LOCALE) {
bool
Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
{
- dVAR;
PERL_ARGS_ASSERT__IS_UTF8_FOO;
return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
bool
Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
{
- dVAR;
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
return is_utf8_common(p, e, PL_utf8_perl_idstart);
bool
Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
{
- dVAR;
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
return is_utf8_common(p, e, PL_utf8_perl_idcont);
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_LEGAL_CP);
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
{
unsigned int i;
- const unsigned int * cp_list;
+ const U32 * cp_list;
U8 * d;
/* 'index' is guaranteed to be non-negative, as this is an inversion
* map that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(invlist, uv1);
- IV base = invmap[index];
+ I32 base = invmap[index];
/* The data structures are set up so that if 'base' is non-negative,
* the case change is 1-to-1; and if 0, the change is to itself */
}
Size_t
-Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
+Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
const U32 ** remaining_folds_to)
{
/* Returns the count of the number of code points that fold to the input
* The reason for this convolution is to avoid having to deal with
* allocating and freeing memory. The lists are already constructed, so
* the return can point to them, but single code points aren't, so would
- * need to be constructed if we didn't employ something like this API */
+ * need to be constructed if we didn't employ something like this API
+ *
+ * The code points returned by this function are all legal Unicode, which
+ * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
+ * constructed with this size (to save space and memory), and we return
+ * pointers, so they must be this size */
- dVAR;
/* 'index' is guaranteed to be non-negative, as this is an inversion map
* that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
* to 'cp', and the parallel array containing the length of the list
* array */
*first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
- *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
- *first_folds_to
- */
+ *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
+ /* +1 excludes first_folds_to */
return IVCF_AUX_TABLE_lengths[-base];
}
* sequence, and the entire sequence will be stored in *ustrp. ustrp will
* contain *lenp bytes */
- dVAR;
PERL_ARGS_ASSERT_TURKIC_LC;
assert(e > p0);
* ustrp will contain *lenp bytes
*
* Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
- * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
* DOTLESS I */
PERL_ARGS_ASSERT_TURKIC_UC;
return result;
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used. */
UV
STRLEN *lenp,
bool flags)
{
- dVAR;
UV result;
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
STRLEN *lenp,
bool flags)
{
- dVAR;
UV result;
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
}
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used.
*/
STRLEN *lenp,
bool flags)
{
- dVAR;
UV result;
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
STRLEN *lenp,
U8 flags)
{
- dVAR;
UV result;
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
/*
=for apidoc pv_uni_display
-Build to the scalar C<dsv> a displayable version of the string C<spv>,
-length C<len>, the displayable version being at most C<pvlim> bytes long
-(if longer, the rest is truncated and C<"..."> will be appended).
+Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
+C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
+long (if longer, the rest is truncated and C<"..."> will be appended).
The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
+Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
+backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
+
The pointer to the PV of the C<dsv> is returned.
See also L</sv_uni_display>.
SvUTF8_off(dsv);
for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
UV u;
- /* This serves double duty as a flag and a character to print after
- a \ when flags & UNI_DISPLAY_BACKSLASH is true.
- */
- char ok = 0;
+ bool ok = 0;
if (pvlim && SvCUR(dsv) >= pvlim) {
truncated++;
if (u < 256) {
const unsigned char c = (unsigned char)u & 0xFF;
if (flags & UNI_DISPLAY_BACKSLASH) {
- switch (c) {
- case '\n':
- ok = 'n'; break;
- case '\r':
- ok = 'r'; break;
- case '\t':
- ok = 't'; break;
- case '\f':
- ok = 'f'; break;
- case '\a':
- ok = 'a'; break;
- case '\\':
- ok = '\\'; break;
- default: break;
- }
- if (ok) {
- const char string = ok;
- sv_catpvs(dsv, "\\");
- sv_catpvn(dsv, &string, 1);
- }
- }
+ if ( isMNEMONIC_CNTRL(c)
+ && ( c != '\b'
+ || (flags & UNI_DISPLAY_BACKSPACE)))
+ {
+ const char * mnemonic = cntrl_to_mnemonic(c);
+ sv_catpvn(dsv, mnemonic, strlen(mnemonic));
+ ok = 1;
+ }
+ else if (c == '\\') {
+ sv_catpvs(dsv, "\\\\");
+ ok = 1;
+ }
+ }
/* isPRINT() is the locale-blind version. */
if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
const char string = c;
return 1;
}
-/* XXX The next two functions should likely be moved to mathoms.c once all
- * occurrences of them are removed from the core; some cpan-upstream modules
- * still use them */
-
-U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
-
- return uvoffuni_to_utf8_flags(d, uv, 0);
-}
-
-/*
-=for apidoc utf8n_to_uvuni
-
-Instead use L<perlapi/utf8_to_uvchr_buf>, or rarely, L<perlapi/utf8n_to_uvchr>.
-
-This function was useful for code that wanted to handle both EBCDIC and
-ASCII platforms with Unicode properties, but starting in Perl v5.20, the
-distinctions between the platforms have mostly been made invisible to most
-code, so this function is quite unlikely to be what you want. If you do need
-this precise functionality, use instead
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>>
-or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|perlapi/utf8n_to_uvchr>>.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
-{
- PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-
- return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
-}
-
-/*
-=for apidoc uvuni_to_utf8_flags
-
-Instead you almost certainly want to use L<perlapi/uvchr_to_utf8> or
-L<perlapi/uvchr_to_utf8_flags>.
-
-This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
-which itself, while not deprecated, should be used only in isolated
-circumstances. These functions were useful for code that wanted to handle
-both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
-v5.20, the distinctions between the platforms have mostly been made invisible
-to most code, so this function is quite unlikely to be what you want.
-
-=cut
-*/
-
-U8 *
-Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
-
- return uvoffuni_to_utf8_flags(d, uv, flags);
-}
-
-/*
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
- PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
-
- /* This function is unsafe if malformed UTF-8 input is given it, which is
- * why the function is deprecated. If the first byte of the input
- * indicates that there are more bytes remaining in the sequence that forms
- * the character than there are in the input buffer, it can read past the
- * end. But we can make it safe if the input string happens to be
- * NUL-terminated, as many strings in Perl are, by refusing to read past a
- * NUL, which is what UTF8_CHK_SKIP() does. A NUL indicates the start of
- * the next character anyway. If the input isn't NUL-terminated, the
- * function remains unsafe, as it always has been. */
-
- return utf8_to_uvchr_buf(s, s + UTF8_CHK_SKIP(s), retlen);
-}
-
/*
* ex: set ts=8 sts=4 sw=4 et:
*/