static const char unees[] =
"Malformed UTF-8 character (unexpected end of string)";
-/* Be sure to synchronize this message with the similar one in regcomp.c */
-static const char cp_above_legal_max[] =
- "Use of code point 0x%" UVXf " is not allowed; the"
- " permissible max is 0x%" UVXf;
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
=cut
*/
+/* helper for Perl__force_out_malformed_utf8_message(). Like
+ * SAVECOMPILEWARNINGS(), but works with PL_curcop rather than
+ * PL_compiling */
+
+static void
+S_restore_cop_warnings(pTHX_ void *p)
+{
+ free_and_set_cop_warnings(PL_curcop, (STRLEN*) p);
+}
+
+
void
Perl__force_out_malformed_utf8_message(pTHX_
const U8 *const p, /* First byte in UTF-8 sequence */
PL_dowarn = G_WARN_ALL_ON|G_WARN_ON;
if (PL_curcop) {
+ /* this is like SAVECOMPILEWARNINGS() except with PL_curcop rather
+ * than PL_compiling */
+ SAVEDESTRUCTOR_X(S_restore_cop_warnings,
+ (void*)PL_curcop->cop_warnings);
PL_curcop->cop_warnings = pWARN_ALL;
}
=for apidoc uvoffuni_to_utf8_flags
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Instead, B<Almost all code should use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>>.
+Instead, B<Almost all code should use L<perlapi/uvchr_to_utf8> or
+L<perlapi/uvchr_to_utf8_flags>>.
This function is like them, but the input is a strict Unicode
(as opposed to native) code point. Only in very rare circumstances should code
not be using the native code point.
-For details, see the description for L</uvchr_to_utf8_flags>.
+For details, see the description for L<perlapi/uvchr_to_utf8_flags>.
=cut
*/
" is not recommended for open interchange";
const char super_cp_format[] = "Code point 0x%" UVXf " is not Unicode,"
" may not be portable";
-const char perl_extended_cp_format[] = "Code point 0x%" UVXf " is not" \
- " Unicode, requires a Perl extension," \
- " and so is not portable";
#define HANDLE_UNICODE_SURROGATE(uv, flags, msgs) \
STMT_START { \
* performance hit on these high EBCDIC code points. */
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
- if (UNLIKELY(uv > MAX_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
+ if (UNLIKELY( uv > MAX_LEGAL_CP
+ && ! (flags & UNICODE_ALLOW_ABOVE_IV_MAX)))
+ {
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv));
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
/* Choose the more dire applicable warning */
if (UNICODE_IS_PERL_EXTENDED(uv)) {
- format = perl_extended_cp_format;
+ format = PL_extended_cp_format;
+ category = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
if (flags & (UNICODE_WARN_PERL_EXTENDED
|UNICODE_DISALLOW_PERL_EXTENDED))
{
*msgs = new_msg_hv(Perl_form(aTHX_ format, uv),
category, flag);
}
- else {
- Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE), format, uv);
+ else if ( ckWARN_d(WARN_NON_UNICODE)
+ || ( (flag & UNICODE_GOT_PERL_EXTENDED)
+ && ckWARN(WARN_PORTABLE)))
+ {
+ Perl_warner(aTHX_ category, format, uv);
}
}
if ( (flags & UNICODE_DISALLOW_SUPER)
C<UNICODE_DISALLOW_ILLEGAL_C9_INTERCHANGE> are shortcuts to select the
above-Unicode and surrogate flags, but not the non-character ones, as
defined in
-L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
See L<perlunicode/Noncharacter code points>.
Extremely high code points were never specified in any standard, and require an
can warn and/or disallow these extremely high code points, even if other
above-Unicode ones are accepted. They are the C<UNICODE_WARN_PERL_EXTENDED>
and C<UNICODE_DISALLOW_PERL_EXTENDED> flags. For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
+C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UNICODE_DISALLOW_SUPER> will
treat all above-Unicode code points, including these, as malformations. (Note
that the Unicode standard considers anything above 0x10FFFF to be illegal, but
there are standards predating it that allow up to 0x7FFF_FFFF (2**31 -1))
#undef FF_OVERLONG_PREFIX
STRLEN
-Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
+Perl_is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
{
STRLEN len;
const U8 *x;
*
*/
- PERL_ARGS_ASSERT__IS_UTF8_CHAR_HELPER;
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_HELPER;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
|UTF8_DISALLOW_PERL_EXTENDED)));
=for apidoc utf8n_to_uvchr
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
Bottom level UTF-8 decode routine.
Returns the native code point value of the first character in the string C<s>,
restricts the allowed inputs to the strict UTF-8 traditionally defined by
Unicode. Use C<UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE> to use the strictness
definition given by
-L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+L<Unicode Corrigendum #9|https://www.unicode.org/versions/corrigendum9.html>.
The difference between traditional strictness and C9 strictness is that the
latter does not forbid non-character code points. (They are still discouraged,
however.) For more discussion see L<perlunicode/Noncharacter code points>.
can warn and/or disallow these extremely high code points, even if other
above-Unicode ones are accepted. They are the C<UTF8_WARN_PERL_EXTENDED> and
C<UTF8_DISALLOW_PERL_EXTENDED> flags. For more information see
-L</C<UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
+C<L</UTF8_GOT_PERL_EXTENDED>>. Of course C<UTF8_DISALLOW_SUPER> will treat all
above-Unicode code points, including these, as malformations.
(Note that the Unicode standard considers anything above 0x10FFFF to be
illegal, but there are standards predating it that allow up to 0x7FFF_FFFF
use and those yet to be assigned, are never considered malformed and never
warn.
+=for apidoc Amnh||UTF8_CHECK_ONLY
+=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_INTERCHANGE
+=for apidoc Amnh||UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
+=for apidoc Amnh||UTF8_DISALLOW_SURROGATE
+=for apidoc Amnh||UTF8_DISALLOW_NONCHAR
+=for apidoc Amnh||UTF8_DISALLOW_SUPER
+=for apidoc Amnh||UTF8_WARN_ILLEGAL_INTERCHANGE
+=for apidoc Amnh||UTF8_WARN_ILLEGAL_C9_INTERCHANGE
+=for apidoc Amnh||UTF8_WARN_SURROGATE
+=for apidoc Amnh||UTF8_WARN_NONCHAR
+=for apidoc Amnh||UTF8_WARN_SUPER
+=for apidoc Amnh||UTF8_WARN_PERL_EXTENDED
+=for apidoc Amnh||UTF8_DISALLOW_PERL_EXTENDED
+
=cut
Also implemented as a macro in utf8.h
=for apidoc utf8n_to_uvchr_error
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
This function is for code that needs to know what the precise malformation(s)
are when an error is found. If you also need to know the generated warning
=item C<UTF8_GOT_CONTINUATION>
-The input sequence was malformed in that the first byte was a a UTF-8
+The input sequence was malformed in that the first byte was a UTF-8
continuation byte.
=item C<UTF8_GOT_EMPTY>
The input sequence was malformed in that a non-continuation type byte was found
in a position where only a continuation type one should be. See also
-L</C<UTF8_GOT_SHORT>>.
+C<L</UTF8_GOT_SHORT>>.
=item C<UTF8_GOT_OVERFLOW>
=for apidoc utf8n_to_uvchr_msgs
THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
-Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this
+directly.
This function is for code that needs to know what the precise malformation(s)
are when an error is found, and wants the corresponding warning and/or error
/* The order of malformation tests here is important. We should consume as
* few bytes as possible in order to not skip any valid character. This is
* required by the Unicode Standard (section 3.9 of Unicode 6.0); see also
- * http://unicode.org/reports/tr36 for more discussion as to why. For
+ * https://unicode.org/reports/tr36 for more discussion as to why. For
* example, once we've done a UTF8SKIP, we can tell the expected number of
* bytes, and could fail right off the bat if the input parameters indicate
* that there are too few available. But it could be that just that first
* things. For example, an input could be deliberately designed to
* overflow, and if this code bailed out immediately upon discovering that,
* returning to the caller C<*retlen> pointing to the very next byte (one
- * which is actually part of of the overflowing sequence), that could look
+ * which is actually part of the overflowing sequence), that could look
* legitimate to the caller, which could discard the initial partial
* sequence and process the rest, inappropriately.
*
}
while (possible_problems) { /* Handle each possible problem */
- UV pack_warn = 0;
+ U32 pack_warn = 0;
char * message = NULL;
U32 this_flag_bit = 0;
* valid, avoid as much as possible reading past the
* end of the buffer */
int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
- ? s - s0
- : send - s0;
+ ? (int) (s - s0)
+ : (int) (send - s0);
pack_warn = packWARN(WARN_UTF8);
message = Perl_form(aTHX_ "%s",
unexpected_non_continuation_text(s0,
if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
if ( ! (flags & UTF8_CHECK_ONLY)
&& (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
- && (msgs || ckWARN_d(WARN_NON_UNICODE)))
+ && (msgs || ( ckWARN_d(WARN_NON_UNICODE)
+ || ckWARN(WARN_PORTABLE))))
{
- pack_warn = packWARN(WARN_NON_UNICODE);
+ pack_warn = packWARN2(WARN_NON_UNICODE, WARN_PORTABLE);
/* If it is an overlong that evaluates to a code point
* that doesn't have to use the Perl extended UTF-8, it
* */
if (UNICODE_IS_PERL_EXTENDED(uv)) {
message = Perl_form(aTHX_
- perl_extended_cp_format, uv);
+ PL_extended_cp_format, uv);
}
else {
message = Perl_form(aTHX_
{
PERL_ARGS_ASSERT_UTF8_TO_UVCHR_BUF;
- assert(s < send);
-
- return utf8n_to_uvchr(s, send - s, retlen,
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
+ return utf8_to_uvchr_buf_helper(s, send, retlen);
}
/* This is marked as deprecated
Only in very rare circumstances should code need to be dealing in Unicode
(as opposed to native) code points. In those few cases, use
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>> instead. If you
-are not absolutely sure this is one of those cases, then assume it isn't and
-use plain C<utf8_to_uvchr_buf> instead.
+C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|perlapi/utf8_to_uvchr_buf>> instead.
+If you are not absolutely sure this is one of those cases, then assume it isn't
+and use plain C<utf8_to_uvchr_buf> instead.
Returns the Unicode (not-native) code point of the first character in the
string C<s> which
the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+See L<perlapi/utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is
+returned.
=cut
*/
* the bitops (especially ~) can create illegal UTF-8.
* In other words: in Perl UTF-8 is not just for Unicode. */
- if (e < s)
+ if (UNLIKELY(e < s))
goto warn_and_return;
while (s < e) {
s += UTF8SKIP(s);
len++;
}
- if (e != s) {
+ if (UNLIKELY(e != s)) {
len--;
warn_and_return:
if (PL_op)
}
/*
-No = here because currently externally undocumented
-for apidoc bytes_from_utf8_loc
+=for apidoc bytes_from_utf8_loc
-Like C<L</bytes_from_utf8>()>, but takes an extra parameter, a pointer to where
-to store the location of the first character in C<"s"> that cannot be
+Like C<L<perlapi/bytes_from_utf8>()>, but takes an extra parameter, a pointer
+to where to store the location of the first character in C<"s"> that cannot be
converted to non-UTF8.
If that parameter is C<NULL>, this function behaves identically to
If the entire input string was converted, C<*is_utf8p> is set to a FALSE value,
and C<*first_non_downgradable> is set to C<NULL>.
-Otherwise, C<*first_non_downgradable> set to point to the first byte of the
+Otherwise, C<*first_non_downgradable> is set to point to the first byte of the
first character in the original string that wasn't converted. C<*is_utf8p> is
unchanged. Note that the new string may have length 0.
* Do not use in-place. We optimize for native, for obvious reasons. */
U8*
-Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
U8* pend;
U8* dstart = d;
/* This assumes that most uses will be in the first Unicode plane, not
* needing surrogates */
- if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST
- && uv <= UNICODE_SURROGATE_LAST))
+ if (UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST,
+ UNICODE_SURROGATE_LAST)))
{
if (UNLIKELY(p >= pend) || UNLIKELY(uv > LAST_HIGH_SURROGATE)) {
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
else {
UV low = (p[0] << 8) + p[1];
- if ( UNLIKELY(low < FIRST_LOW_SURROGATE)
- || UNLIKELY(low > LAST_LOW_SURROGATE))
+ if (UNLIKELY(! inRANGE(low, FIRST_LOW_SURROGATE,
+ LAST_LOW_SURROGATE)))
{
Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
}
/* Note: this one is slightly destructive of the source. */
U8*
-Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
+Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, Size_t bytelen, Size_t *newlen)
{
U8* s = (U8*)p;
U8* const send = s + bytelen;
return _invlist_contains_cp(PL_XPosix_ptrs[classnum], c);
}
-/* Internal function so we can deprecate the external one, and call
- this one from other deprecated functions in this file */
-
-bool
-Perl__is_utf8_idstart(pTHX_ const U8 *p)
-{
- PERL_ARGS_ASSERT__IS_UTF8_IDSTART;
-
- if (*p == '_')
- return TRUE;
- return is_utf8_common(p, PL_utf8_idstart);
-}
-
bool
Perl__is_uni_perl_idcont(pTHX_ UV c)
{
}
PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist)
-{
- /* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p> is in the inversion list indicated by <invlist>.
- *
- * Note that it is assumed that the buffer length of <p> is enough to
- * contain all the bytes that comprise the character. Thus, <*p> should
- * have been checked before this call for mal-formedness enough to assure
- * that. This function, does make sure to not look past any NUL, so it is
- * safe to use on C, NUL-terminated, strings */
- STRLEN len = my_strnlen((char *) p, UTF8SKIP(p));
-
- PERL_ARGS_ASSERT_IS_UTF8_COMMON;
-
- /* The API should have included a length for the UTF-8 character in <p>,
- * but it doesn't. We therefore assume that p has been validated at least
- * as far as there being enough bytes available in it to accommodate the
- * character without reading beyond the end, and pass that number on to the
- * validating routine */
- if (! isUTF8_CHAR(p, p + len)) {
- _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN,
- 1 /* Die */ );
- NOT_REACHED; /* NOTREACHED */
- }
-
- return is_utf8_common_with_len(p, p + len, invlist);
-}
-
-PERL_STATIC_INLINE bool
-S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
- SV* const invlist)
+S_is_utf8_common(pTHX_ const U8 *const p, const U8 * const e,
+ SV* const invlist)
{
/* returns a boolean giving whether or not the UTF8-encoded character that
* starts at <p>, and extending no further than <e - 1> is in the inversion
UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
- PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
+ PERL_ARGS_ASSERT_IS_UTF8_COMMON;
if (cp == 0 && (p >= e || *p != '\0')) {
_force_out_malformed_utf8_message(p, e, 0, 1);
return _invlist_contains_cp(invlist, cp);
}
+#if 0 /* Not currently used, but may be needed in the future */
+PERLVAR(I, seen_deprecated_macro, HV *)
+
STATIC void
S_warn_on_first_deprecated_use(pTHX_ const char * const name,
const char * const alternative,
}
}
}
+#endif
bool
-Perl__is_utf8_FOO(pTHX_ U8 classnum,
- const U8 * const p,
- const char * const name,
- const char * const alternative,
- const bool use_utf8,
- const bool use_locale,
- const char * const file,
- const unsigned line)
+Perl__is_utf8_FOO(pTHX_ const U8 classnum, const U8 *p, const U8 * const e)
{
PERL_ARGS_ASSERT__IS_UTF8_FOO;
- warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
-
- if (use_utf8 && UTF8_IS_ABOVE_LATIN1(*p)) {
-
- switch (classnum) {
- case _CC_WORDCHAR:
- case _CC_DIGIT:
- case _CC_ALPHA:
- case _CC_LOWER:
- case _CC_UPPER:
- case _CC_PUNCT:
- case _CC_PRINT:
- case _CC_ALPHANUMERIC:
- case _CC_GRAPH:
- case _CC_CASED:
-
- return is_utf8_common(p, PL_XPosix_ptrs[classnum]);
-
- case _CC_SPACE:
- return is_XPERLSPACE_high(p);
- case _CC_BLANK:
- return is_HORIZWS_high(p);
- case _CC_XDIGIT:
- return is_XDIGIT_high(p);
- case _CC_CNTRL:
- return 0;
- case _CC_ASCII:
- return 0;
- case _CC_VERTSPACE:
- return is_VERTWS_high(p);
- case _CC_IDFIRST:
- return is_utf8_common(p, PL_utf8_perl_idstart);
- case _CC_IDCONT:
- return is_utf8_common(p, PL_utf8_perl_idcont);
- }
- }
-
- /* idcont is the same as wordchar below 256 */
- if (classnum == _CC_IDCONT) {
- classnum = _CC_WORDCHAR;
- }
- else if (classnum == _CC_IDFIRST) {
- if (*p == '_') {
- return TRUE;
- }
- classnum = _CC_ALPHA;
- }
-
- if (! use_locale) {
- if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
- return _generic_isCC(*p, classnum);
- }
-
- return _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )), classnum);
- }
- else {
- if (! use_utf8 || UTF8_IS_INVARIANT(*p)) {
- return isFOO_lc(classnum, *p);
- }
-
- return isFOO_lc(classnum, EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p + 1 )));
- }
-
- NOT_REACHED; /* NOTREACHED */
-}
-
-bool
-Perl__is_utf8_FOO_with_len(pTHX_ const U8 classnum, const U8 *p,
- const U8 * const e)
-{
- PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
-
- return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]);
+ return is_utf8_common(p, e, PL_XPosix_ptrs[classnum]);
}
bool
-Perl__is_utf8_perl_idstart_with_len(pTHX_ const U8 *p, const U8 * const e)
+Perl__is_utf8_perl_idstart(pTHX_ const U8 *p, const U8 * const e)
{
- PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
+ PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART;
- return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart);
+ return is_utf8_common(p, e, PL_utf8_perl_idstart);
}
bool
-Perl__is_utf8_xidstart(pTHX_ const U8 *p)
+Perl__is_utf8_perl_idcont(pTHX_ const U8 *p, const U8 * const e)
{
- PERL_ARGS_ASSERT__IS_UTF8_XIDSTART;
+ PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT;
- if (*p == '_')
- return TRUE;
- return is_utf8_common(p, PL_utf8_xidstart);
-}
-
-bool
-Perl__is_utf8_perl_idcont_with_len(pTHX_ const U8 *p, const U8 * const e)
-{
- PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
-
- return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont);
-}
-
-bool
-Perl__is_utf8_idcont(pTHX_ const U8 *p)
-{
- PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
-
- return is_utf8_common(p, PL_utf8_idcont);
-}
-
-bool
-Perl__is_utf8_xidcont(pTHX_ const U8 *p)
-{
- PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
-
- return is_utf8_common(p, PL_utf8_xidcont);
-}
-
-bool
-Perl__is_utf8_mark(pTHX_ const U8 *p)
-{
- PERL_ARGS_ASSERT__IS_UTF8_MARK;
-
- return is_utf8_common(p, PL_utf8_mark);
+ return is_utf8_common(p, e, PL_utf8_perl_idcont);
}
STATIC UV
S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
U8* ustrp, STRLEN *lenp,
- SV *invlist, const int * const invmap,
- const unsigned int * const * const aux_tables,
+ SV *invlist, const I32 * const invmap,
+ const U32 * const * const aux_tables,
const U8 * const aux_table_lengths,
const char * const normal)
{
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_LEGAL_CP);
+ Perl_croak(aTHX_ "%s", form_cp_too_large_msg(16, NULL, 0, uv1));
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
> HIGHEST_CASE_CHANGING_CP_FOR_USE_ONLY_BY_UTF8_DOT_C))
{
- /* As of Unicode 10.0, this means we avoid swash creation
- * for anything beyond high Plane 1 (below emojis) */
goto cases_to_self;
}
#endif
{
unsigned int i;
- const unsigned int * cp_list;
+ const U32 * cp_list;
U8 * d;
/* 'index' is guaranteed to be non-negative, as this is an inversion
* map that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(invlist, uv1);
- IV base = invmap[index];
+ I32 base = invmap[index];
/* The data structures are set up so that if 'base' is non-negative,
* the case change is 1-to-1; and if 0, the change is to itself */
}
Size_t
-Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
- const unsigned int ** remaining_folds_to)
+Perl__inverse_folds(pTHX_ const UV cp, U32 * first_folds_to,
+ const U32 ** remaining_folds_to)
{
/* Returns the count of the number of code points that fold to the input
* 'cp' (besides itself).
* The reason for this convolution is to avoid having to deal with
* allocating and freeing memory. The lists are already constructed, so
* the return can point to them, but single code points aren't, so would
- * need to be constructed if we didn't employ something like this API */
+ * need to be constructed if we didn't employ something like this API
+ *
+ * The code points returned by this function are all legal Unicode, which
+ * occupy at most 21 bits, and so a U32 is sufficient, and the lists are
+ * constructed with this size (to save space and memory), and we return
+ * pointers, so they must be this size */
/* 'index' is guaranteed to be non-negative, as this is an inversion map
* that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
- int base = _Perl_IVCF_invmap[index];
+ I32 base = _Perl_IVCF_invmap[index];
PERL_ARGS_ASSERT__INVERSE_FOLDS;
* to 'cp', and the parallel array containing the length of the list
* array */
*first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
- *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
- *first_folds_to
- */
+ *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1;
+ /* +1 excludes first_folds_to */
return IVCF_AUX_TABLE_lengths[-base];
}
#endif
/* Only the single code point. This works like 'fc(G) = G - A + a' */
- *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+ *first_folds_to = (U32) (base + cp
+ - invlist_array(PL_utf8_foldclosures)[index]);
*remaining_folds_to = NULL;
return 1;
}
return original;
}
-STATIC U32
-S_check_and_deprecate(pTHX_ const U8 *p,
- const U8 **e,
- const unsigned int type, /* See below */
- const bool use_locale, /* Is this a 'LC_'
- macro call? */
- const char * const file,
- const unsigned line)
-{
- /* This is a temporary function to deprecate the unsafe calls to the case
- * changing macros and functions. It keeps all the special stuff in just
- * one place.
- *
- * It updates *e with the pointer to the end of the input string. If using
- * the old-style macros, *e is NULL on input, and so this function assumes
- * the input string is long enough to hold the entire UTF-8 sequence, and
- * sets *e accordingly, but it then returns a flag to pass the
- * utf8n_to_uvchr(), to tell it that this size is a guess, and to avoid
- * using the full length if possible.
- *
- * It also does the assert that *e > p when *e is not NULL. This should be
- * migrated to the callers when this function gets deleted.
- *
- * The 'type' parameter is used for the caller to specify which case
- * changing function this is called from: */
-
-# define DEPRECATE_TO_UPPER 0
-# define DEPRECATE_TO_TITLE 1
-# define DEPRECATE_TO_LOWER 2
-# define DEPRECATE_TO_FOLD 3
-
- U32 utf8n_flags = 0;
- const char * name;
- const char * alternative;
-
- PERL_ARGS_ASSERT_CHECK_AND_DEPRECATE;
-
- if (*e == NULL) {
- utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
-
- /* strnlen() makes this function safe for the common case of
- * NUL-terminated strings */
- *e = p + my_strnlen((char *) p, UTF8SKIP(p));
-
- /* For mathoms.c calls, we use the function name we know is stored
- * there. It could be part of a larger path */
- if (type == DEPRECATE_TO_UPPER) {
- name = instr(file, "mathoms.c")
- ? "to_utf8_upper"
- : "toUPPER_utf8";
- alternative = "toUPPER_utf8_safe";
- }
- else if (type == DEPRECATE_TO_TITLE) {
- name = instr(file, "mathoms.c")
- ? "to_utf8_title"
- : "toTITLE_utf8";
- alternative = "toTITLE_utf8_safe";
- }
- else if (type == DEPRECATE_TO_LOWER) {
- name = instr(file, "mathoms.c")
- ? "to_utf8_lower"
- : "toLOWER_utf8";
- alternative = "toLOWER_utf8_safe";
- }
- else if (type == DEPRECATE_TO_FOLD) {
- name = instr(file, "mathoms.c")
- ? "to_utf8_fold"
- : "toFOLD_utf8";
- alternative = "toFOLD_utf8_safe";
- }
- else Perl_croak(aTHX_ "panic: Unexpected case change type");
-
- warn_on_first_deprecated_use(name, alternative, use_locale, file, line);
- }
- else {
- assert (p < *e);
- }
-
- return utf8n_flags;
-}
-
STATIC UV
S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
U8 * ustrp, STRLEN *lenp)
* ustrp will contain *lenp bytes
*
* Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
- * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
* DOTLESS I */
PERL_ARGS_ASSERT_TURKIC_UC;
STRLEN len_result; \
result = utf8n_to_uvchr(p, e - p, &len_result, UTF8_CHECK_ONLY); \
if (len_result == (STRLEN) -1) { \
- _force_out_malformed_utf8_message(p, e, utf8n_flags, \
- 1 /* Die */ ); \
+ _force_out_malformed_utf8_message(p, e, 0, 1 /* Die */ ); \
}
#define CASE_CHANGE_BODY_END(locale_flags, change_macro) \
\
return result;
-/*
-=for apidoc to_utf8_upper
-
-Instead use L</toUPPER_utf8_safe>.
-
-=cut */
-
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used. */
UV
const U8 *e,
U8* ustrp,
STRLEN *lenp,
- bool flags,
- const char * const file,
- const int line)
+ bool flags)
{
UV result;
- const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_UPPER,
- cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_UPPER_FLAGS;
CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
}
-/*
-=for apidoc to_utf8_title
-
-Instead use L</toTITLE_utf8_safe>.
-
-=cut */
-
/* Not currently externally documented, and subject to change:
* <flags> is set iff the rules from the current underlying locale are to be
* used. Since titlecase is not defined in POSIX, for other than a
const U8 *e,
U8* ustrp,
STRLEN *lenp,
- bool flags,
- const char * const file,
- const int line)
+ bool flags)
{
UV result;
- const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_TITLE,
- cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
}
-/*
-=for apidoc to_utf8_lower
-
-Instead use L</toLOWER_utf8_safe>.
-
-=cut */
-
/* Not currently externally documented, and subject to change:
- * <flags> is set iff iff the rules from the current underlying locale are to
+ * <flags> is set iff the rules from the current underlying locale are to
* be used.
*/
const U8 *e,
U8* ustrp,
STRLEN *lenp,
- bool flags,
- const char * const file,
- const int line)
+ bool flags)
{
UV result;
- const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_LOWER,
- cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
}
-/*
-=for apidoc to_utf8_fold
-
-Instead use L</toFOLD_utf8_safe>.
-
-=cut */
-
/* Not currently externally documented, and subject to change,
* in <flags>
* bit FOLD_FLAGS_LOCALE is set iff the rules from the current underlying
const U8 *e,
U8* ustrp,
STRLEN *lenp,
- U8 flags,
- const char * const file,
- const int line)
+ U8 flags)
{
UV result;
- const U32 utf8n_flags = check_and_deprecate(p, &e, DEPRECATE_TO_FOLD,
- cBOOL(flags), file, line);
PERL_ARGS_ASSERT__TO_UTF8_FOLD_FLAGS;
/* Look at every character in the result; if any cross the
* boundary, the whole thing is disallowed */
U8* s = ustrp;
- U8* e = ustrp + *lenp;
- while (s < e) {
+ U8* send = ustrp + *lenp;
+ while (s < send) {
if (isASCII(*s)) {
/* Crossed, have to return the original */
original = valid_utf8_to_uvchr(p, lenp);
}
-/* Note:
- * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
- * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
- * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
- */
-
-SV*
-Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
- I32 minbits, I32 none)
-{
- PERL_ARGS_ASSERT_SWASH_INIT;
-
- /* Returns a copy of a swash initiated by the called function. This is the
- * public interface, and returning a copy prevents others from doing
- * mischief on the original */
-
- return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
- NULL, NULL));
-}
-
-SV*
-Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
- I32 minbits, I32 none, SV* invlist,
- U8* const flags_p)
-{
-
- /*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
- * use the following define */
-
-#define CORE_SWASH_INIT_RETURN(x) \
- PL_curpm= old_PL_curpm; \
- return x
-
- /* Initialize and return a swash, creating it if necessary. It does this
- * by calling utf8_heavy.pl in the general case. The returned value may be
- * the swash's inversion list instead if the input parameters allow it.
- * Which is returned should be immaterial to callers, as the only
- * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
- * and swash_to_invlist() handle both these transparently.
- *
- * This interface should only be used by functions that won't destroy or
- * adversely change the swash, as doing so affects all other uses of the
- * swash in the program; the general public should use 'Perl_swash_init'
- * instead.
- *
- * pkg is the name of the package that <name> should be in.
- * name is the name of the swash to find. Typically it is a Unicode
- * property name, including user-defined ones
- * listsv is a string to initialize the swash with. It must be of the form
- * documented as the subroutine return value in
- * L<perlunicode/User-Defined Character Properties>
- * minbits is the number of bits required to represent each data element.
- * It is '1' for binary properties.
- * none I (khw) do not understand this one, but it is used only in tr///.
- * invlist is an inversion list to initialize the swash with (or NULL)
- * flags_p if non-NULL is the address of various input and output flag bits
- * to the routine, as follows: ('I' means is input to the routine;
- * 'O' means output from the routine. Only flags marked O are
- * meaningful on return.)
- * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
- * came from a user-defined property. (I O)
- * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
- * when the swash cannot be located, to simply return NULL. (I)
- * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
- * return of an inversion list instead of a swash hash if this routine
- * thinks that would result in faster execution of swash_fetch() later
- * on. (I)
- *
- * Thus there are three possible inputs to find the swash: <name>,
- * <listsv>, and <invlist>. At least one must be specified. The result
- * will be the union of the specified ones, although <listsv>'s various
- * actions can intersect, etc. what <name> gives. To avoid going out to
- * disk at all, <invlist> should specify completely what the swash should
- * have, and <listsv> should be &PL_sv_undef and <name> should be "".
- *
- * <invlist> is only valid for binary properties */
-
- PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
-
- SV* retval = &PL_sv_undef;
- HV* swash_hv = NULL;
- const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST);
-
- assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
- assert(! invlist || minbits == 1);
-
- PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
- regex that triggered the swash init and the swash init
- perl logic itself. See perl #122747 */
-
- /* If data was passed in to go out to utf8_heavy to find the swash of, do
- * so */
- if (listsv != &PL_sv_undef || strNE(name, "")) {
- dSP;
- const size_t pkg_len = strlen(pkg);
- const size_t name_len = strlen(name);
- HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
- SV* errsv_save;
- GV *method;
-
- PERL_ARGS_ASSERT__CORE_SWASH_INIT;
-
- PUSHSTACKi(PERLSI_MAGIC);
- ENTER;
- SAVEHINTS();
- save_re_context();
- /* We might get here via a subroutine signature which uses a utf8
- * parameter name, at which point PL_subname will have been set
- * but not yet used. */
- save_item(PL_subname);
- if (PL_parser && PL_parser->error_count)
- SAVEI8(PL_parser->error_count), PL_parser->error_count = 0;
- method = gv_fetchmeth(stash, "SWASHNEW", 8, -1);
- if (!method) { /* demand load UTF-8 */
- ENTER;
- if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
- GvSV(PL_errgv) = NULL;
-#ifndef NO_TAINT_SUPPORT
- /* It is assumed that callers of this routine are not passing in
- * any user derived data. */
- /* Need to do this after save_re_context() as it will set
- * PL_tainted to 1 while saving $1 etc (see the code after getrx:
- * in Perl_magic_get). Even line to create errsv_save can turn on
- * PL_tainted. */
- SAVEBOOL(TAINT_get);
- TAINT_NOT;
-#endif
- Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
- NULL);
- {
- /* Not ERRSV, as there is no need to vivify a scalar we are
- about to discard. */
- SV * const errsv = GvSV(PL_errgv);
- if (!SvTRUE(errsv)) {
- GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
- SvREFCNT_dec(errsv);
- }
- }
- LEAVE;
- }
- SPAGAIN;
- PUSHMARK(SP);
- EXTEND(SP,5);
- mPUSHp(pkg, pkg_len);
- mPUSHp(name, name_len);
- PUSHs(listsv);
- mPUSHi(minbits);
- mPUSHi(none);
- PUTBACK;
- if ((errsv_save = GvSV(PL_errgv))) SAVEFREESV(errsv_save);
- GvSV(PL_errgv) = NULL;
- /* If we already have a pointer to the method, no need to use
- * call_method() to repeat the lookup. */
- if (method
- ? call_sv(MUTABLE_SV(method), G_SCALAR)
- : call_sv(newSVpvs_flags("SWASHNEW", SVs_TEMP), G_SCALAR | G_METHOD))
- {
- retval = *PL_stack_sp--;
- SvREFCNT_inc(retval);
- }
- {
- /* Not ERRSV. See above. */
- SV * const errsv = GvSV(PL_errgv);
- if (!SvTRUE(errsv)) {
- GvSV(PL_errgv) = SvREFCNT_inc_simple(errsv_save);
- SvREFCNT_dec(errsv);
- }
- }
- LEAVE;
- POPSTACK;
- if (IN_PERL_COMPILETIME) {
- CopHINTS_set(PL_curcop, PL_hints);
- }
- if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
- if (SvPOK(retval)) {
-
- /* If caller wants to handle missing properties, let them */
- if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
- CORE_SWASH_INIT_RETURN(NULL);
- }
- Perl_croak(aTHX_
- "Can't find Unicode property definition \"%" SVf "\"",
- SVfARG(retval));
- NOT_REACHED; /* NOTREACHED */
- }
- }
- } /* End of calling the module to find the swash */
-
- /* If this operation fetched a swash, and we will need it later, get it */
- if (retval != &PL_sv_undef
- && (minbits == 1 || (flags_p
- && ! (*flags_p
- & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
- {
- swash_hv = MUTABLE_HV(SvRV(retval));
-
- /* If we don't already know that there is a user-defined component to
- * this swash, and the user has indicated they wish to know if there is
- * one (by passing <flags_p>), find out */
- if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
- SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
- if (user_defined && SvUV(*user_defined)) {
- *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
- }
- }
- }
-
- /* Make sure there is an inversion list for binary properties */
- if (minbits == 1) {
- SV** swash_invlistsvp = NULL;
- SV* swash_invlist = NULL;
- bool invlist_in_swash_is_valid = FALSE;
- bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
- an unclaimed reference count */
-
- /* If this operation fetched a swash, get its already existing
- * inversion list, or create one for it */
-
- if (swash_hv) {
- swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
- if (swash_invlistsvp) {
- swash_invlist = *swash_invlistsvp;
- invlist_in_swash_is_valid = TRUE;
- }
- else {
- swash_invlist = _swash_to_invlist(retval);
- swash_invlist_unclaimed = TRUE;
- }
- }
-
- /* If an inversion list was passed in, have to include it */
- if (invlist) {
-
- /* Any fetched swash will by now have an inversion list in it;
- * otherwise <swash_invlist> will be NULL, indicating that we
- * didn't fetch a swash */
- if (swash_invlist) {
-
- /* Add the passed-in inversion list, which invalidates the one
- * already stored in the swash */
- invlist_in_swash_is_valid = FALSE;
- SvREADONLY_off(swash_invlist); /* Turned on again below */
- _invlist_union(invlist, swash_invlist, &swash_invlist);
- }
- else {
-
- /* Here, there is no swash already. Set up a minimal one, if
- * we are going to return a swash */
- if (! use_invlist) {
- swash_hv = newHV();
- retval = newRV_noinc(MUTABLE_SV(swash_hv));
- }
- swash_invlist = invlist;
- }
- }
-
- /* Here, we have computed the union of all the passed-in data. It may
- * be that there was an inversion list in the swash which didn't get
- * touched; otherwise save the computed one */
- if (! invlist_in_swash_is_valid && ! use_invlist) {
- if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
- {
- Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
- }
- /* We just stole a reference count. */
- if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
- else SvREFCNT_inc_simple_void_NN(swash_invlist);
- }
-
- /* The result is immutable. Forbid attempts to change it. */
- SvREADONLY_on(swash_invlist);
-
- if (use_invlist) {
- SvREFCNT_dec(retval);
- if (!swash_invlist_unclaimed)
- SvREFCNT_inc_simple_void_NN(swash_invlist);
- retval = newRV_noinc(swash_invlist);
- }
- }
-
- CORE_SWASH_INIT_RETURN(retval);
-#undef CORE_SWASH_INIT_RETURN
-}
-
-
-/* This API is wrong for special case conversions since we may need to
- * return several Unicode characters for a single Unicode character
- * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
- * the lower-level routine, and it is similarly broken for returning
- * multiple values. --jhi
- * For those, you should use S__to_utf8_case() instead */
-/* Now SWASHGET is recasted into S_swatch_get in this file. */
-
-/* Note:
- * Returns the value of property/mapping C<swash> for the first character
- * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
- * assumed to be in well-formed UTF-8. If C<do_utf8> is false, the string C<ptr>
- * is assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
- *
- * A "swash" is a hash which contains initially the keys/values set up by
- * SWASHNEW. The purpose is to be able to completely represent a Unicode
- * property for all possible code points. Things are stored in a compact form
- * (see utf8_heavy.pl) so that calculation is required to find the actual
- * property value for a given code point. As code points are looked up, new
- * key/value pairs are added to the hash, so that the calculation doesn't have
- * to ever be re-done. Further, each calculation is done, not just for the
- * desired one, but for a whole block of code points adjacent to that one.
- * For binary properties on ASCII machines, the block is usually for 64 code
- * points, starting with a code point evenly divisible by 64. Thus if the
- * property value for code point 257 is requested, the code goes out and
- * calculates the property values for all 64 code points between 256 and 319,
- * and stores these as a single 64-bit long bit vector, called a "swatch",
- * under the key for code point 256. The key is the UTF-8 encoding for code
- * point 256, minus the final byte. Thus, if the length of the UTF-8 encoding
- * for a code point is 13 bytes, the key will be 12 bytes long. If the value
- * for code point 258 is then requested, this code realizes that it would be
- * stored under the key for 256, and would find that value and extract the
- * relevant bit, offset from 256.
- *
- * Non-binary properties are stored in as many bits as necessary to represent
- * their values (32 currently, though the code is more general than that), not
- * as single bits, but the principle is the same: the value for each key is a
- * vector that encompasses the property values for all code points whose UTF-8
- * representations are represented by the key. That is, for all code points
- * whose UTF-8 representations are length N bytes, and the key is the first N-1
- * bytes of that.
- */
-UV
-Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
-{
- HV *const hv = MUTABLE_HV(SvRV(swash));
- U32 klen;
- U32 off;
- STRLEN slen = 0;
- STRLEN needents;
- const U8 *tmps = NULL;
- SV *swatch;
- const U8 c = *ptr;
-
- PERL_ARGS_ASSERT_SWASH_FETCH;
-
- /* If it really isn't a hash, it isn't really swash; must be an inversion
- * list */
- if (SvTYPE(hv) != SVt_PVHV) {
- return _invlist_contains_cp((SV*)hv,
- (do_utf8)
- ? valid_utf8_to_uvchr(ptr, NULL)
- : c);
- }
-
- /* We store the values in a "swatch" which is a vec() value in a swash
- * hash. Code points 0-255 are a single vec() stored with key length
- * (klen) 0. All other code points have a UTF-8 representation
- * 0xAA..0xYY,0xZZ. A vec() is constructed containing all of them which
- * share 0xAA..0xYY, which is the key in the hash to that vec. So the key
- * length for them is the length of the encoded char - 1. ptr[klen] is the
- * final byte in the sequence representing the character */
- if (!do_utf8 || UTF8_IS_INVARIANT(c)) {
- klen = 0;
- needents = 256;
- off = c;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(c)) {
- klen = 0;
- needents = 256;
- off = EIGHT_BIT_UTF8_TO_NATIVE(c, *(ptr + 1));
- }
- else {
- klen = UTF8SKIP(ptr) - 1;
-
- /* Each vec() stores 2**UTF_ACCUMULATION_SHIFT values. The offset into
- * the vec is the final byte in the sequence. (In EBCDIC this is
- * converted to I8 to get consecutive values.) To help you visualize
- * all this:
- * Straight 1047 After final byte
- * UTF-8 UTF-EBCDIC I8 transform
- * U+0400: \xD0\x80 \xB8\x41\x41 \xB8\x41\xA0
- * U+0401: \xD0\x81 \xB8\x41\x42 \xB8\x41\xA1
- * ...
- * U+0409: \xD0\x89 \xB8\x41\x4A \xB8\x41\xA9
- * U+040A: \xD0\x8A \xB8\x41\x51 \xB8\x41\xAA
- * ...
- * U+0412: \xD0\x92 \xB8\x41\x59 \xB8\x41\xB2
- * U+0413: \xD0\x93 \xB8\x41\x62 \xB8\x41\xB3
- * ...
- * U+041B: \xD0\x9B \xB8\x41\x6A \xB8\x41\xBB
- * U+041C: \xD0\x9C \xB8\x41\x70 \xB8\x41\xBC
- * ...
- * U+041F: \xD0\x9F \xB8\x41\x73 \xB8\x41\xBF
- * U+0420: \xD0\xA0 \xB8\x42\x41 \xB8\x42\x41
- *
- * (There are no discontinuities in the elided (...) entries.)
- * The UTF-8 key for these 33 code points is '\xD0' (which also is the
- * key for the next 31, up through U+043F, whose UTF-8 final byte is
- * \xBF). Thus in UTF-8, each key is for a vec() for 64 code points.
- * The final UTF-8 byte, which ranges between \x80 and \xBF, is an
- * index into the vec() swatch (after subtracting 0x80, which we
- * actually do with an '&').
- * In UTF-EBCDIC, each key is for a 32 code point vec(). The first 32
- * code points above have key '\xB8\x41'. The final UTF-EBCDIC byte has
- * dicontinuities which go away by transforming it into I8, and we
- * effectively subtract 0xA0 to get the index. */
- needents = (1 << UTF_ACCUMULATION_SHIFT);
- off = NATIVE_UTF8_TO_I8(ptr[klen]) & UTF_CONTINUATION_MASK;
- }
-
- /*
- * This single-entry cache saves about 1/3 of the UTF-8 overhead in test
- * suite. (That is, only 7-8% overall over just a hash cache. Still,
- * it's nothing to sniff at.) Pity we usually come through at least
- * two function calls to get here...
- *
- * NB: this code assumes that swatches are never modified, once generated!
- */
-
- if (hv == PL_last_swash_hv &&
- klen == PL_last_swash_klen &&
- (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
- {
- tmps = PL_last_swash_tmps;
- slen = PL_last_swash_slen;
- }
- else {
- /* Try our second-level swatch cache, kept in a hash. */
- SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
-
- /* If not cached, generate it via swatch_get */
- if (!svp || !SvPOK(*svp)
- || !(tmps = (const U8*)SvPV_const(*svp, slen)))
- {
- if (klen) {
- const UV code_point = valid_utf8_to_uvchr(ptr, NULL);
- swatch = swatch_get(swash,
- code_point & ~((UV)needents - 1),
- needents);
- }
- else { /* For the first 256 code points, the swatch has a key of
- length 0 */
- swatch = swatch_get(swash, 0, needents);
- }
-
- if (IN_PERL_COMPILETIME)
- CopHINTS_set(PL_curcop, PL_hints);
-
- svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
-
- if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
- || (slen << 3) < needents)
- Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
- "svp=%p, tmps=%p, slen=%" UVuf ", needents=%" UVuf,
- svp, tmps, (UV)slen, (UV)needents);
- }
-
- PL_last_swash_hv = hv;
- assert(klen <= sizeof(PL_last_swash_key));
- PL_last_swash_klen = (U8)klen;
- /* FIXME change interpvar.h? */
- PL_last_swash_tmps = (U8 *) tmps;
- PL_last_swash_slen = slen;
- if (klen)
- Copy(ptr, PL_last_swash_key, klen, U8);
- }
-
- switch ((int)((slen << 3) / needents)) {
- case 1:
- return ((UV) tmps[off >> 3] & (1 << (off & 7))) != 0;
- case 8:
- return ((UV) tmps[off]);
- case 16:
- off <<= 1;
- return
- ((UV) tmps[off ] << 8) +
- ((UV) tmps[off + 1]);
- case 32:
- off <<= 2;
- return
- ((UV) tmps[off ] << 24) +
- ((UV) tmps[off + 1] << 16) +
- ((UV) tmps[off + 2] << 8) +
- ((UV) tmps[off + 3]);
- }
- Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
- "slen=%" UVuf ", needents=%" UVuf, (UV)slen, (UV)needents);
- NORETURN_FUNCTION_END;
-}
-
-/* Read a single line of the main body of the swash input text. These are of
- * the form:
- * 0053 0056 0073
- * where each number is hex. The first two numbers form the minimum and
- * maximum of a range, and the third is the value associated with the range.
- * Not all swashes should have a third number
- *
- * On input: l points to the beginning of the line to be examined; it points
- * to somewhere in the string of the whole input text, and is
- * terminated by a \n or the null string terminator.
- * lend points to the null terminator of that string
- * wants_value is non-zero if the swash expects a third number
- * typestr is the name of the swash's mapping, like 'ToLower'
- * On output: *min, *max, and *val are set to the values read from the line.
- * returns a pointer just beyond the line examined. If there was no
- * valid min number on the line, returns lend+1
- */
-
-STATIC U8*
-S_swash_scan_list_line(pTHX_ U8* l, U8* const lend, UV* min, UV* max, UV* val,
- const bool wants_value, const U8* const typestr)
-{
- const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
- STRLEN numlen; /* Length of the number */
- I32 flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
-
- /* nl points to the next \n in the scan */
- U8* const nl = (U8*)memchr(l, '\n', lend - l);
-
- PERL_ARGS_ASSERT_SWASH_SCAN_LIST_LINE;
-
- /* Get the first number on the line: the range minimum */
- numlen = lend - l;
- *min = grok_hex((char *)l, &numlen, &flags, NULL);
- *max = *min; /* So can never return without setting max */
- if (numlen) /* If found a hex number, position past it */
- l += numlen;
- else if (nl) { /* Else, go handle next line, if any */
- return nl + 1; /* 1 is length of "\n" */
- }
- else { /* Else, no next line */
- return lend + 1; /* to LIST's end at which \n is not found */
- }
-
- /* The max range value follows, separated by a BLANK */
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *max = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else /* If no value here, it is a single element range */
- *max = *min;
-
- /* Non-binary tables have a third entry: what the first element of the
- * range maps to. The map for those currently read here is in hex */
- if (wants_value) {
- if (isBLANK(*l)) {
- ++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *val = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- *val = 0;
- }
- else {
- *val = 0;
- if (typeto) {
- /* diag_listed_as: To%s: illegal mapping '%s' */
- Perl_croak(aTHX_ "%s: illegal mapping '%s'",
- typestr, l);
- }
- }
- }
- else
- *val = 0; /* bits == 1, then any val should be ignored */
- }
- else { /* Nothing following range min, should be single element with no
- mapping expected */
- if (wants_value) {
- *val = 0;
- if (typeto) {
- /* diag_listed_as: To%s: illegal mapping '%s' */
- Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
- }
- }
- else
- *val = 0; /* bits == 1, then val should be ignored */
- }
-
- /* Position to next line if any, or EOF */
- if (nl)
- l = nl + 1;
- else
- l = lend;
-
- return l;
-}
-
-/* Note:
- * Returns a swatch (a bit vector string) for a code point sequence
- * that starts from the value C<start> and comprises the number C<span>.
- * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
- * Should be used via swash_fetch, which will cache the swatch in C<swash>.
- */
-STATIC SV*
-S_swatch_get(pTHX_ SV* swash, UV start, UV span)
-{
- SV *swatch;
- U8 *l, *lend, *x, *xend, *s, *send;
- STRLEN lcur, xcur, scur;
- HV *const hv = MUTABLE_HV(SvRV(swash));
- SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
-
- SV** listsvp = NULL; /* The string containing the main body of the table */
- SV** extssvp = NULL;
- SV** invert_it_svp = NULL;
- U8* typestr = NULL;
- STRLEN bits;
- STRLEN octets; /* if bits == 1, then octets == 0 */
- UV none;
- UV end = start + span;
-
- if (invlistsvp == NULL) {
- SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
- SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
- SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
- extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
- listsvp = hv_fetchs(hv, "LIST", FALSE);
- invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
- bits = SvUV(*bitssvp);
- none = SvUV(*nonesvp);
- typestr = (U8*)SvPV_nolen(*typesvp);
- }
- else {
- bits = 1;
- none = 0;
- }
- octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
- PERL_ARGS_ASSERT_SWATCH_GET;
-
- if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
- Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
- (UV)bits);
- }
-
- /* If overflowed, use the max possible */
- if (end < start) {
- end = UV_MAX;
- span = end - start;
- }
-
- /* create and initialize $swatch */
- scur = octets ? (span * octets) : (span + 7) / 8;
- swatch = newSV(scur);
- SvPOK_on(swatch);
- s = (U8*)SvPVX(swatch);
- if (octets && none) {
- const U8* const e = s + scur;
- while (s < e) {
- if (bits == 8)
- *s++ = (U8)(none & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((none >> 24) & 0xff);
- *s++ = (U8)((none >> 16) & 0xff);
- *s++ = (U8)((none >> 8) & 0xff);
- *s++ = (U8)( none & 0xff);
- }
- }
- *s = '\0';
- }
- else {
- (void)memzero((U8*)s, scur + 1);
- }
- SvCUR_set(swatch, scur);
- s = (U8*)SvPVX(swatch);
-
- if (invlistsvp) { /* If has an inversion list set up use that */
- _invlist_populate_swatch(*invlistsvp, start, end, s);
- return swatch;
- }
-
- /* read $swash->{LIST} */
- l = (U8*)SvPV(*listsvp, lcur);
- lend = l + lcur;
- while (l < lend) {
- UV min, max, val, upper;
- l = swash_scan_list_line(l, lend, &min, &max, &val,
- cBOOL(octets), typestr);
- if (l > lend) {
- break;
- }
-
- /* If looking for something beyond this range, go try the next one */
- if (max < start)
- continue;
-
- /* <end> is generally 1 beyond where we want to set things, but at the
- * platform's infinity, where we can't go any higher, we want to
- * include the code point at <end> */
- upper = (max < end)
- ? max
- : (max != UV_MAX || end != UV_MAX)
- ? end - 1
- : end;
-
- if (octets) {
- UV key;
- if (min < start) {
- if (!none || val < none) {
- val += start - min;
- }
- min = start;
- }
- for (key = min; key <= upper; key++) {
- STRLEN offset;
- /* offset must be non-negative (start <= min <= key < end) */
- offset = octets * (key - start);
- if (bits == 8)
- s[offset] = (U8)(val & 0xff);
- else if (bits == 16) {
- s[offset ] = (U8)((val >> 8) & 0xff);
- s[offset + 1] = (U8)( val & 0xff);
- }
- else if (bits == 32) {
- s[offset ] = (U8)((val >> 24) & 0xff);
- s[offset + 1] = (U8)((val >> 16) & 0xff);
- s[offset + 2] = (U8)((val >> 8) & 0xff);
- s[offset + 3] = (U8)( val & 0xff);
- }
-
- if (!none || val < none)
- ++val;
- }
- }
- else { /* bits == 1, then val should be ignored */
- UV key;
- if (min < start)
- min = start;
-
- for (key = min; key <= upper; key++) {
- const STRLEN offset = (STRLEN)(key - start);
- s[offset >> 3] |= 1 << (offset & 7);
- }
- }
- } /* while */
-
- /* Invert if the data says it should be. Assumes that bits == 1 */
- if (invert_it_svp && SvUV(*invert_it_svp)) {
-
- /* Unicode properties should come with all bits above PERL_UNICODE_MAX
- * be 0, and their inversion should also be 0, as we don't succeed any
- * Unicode property matches for non-Unicode code points */
- if (start <= PERL_UNICODE_MAX) {
-
- /* The code below assumes that we never cross the
- * Unicode/above-Unicode boundary in a range, as otherwise we would
- * have to figure out where to stop flipping the bits. Since this
- * boundary is divisible by a large power of 2, and swatches comes
- * in small powers of 2, this should be a valid assumption */
- assert(start + span - 1 <= PERL_UNICODE_MAX);
-
- send = s + scur;
- while (s < send) {
- *s = ~(*s);
- s++;
- }
- }
- }
-
- /* read $swash->{EXTRAS}
- * This code also copied to swash_to_invlist() below */
- x = (U8*)SvPV(*extssvp, xcur);
- xend = x + xcur;
- while (x < xend) {
- STRLEN namelen;
- U8 *namestr;
- SV** othersvp;
- HV* otherhv;
- STRLEN otherbits;
- SV **otherbitssvp, *other;
- U8 *s, *o, *nl;
- STRLEN slen, olen;
-
- const U8 opc = *x++;
- if (opc == '\n')
- continue;
-
- nl = (U8*)memchr(x, '\n', xend - x);
-
- if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
- if (nl) {
- x = nl + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- x = xend; /* to EXTRAS' end at which \n is not found */
- break;
- }
- }
-
- namestr = x;
- if (nl) {
- namelen = nl - namestr;
- x = nl + 1;
- }
- else {
- namelen = xend - namestr;
- x = xend;
- }
-
- othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
- otherhv = MUTABLE_HV(SvRV(*othersvp));
- otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
- otherbits = (STRLEN)SvUV(*otherbitssvp);
- if (bits < otherbits)
- Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
- "bits=%" UVuf ", otherbits=%" UVuf, (UV)bits, (UV)otherbits);
-
- /* The "other" swatch must be destroyed after. */
- other = swatch_get(*othersvp, start, span);
- o = (U8*)SvPV(other, olen);
-
- if (!olen)
- Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
-
- s = (U8*)SvPV(swatch, slen);
- if (bits == 1 && otherbits == 1) {
- if (slen != olen)
- Perl_croak(aTHX_ "panic: swatch_get found swatch length "
- "mismatch, slen=%" UVuf ", olen=%" UVuf,
- (UV)slen, (UV)olen);
-
- switch (opc) {
- case '+':
- while (slen--)
- *s++ |= *o++;
- break;
- case '!':
- while (slen--)
- *s++ |= ~*o++;
- break;
- case '-':
- while (slen--)
- *s++ &= ~*o++;
- break;
- case '&':
- while (slen--)
- *s++ &= *o++;
- break;
- default:
- break;
- }
- }
- else {
- STRLEN otheroctets = otherbits >> 3;
- STRLEN offset = 0;
- U8* const send = s + slen;
-
- while (s < send) {
- UV otherval = 0;
-
- if (otherbits == 1) {
- otherval = (o[offset >> 3] >> (offset & 7)) & 1;
- ++offset;
- }
- else {
- STRLEN vlen = otheroctets;
- otherval = *o++;
- while (--vlen) {
- otherval <<= 8;
- otherval |= *o++;
- }
- }
-
- if (opc == '+' && otherval)
- NOOP; /* replace with otherval */
- else if (opc == '!' && !otherval)
- otherval = 1;
- else if (opc == '-' && otherval)
- otherval = 0;
- else if (opc == '&' && !otherval)
- otherval = 0;
- else {
- s += octets; /* no replacement */
- continue;
- }
-
- if (bits == 8)
- *s++ = (U8)( otherval & 0xff);
- else if (bits == 16) {
- *s++ = (U8)((otherval >> 8) & 0xff);
- *s++ = (U8)( otherval & 0xff);
- }
- else if (bits == 32) {
- *s++ = (U8)((otherval >> 24) & 0xff);
- *s++ = (U8)((otherval >> 16) & 0xff);
- *s++ = (U8)((otherval >> 8) & 0xff);
- *s++ = (U8)( otherval & 0xff);
- }
- }
- }
- sv_free(other); /* through with it! */
- } /* while */
- return swatch;
-}
-
-SV*
-Perl__swash_to_invlist(pTHX_ SV* const swash)
-{
-
- /* Subject to change or removal. For use only in one place in regcomp.c.
- * Ownership is given to one reference count in the returned SV* */
-
- U8 *l, *lend;
- char *loc;
- STRLEN lcur;
- HV *const hv = MUTABLE_HV(SvRV(swash));
- UV elements = 0; /* Number of elements in the inversion list */
- U8 empty[] = "";
- SV** listsvp;
- SV** typesvp;
- SV** bitssvp;
- SV** extssvp;
- SV** invert_it_svp;
-
- U8* typestr;
- STRLEN bits;
- STRLEN octets; /* if bits == 1, then octets == 0 */
- U8 *x, *xend;
- STRLEN xcur;
-
- SV* invlist;
-
- PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
-
- /* If not a hash, it must be the swash's inversion list instead */
- if (SvTYPE(hv) != SVt_PVHV) {
- return SvREFCNT_inc_simple_NN((SV*) hv);
- }
-
- /* The string containing the main body of the table */
- listsvp = hv_fetchs(hv, "LIST", FALSE);
- typesvp = hv_fetchs(hv, "TYPE", FALSE);
- bitssvp = hv_fetchs(hv, "BITS", FALSE);
- extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
- invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
- typestr = (U8*)SvPV_nolen(*typesvp);
- bits = SvUV(*bitssvp);
- octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
- /* read $swash->{LIST} */
- if (SvPOK(*listsvp)) {
- l = (U8*)SvPV(*listsvp, lcur);
- }
- else {
- /* LIST legitimately doesn't contain a string during compilation phases
- * of Perl itself, before the Unicode tables are generated. In this
- * case, just fake things up by creating an empty list */
- l = empty;
- lcur = 0;
- }
- loc = (char *) l;
- lend = l + lcur;
-
- if (*l == 'V') { /* Inversion list format */
- const char *after_atou = (char *) lend;
- UV element0;
- UV* other_elements_ptr;
-
- /* The first number is a count of the rest */
- l++;
- if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
- Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
- " at start of inversion list");
- }
- if (elements == 0) {
- invlist = _new_invlist(0);
- }
- else {
- l = (U8 *) after_atou;
-
- /* Get the 0th element, which is needed to setup the inversion list
- * */
- while (isSPACE(*l)) l++;
- after_atou = (char *) lend;
- if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
- Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
- " inversion list");
- }
- l = (U8 *) after_atou;
- invlist = _setup_canned_invlist(elements, element0,
- &other_elements_ptr);
- elements--;
-
- /* Then just populate the rest of the input */
- while (elements-- > 0) {
- if (l > lend) {
- Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
- " elements than available", elements);
- }
- while (isSPACE(*l)) l++;
- after_atou = (char *) lend;
- if (!grok_atoUV((const char *)l, other_elements_ptr++,
- &after_atou))
- {
- Perl_croak(aTHX_ "panic: Expecting a valid element"
- " in inversion list");
- }
- l = (U8 *) after_atou;
- }
- }
- }
- else {
-
- /* Scan the input to count the number of lines to preallocate array
- * size based on worst possible case, which is each line in the input
- * creates 2 elements in the inversion list: 1) the beginning of a
- * range in the list; 2) the beginning of a range not in the list. */
- while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) {
- elements += 2;
- loc++;
- }
-
- /* If the ending is somehow corrupt and isn't a new line, add another
- * element for the final range that isn't in the inversion list */
- if (! (*lend == '\n'
- || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
- {
- elements++;
- }
-
- invlist = _new_invlist(elements);
-
- /* Now go through the input again, adding each range to the list */
- while (l < lend) {
- UV start, end;
- UV val; /* Not used by this function */
-
- l = swash_scan_list_line(l, lend, &start, &end, &val,
- cBOOL(octets), typestr);
-
- if (l > lend) {
- break;
- }
-
- invlist = _add_range_to_invlist(invlist, start, end);
- }
- }
-
- /* Invert if the data says it should be */
- if (invert_it_svp && SvUV(*invert_it_svp)) {
- _invlist_invert(invlist);
- }
-
- /* This code is copied from swatch_get()
- * read $swash->{EXTRAS} */
- x = (U8*)SvPV(*extssvp, xcur);
- xend = x + xcur;
- while (x < xend) {
- STRLEN namelen;
- U8 *namestr;
- SV** othersvp;
- HV* otherhv;
- STRLEN otherbits;
- SV **otherbitssvp, *other;
- U8 *nl;
-
- const U8 opc = *x++;
- if (opc == '\n')
- continue;
-
- nl = (U8*)memchr(x, '\n', xend - x);
-
- if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
- if (nl) {
- x = nl + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- x = xend; /* to EXTRAS' end at which \n is not found */
- break;
- }
- }
-
- namestr = x;
- if (nl) {
- namelen = nl - namestr;
- x = nl + 1;
- }
- else {
- namelen = xend - namestr;
- x = xend;
- }
-
- othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
- otherhv = MUTABLE_HV(SvRV(*othersvp));
- otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
- otherbits = (STRLEN)SvUV(*otherbitssvp);
-
- if (bits != otherbits || bits != 1) {
- Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
- "properties, bits=%" UVuf ", otherbits=%" UVuf,
- (UV)bits, (UV)otherbits);
- }
-
- /* The "other" swatch must be destroyed after. */
- other = _swash_to_invlist((SV *)*othersvp);
-
- /* End of code copied from swatch_get() */
- switch (opc) {
- case '+':
- _invlist_union(invlist, other, &invlist);
- break;
- case '!':
- _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
- break;
- case '-':
- _invlist_subtract(invlist, other, &invlist);
- break;
- case '&':
- _invlist_intersection(invlist, other, &invlist);
- break;
- default:
- break;
- }
- sv_free(other); /* through with it! */
- }
-
- SvREADONLY_on(invlist);
- return invlist;
-}
-
-SV*
-Perl__get_swash_invlist(pTHX_ SV* const swash)
-{
- SV** ptr;
-
- PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
-
- if (! SvROK(swash)) {
- return NULL;
- }
-
- /* If it really isn't a hash, it isn't really swash; must be an inversion
- * list */
- if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
- return SvRV(swash);
- }
-
- ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
- if (! ptr) {
- return NULL;
- }
-
- return *ptr;
-}
-
bool
Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
{
/*
=for apidoc pv_uni_display
-Build to the scalar C<dsv> a displayable version of the string C<spv>,
-length C<len>, the displayable version being at most C<pvlim> bytes long
-(if longer, the rest is truncated and C<"..."> will be appended).
+Build to the scalar C<dsv> a displayable version of the UTF-8 encoded string
+C<spv>, length C<len>, the displayable version being at most C<pvlim> bytes
+long (if longer, the rest is truncated and C<"..."> will be appended).
The C<flags> argument can have C<UNI_DISPLAY_ISPRINT> set to display
C<isPRINT()>able characters as themselves, C<UNI_DISPLAY_BACKSLASH>
C<UNI_DISPLAY_QQ> (and its alias C<UNI_DISPLAY_REGEX>) have both
C<UNI_DISPLAY_BACKSLASH> and C<UNI_DISPLAY_ISPRINT> turned on.
+Additionally, there is now C<UNI_DISPLAY_BACKSPACE> which allows C<\b> for a
+backspace, but only when C<UNI_DISPLAY_BACKSLASH> also is set.
+
The pointer to the PV of the C<dsv> is returned.
See also L</sv_uni_display>.
SvUTF8_off(dsv);
for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
UV u;
- /* This serves double duty as a flag and a character to print after
- a \ when flags & UNI_DISPLAY_BACKSLASH is true.
- */
- char ok = 0;
+ bool ok = 0;
if (pvlim && SvCUR(dsv) >= pvlim) {
truncated++;
if (u < 256) {
const unsigned char c = (unsigned char)u & 0xFF;
if (flags & UNI_DISPLAY_BACKSLASH) {
- switch (c) {
- case '\n':
- ok = 'n'; break;
- case '\r':
- ok = 'r'; break;
- case '\t':
- ok = 't'; break;
- case '\f':
- ok = 'f'; break;
- case '\a':
- ok = 'a'; break;
- case '\\':
- ok = '\\'; break;
- default: break;
- }
- if (ok) {
- const char string = ok;
- sv_catpvs(dsv, "\\");
- sv_catpvn(dsv, &string, 1);
- }
- }
+ if ( isMNEMONIC_CNTRL(c)
+ && ( c != '\b'
+ || (flags & UNI_DISPLAY_BACKSPACE)))
+ {
+ const char * mnemonic = cntrl_to_mnemonic(c);
+ sv_catpvn(dsv, mnemonic, strlen(mnemonic));
+ ok = 1;
+ }
+ else if (c == '\\') {
+ sv_catpvs(dsv, "\\\\");
+ ok = 1;
+ }
+ }
/* isPRINT() is the locale-blind version. */
if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
const char string = c;
For case-insensitiveness, the "casefolding" of Unicode is used
instead of upper/lowercasing both the characters, see
-L<http://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
+L<https://www.unicode.org/unicode/reports/tr21/> (Case Mappings).
=cut */
if (flags & FOLDEQ_LOCALE) {
if (IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLDEQ_LOCALE;
+ if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+ flags_for_folder |= FOLD_FLAGS_LOCALE;
+ }
+ else {
+ flags &= ~FOLDEQ_LOCALE;
+ }
}
else {
flags_for_folder |= FOLD_FLAGS_LOCALE;
return 1;
}
-/* XXX The next two functions should likely be moved to mathoms.c once all
- * occurrences of them are removed from the core; some cpan-upstream modules
- * still use them */
-
-U8 *
-Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8;
-
- return uvoffuni_to_utf8_flags(d, uv, 0);
-}
-
-/*
-=for apidoc utf8n_to_uvuni
-
-Instead use L</utf8_to_uvchr_buf>, or rarely, L</utf8n_to_uvchr>.
-
-This function was useful for code that wanted to handle both EBCDIC and
-ASCII platforms with Unicode properties, but starting in Perl v5.20, the
-distinctions between the platforms have mostly been made invisible to most
-code, so this function is quite unlikely to be what you want. If you do need
-this precise functionality, use instead
-C<L<NATIVE_TO_UNI(utf8_to_uvchr_buf(...))|/utf8_to_uvchr_buf>>
-or C<L<NATIVE_TO_UNI(utf8n_to_uvchr(...))|/utf8n_to_uvchr>>.
-
-=cut
-*/
-
-UV
-Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
-{
- PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;
-
- return NATIVE_TO_UNI(utf8n_to_uvchr(s, curlen, retlen, flags));
-}
-
-/*
-=for apidoc uvuni_to_utf8_flags
-
-Instead you almost certainly want to use L</uvchr_to_utf8> or
-L</uvchr_to_utf8_flags>.
-
-This function is a deprecated synonym for L</uvoffuni_to_utf8_flags>,
-which itself, while not deprecated, should be used only in isolated
-circumstances. These functions were useful for code that wanted to handle
-both EBCDIC and ASCII platforms with Unicode properties, but starting in Perl
-v5.20, the distinctions between the platforms have mostly been made invisible
-to most code, so this function is quite unlikely to be what you want.
-
-=cut
-*/
-
-U8 *
-Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
-{
- PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
-
- return uvoffuni_to_utf8_flags(d, uv, flags);
-}
-
-/*
-=for apidoc utf8_to_uvchr
-
-Returns the native code point of the first character in the string C<s>
-which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
-length, in bytes, of that character.
-
-Some, but not all, UTF-8 malformations are detected, and in fact, some
-malformed input could cause reading beyond the end of the input buffer, which
-is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
-
-If C<s> points to one of the detected malformations, and UTF8 warnings are
-enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
-C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
-the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
-is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
-
-=cut
-*/
-
-UV
-Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
-{
- PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
-
- /* This function is unsafe if malformed UTF-8 input is given it, which is
- * why the function is deprecated. If the first byte of the input
- * indicates that there are more bytes remaining in the sequence that forms
- * the character than there are in the input buffer, it can read past the
- * end. But we can make it safe if the input string happens to be
- * NUL-terminated, as many strings in Perl are, by refusing to read past a
- * NUL. A NUL indicates the start of the next character anyway. If the
- * input isn't NUL-terminated, the function remains unsafe, as it always
- * has been.
- *
- * An initial NUL has to be handled separately, but all ASCIIs can be
- * handled the same way, speeding up this common case */
-
- if (UTF8_IS_INVARIANT(*s)) { /* Assumes 's' contains at least 1 byte */
- return (UV) *s;
- }
-
- return utf8_to_uvchr_buf(s,
- s + my_strnlen((char *) s, UTF8SKIP(s)),
- retlen);
-}
-
/*
* ex: set ts=8 sts=4 sw=4 et:
*/