"Use of code point 0x%" UVXf " is not allowed; the"
" permissible max is 0x%" UVXf;
-#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
This function is for code that wants any warning and/or error messages to be
returned to the caller rather than be displayed. All messages that would have
-been displayed if all lexcial warnings are enabled will be returned.
+been displayed if all lexical warnings are enabled will be returned.
It is just like C<L</uvchr_to_utf8_flags>> but it takes an extra parameter
placed after all the others, C<msgs>. If this parameter is 0, this function
* performance hit on these high EBCDIC code points. */
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
- if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+ if (UNLIKELY(uv > MAX_LEGAL_CP)) {
+ Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
STRLEN len = OFFUNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
- *p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
- uv >>= UTF_ACCUMULATION_SHIFT;
+ *p-- = I8_TO_NATIVE_UTF8((uv & MASK) | MARK);
+ uv >>= SHIFT;
}
*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
return d+len;
PERL_STATIC_INLINE char *
S_unexpected_non_continuation_text(pTHX_ const U8 * const s,
- /* How many bytes to print */
+ /* Max number of bytes to print */
STRLEN print_len,
/* Which one is the non-continuation */
? "immediately"
: Perl_form(aTHX_ "%d bytes",
(int) non_cont_byte_pos);
+ const U8 * x = s + non_cont_byte_pos;
+ const U8 * e = s + print_len;
PERL_ARGS_ASSERT_UNEXPECTED_NON_CONTINUATION_TEXT;
* calculated, it's likely faster to pass it; verify under DEBUGGING */
assert(expect_len == UTF8SKIP(s));
+ /* As a defensive coding measure, don't output anything past a NUL. Such
+ * bytes shouldn't be in the middle of a malformation, and could mark the
+ * end of the allocated string, and what comes after is undefined */
+ for (; x < e; x++) {
+ if (*x == '\0') {
+ x++; /* Output this particular NUL */
+ break;
+ }
+ }
+
return Perl_form(aTHX_ "%s: %s (unexpected non-continuation byte 0x%02x,"
" %s after start byte 0x%02x; need %d bytes, got %d)",
malformed_text,
- _byte_dump_string(s, print_len, 0),
+ _byte_dump_string(s, x - s, 0),
*(s + non_cont_byte_pos),
where,
*s,
*/
UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
- STRLEN curlen,
- STRLEN *retlen,
- const U32 flags)
+Perl_utf8n_to_uvchr(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags)
{
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
}
-/* The tables below come from http://bjoern.hoehrmann.de/utf-8/decoder/dfa/,
- * which requires this copyright notice */
-
-/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
-
-Permission is hereby granted, free of charge, to any person obtaining a copy of
-this software and associated documentation files (the "Software"), to deal in
-the Software without restriction, including without limitation the rights to
-use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
-of the Software, and to permit persons to whom the Software is furnished to do
-so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-*/
-
-#if 0
-static U8 utf8d_C9[] = {
- /* The first part of the table maps bytes to character classes that
- * to reduce the size of the transition table and create bitmasks. */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
- 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8, /*-FF*/
-
- /* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state. */
- 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
- 12,36,12,12,12,12,12,12,12,12,12,12
-};
-
-#endif
-
-#ifndef EBCDIC
-
-/* This is a version of the above table customized for Perl that doesn't
- * exclude surrogates and accepts start bytes up through F7 (representing
- * 2**21 - 1). */
-static U8 dfa_tab_for_perl[] = {
- /* The first part of the table maps bytes to character classes to reduce
- * the size of the transition table and create bitmasks. */
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-1F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-3F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-5F*/
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /*-7F*/
- 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /*-9F*/
- 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /*-BF*/
- 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /*-DF*/
- 10,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3, 11,4,4,4,4,4,4,4,8,8,8,8,8,8,8,8, /*-FF*/
-
- /* The second part is a transition table that maps a combination
- * of a state of the automaton and a character class to a state. */
- 0,12,24,36,96,12,12,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,/*23*/
- 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,/*47*/
- 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,/*71*/
- 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,/*95*/
- 12,36,12,12,12,12,12,36,12,36,12,12 /* 96- 107 */
-
- /* The customization was to repurpose the surrogates type '4' to instead be
- * for start bytes F1-F7. Types 5 and 6 are now unused, and their entries in
- * the transition part of the table are set to 12, so are illegal.
- *
- * To do higher code points would require expansion and some rearrangement of
- * the table. The type '1' entries for continuation bytes 80-8f would have to
- * be split into several types, because they aren't treated uniformly for
- * higher start bytes, since overlongs for F8 are 80-87; FC: 80-83; and FE:
- * 80-81. We start needing to worry about overflow if FE is included.
- * Ignoring, FE and FF, we could use type 5 for F9-FB, and 6 for FD (remember
- * from the web site that these are used to right shift). FE would
- * necessarily be type 7; and FF, type 8. And new states would have to be
- * created for F8 and FC (and FE and FF if used), so quite a bit of work would
- * be involved.
- *
- * XXX Better would be to customize the table so that the noncharacters are
- * excluded. This again is non trivial, but doing so would simplify the code
- * that uses this, and might make it small enough to make it inlinable */
-};
-
-#endif
-
/*
=for apidoc utf8n_to_uvchr_error
=item C<UTF8_GOT_NON_CONTINUATION>
The input sequence was malformed in that a non-continuation type byte was found
-in a position where only a continuation type one should be.
+in a position where only a continuation type one should be. See also
+L</C<UTF8_GOT_SHORT>>.
=item C<UTF8_GOT_OVERFLOW>
a complete sequence. In other words, the input is for a partial character
sequence.
+
+C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
+sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
+that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
+sequence was looked at. If no other flags are present, it means that the
+sequence was valid as far as it went. Depending on the application, this could
+mean one of three things:
+
+=over
+
+=item *
+
+The C<curlen> length parameter passed in was too small, and the function was
+prevented from examining all the necessary bytes.
+
+=item *
+
+The buffer being looked at is based on reading data, and the data received so
+far stopped in the middle of a character, so that the next read will
+read the remainder of this character. (It is up to the caller to deal with the
+split bytes somehow.)
+
+=item *
+
+This is a real error, and the partial sequence is all we're going to get.
+
+=back
+
=item C<UTF8_GOT_SUPER>
The input sequence was malformed in that it is for a non-Unicode code point;
*/
UV
-Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+Perl_utf8n_to_uvchr_error(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
*/
UV
-Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+Perl__utf8n_to_uvchr_msgs_helper(const U8 *s,
STRLEN curlen,
STRLEN *retlen,
const U32 flags,
{
const U8 * const s0 = s;
const U8 * send = s0 + curlen;
- U32 possible_problems = 0; /* A bit is set here for each potential problem
- found as we go along */
- UV uv = (UV) -1;
- STRLEN expectlen = 0; /* How long should this sequence be?
- (initialized to silence compilers' wrong
- warning) */
- STRLEN avail_len = 0; /* When input is too short, gives what that is */
- U32 discard_errors = 0; /* Used to save branches when 'errors' is NULL;
- this gets set and discarded */
+ U32 possible_problems; /* A bit is set here for each potential problem
+ found as we go along */
+ UV uv;
+ STRLEN expectlen; /* How long should this sequence be? */
+ STRLEN avail_len; /* When input is too short, gives what that is */
+ U32 discard_errors; /* Used to save branches when 'errors' is NULL; this
+ gets set and discarded */
/* The below are used only if there is both an overlong malformation and a
* too short one. Otherwise the first two are set to 's0' and 'send', and
* the third not used at all */
- U8 * adjusted_s0 = (U8 *) s0;
+ U8 * adjusted_s0;
U8 temp_char_buf[UTF8_MAXBYTES + 1]; /* Used to avoid a Newx in this
routine; see [perl #130921] */
- UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
+ UV uv_so_far;
+ dTHX;
- UV state = 0;
+ PERL_ARGS_ASSERT__UTF8N_TO_UVCHR_MSGS_HELPER;
- PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+ /* Here, is one of: a) malformed; b) a problematic code point (surrogate,
+ * non-unicode, or nonchar); or c) on ASCII platforms, one of the Hangul
+ * syllables that the dfa doesn't properly handle. Quickly dispose of the
+ * final case. */
+
+#ifndef EBCDIC
+
+ /* Each of the affected Hanguls starts with \xED */
+
+ if (is_HANGUL_ED_utf8_safe(s0, send)) {
+ if (retlen) {
+ *retlen = 3;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
+ }
+
+ return ((0xED & UTF_START_MASK(3)) << (2 * UTF_ACCUMULATION_SHIFT))
+ | ((s0[1] & UTF_CONTINUATION_MASK) << UTF_ACCUMULATION_SHIFT)
+ | (s0[2] & UTF_CONTINUATION_MASK);
+ }
+
+#endif
+
+ /* In conjunction with the exhaustive tests that can be enabled in
+ * APItest/t/utf8_warn_base.pl, this can make sure the dfa does precisely
+ * what it is intended to do, and that no flaws in it are masked by
+ * dropping down and executing the code below
+ assert(! isUTF8_CHAR(s0, send)
+ || UTF8_IS_SURROGATE(s0, send)
+ || UTF8_IS_SUPER(s0, send)
+ || UTF8_IS_NONCHAR(s0,send));
+ */
+
+ s = s0;
+ uv = *s0;
+ possible_problems = 0;
+ expectlen = 0;
+ avail_len = 0;
+ discard_errors = 0;
+ adjusted_s0 = (U8 *) s0;
+ uv_so_far = 0;
if (errors) {
*errors = 0;
*retlen = expectlen;
}
- /* An invariant is trivially well-formed */
- if (UTF8_IS_INVARIANT(*s0)) {
- return *s0;
- }
-
-#ifndef EBCDIC
-
- /* Measurements show that this dfa is somewhat faster than the regular code
- * below, so use it first, dropping down for the non-normal cases. */
-
-# define PERL_UTF8_DECODE_REJECT 12
-
- while (s < send && LIKELY(state != PERL_UTF8_DECODE_REJECT)) {
- UV type = dfa_tab_for_perl[*s];
-
- if (state != 0) {
- uv = (*s & 0x3fu) | (uv << UTF_ACCUMULATION_SHIFT);
- state = dfa_tab_for_perl[256 + state + type];
- }
- else {
- uv = (0xff >> type) & (*s);
- state = dfa_tab_for_perl[256 + type];
- }
-
- if (state == 0) {
-
- /* If this could be a code point that the flags don't allow (the first
- * surrogate is the first such possible one), delve further, but we already
- * have calculated 'uv' */
- if ( (flags & (UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_WARN_ILLEGAL_INTERCHANGE))
- && uv >= UNICODE_SURROGATE_FIRST)
- {
- curlen = s + 1 - s0;
- goto got_uv;
- }
-
- return uv;
- }
-
- s++;
- }
-
- /* Here, is some sort of failure. Use the full mechanism */
-
- uv = *s0;
-
-#endif
-
/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
possible_problems |= UTF8_GOT_CONTINUATION;
}
}
- got_uv:
-
/* Here, we have found all the possible problems, except for when the input
* is for a problematic code point not allowed by the input parameters. */
PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
- Newx(d, (*lenp) * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
dst = d;
while (s < send) {
*d = '\0';
*lenp = d-dst;
- /* Trim unused space */
- Renew(dst, *lenp + 1, U8);
-
return dst;
}
if (*p == '_')
return TRUE;
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_idstart);
+ return is_utf8_common(p, PL_utf8_idstart);
}
bool
return _to_upper_title_latin1((U8) c, p, lenp, 'S');
}
- uvchr_to_utf8(p, c);
- return CALL_UPPER_CASE(c, p, p, lenp);
+ return CALL_UPPER_CASE(c, NULL, p, lenp);
}
UV
return _to_upper_title_latin1((U8) c, p, lenp, 's');
}
- uvchr_to_utf8(p, c);
- return CALL_TITLE_CASE(c, p, p, lenp);
+ return CALL_TITLE_CASE(c, NULL, p, lenp);
}
STATIC U8
return to_lower_latin1((U8) c, p, lenp, 0 /* 0 is a dummy arg */ );
}
- uvchr_to_utf8(p, c);
- return CALL_LOWER_CASE(c, p, p, lenp);
+ return CALL_LOWER_CASE(c, NULL, p, lenp);
}
UV
-Perl__to_fold_latin1(pTHX_ const U8 c, U8* p, STRLEN *lenp,
- const unsigned int flags)
+Perl__to_fold_latin1(const U8 c, U8* p, STRLEN *lenp, const unsigned int flags)
{
/* Corresponds to to_lower_latin1(); <flags> bits meanings:
* FOLD_FLAGS_NOMIX_ASCII iff non-ASCII to ASCII folds are prohibited
UV converted;
PERL_ARGS_ASSERT__TO_FOLD_LATIN1;
- PERL_UNUSED_CONTEXT;
assert (! (flags & FOLD_FLAGS_LOCALE));
PERL_ARGS_ASSERT__TO_UNI_FOLD_FLAGS;
if (flags & FOLD_FLAGS_LOCALE) {
- /* Treat a UTF-8 locale as not being in locale at all, except for
- * potentially warning */
+ /* Treat a non-Turkic UTF-8 locale as not being in locale at all,
+ * except for potentially warning */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- if (IN_UTF8_CTYPE_LOCALE) {
+ if (IN_UTF8_CTYPE_LOCALE && ! PL_in_utf8_turkic_locale) {
flags &= ~FOLD_FLAGS_LOCALE;
}
else {
/* Here, above 255. If no special needs, just use the macro */
if ( ! (flags & (FOLD_FLAGS_LOCALE|FOLD_FLAGS_NOMIX_ASCII))) {
- uvchr_to_utf8(p, c);
- return CALL_FOLD_CASE(c, p, p, lenp, flags & FOLD_FLAGS_FULL);
+ return CALL_FOLD_CASE(c, NULL, p, lenp, flags & FOLD_FLAGS_FULL);
}
else { /* Otherwise, _toFOLD_utf8_flags has the intelligence to deal with
the special flags. */
}
PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
- const char *const swashname, SV* const invlist)
+S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist)
{
/* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p> is in the swash indicated by <swashname>. <swash>
- * contains a pointer to where the swash indicated by <swashname>
- * is to be stored; which this routine will do, so that future calls will
- * look at <*swash> and only generate a swash if it is not null. <invlist>
- * is NULL or an inversion list that defines the swash. If not null, it
- * saves time during initialization of the swash.
+ * starts at <p> is in the inversion list indicated by <invlist>.
*
* Note that it is assumed that the buffer length of <p> is enough to
* contain all the bytes that comprise the character. Thus, <*p> should
* have been checked before this call for mal-formedness enough to assure
- * that. */
+ * that. This function, does make sure to not look past any NUL, so it is
+ * safe to use on C, NUL-terminated, strings */
+ STRLEN len = my_strnlen((char *) p, UTF8SKIP(p));
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
* as far as there being enough bytes available in it to accommodate the
* character without reading beyond the end, and pass that number on to the
* validating routine */
- if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
- _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
- _UTF8_NO_CONFIDENCE_IN_CURLEN,
+ if (! isUTF8_CHAR(p, p + len)) {
+ _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN,
1 /* Die */ );
NOT_REACHED; /* NOTREACHED */
}
- if (invlist) {
- return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
- }
-
- assert(swash);
-
- if (!*swash) {
- U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- *swash = _core_swash_init("utf8",
-
- /* Only use the name if there is no inversion
- * list; otherwise will go out to disk */
- (invlist) ? "" : swashname,
-
- &PL_sv_undef, 1, 0, invlist, &flags);
- }
-
- return swash_fetch(*swash, p, TRUE) != 0;
+ return is_utf8_common_with_len(p, p + len, invlist);
}
PERL_STATIC_INLINE bool
S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
- SV **swash, const char *const swashname,
SV* const invlist)
{
/* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p>, and extending no further than <e - 1> is in the swash
- * indicated by <swashname>. <swash> contains a pointer to where the swash
- * indicated by <swashname> is to be stored; which this routine will do, so
- * that future calls will look at <*swash> and only generate a swash if it
- * is not null. <invlist> is NULL or an inversion list that defines the
- * swash. If not null, it saves time during initialization of the swash.
- */
+ * starts at <p>, and extending no further than <e - 1> is in the inversion
+ * list <invlist>. */
+
+ UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
- if (! isUTF8_CHAR(p, e)) {
+ if (cp == 0 && (p >= e || *p != '\0')) {
_force_out_malformed_utf8_message(p, e, 0, 1);
NOT_REACHED; /* NOTREACHED */
}
- if (invlist) {
- return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
- }
-
- assert(swash);
-
- if (!*swash) {
- U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- *swash = _core_swash_init("utf8",
-
- /* Only use the name if there is no inversion
- * list; otherwise will go out to disk */
- (invlist) ? "" : swashname,
-
- &PL_sv_undef, 1, 0, invlist, &flags);
- }
-
- return swash_fetch(*swash, p, TRUE) != 0;
+ assert(invlist);
+ return _invlist_contains_cp(invlist, cp);
}
STATIC void
if (instr(file, "mathoms.c")) {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s()"
+ "In %s, line %d, starting in Perl v5.32, %s()"
" will be removed. Avoid this message by"
" converting to use %s().\n",
file, line, name, alternative);
}
else {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s() will"
+ "In %s, line %d, starting in Perl v5.32, %s() will"
" require an additional parameter. Avoid this"
" message by converting to use %s().\n",
file, line, name, alternative);
case _CC_GRAPH:
case _CC_CASED:
- return is_utf8_common(p,
- NULL,
- "This is buggy if this gets used",
- PL_XPosix_ptrs[classnum]);
+ return is_utf8_common(p, PL_XPosix_ptrs[classnum]);
case _CC_SPACE:
return is_XPERLSPACE_high(p);
case _CC_VERTSPACE:
return is_VERTWS_high(p);
case _CC_IDFIRST:
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idstart);
+ return is_utf8_common(p, PL_utf8_perl_idstart);
case _CC_IDCONT:
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idcont);
+ return is_utf8_common(p, PL_utf8_perl_idcont);
}
}
{
PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_XPosix_ptrs[classnum]);
+ return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idstart);
+ return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart);
}
bool
if (*p == '_')
return TRUE;
- return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
+ return is_utf8_common(p, PL_utf8_xidstart);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idcont);
+ return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
- return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
+ return is_utf8_common(p, PL_utf8_idcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
- return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
+ return is_utf8_common(p, PL_utf8_xidcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_MARK;
- return is_utf8_common(p, &PL_utf8_mark, "IsM", NULL);
+ return is_utf8_common(p, PL_utf8_mark);
}
STATIC UV
S__to_utf8_case(pTHX_ const UV uv1, const U8 *p,
U8* ustrp, STRLEN *lenp,
SV *invlist, const int * const invmap,
- const int * const * aux_tables,
+ const unsigned int * const * const aux_tables,
const U8 * const aux_table_lengths,
const char * const normal)
{
}
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
- if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+ if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_EXTERNALLY_LEGAL_CP);
+ MAX_LEGAL_CP);
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
{
unsigned int i;
- const int * cp_list;
+ const unsigned int * cp_list;
U8 * d;
+
+ /* 'index' is guaranteed to be non-negative, as this is an inversion
+ * map that covers all possible inputs. See [perl #133365] */
SSize_t index = _invlist_search(invlist, uv1);
IV base = invmap[index];
/* Here, there was no mapping defined, which means that the code point maps
* to itself. Return the inputs */
cases_to_self:
- len = UTF8SKIP(p);
- if (p != ustrp) { /* Don't copy onto itself */
- Copy(p, ustrp, len, U8);
+ if (p) {
+ len = UTF8SKIP(p);
+ if (p != ustrp) { /* Don't copy onto itself */
+ Copy(p, ustrp, len, U8);
+ }
+ *lenp = len;
+ }
+ else {
+ *lenp = uvchr_to_utf8(ustrp, uv1) - ustrp;
}
-
- if (lenp)
- *lenp = len;
return uv1;
}
+Size_t
+Perl__inverse_folds(pTHX_ const UV cp, unsigned int * first_folds_to,
+ const unsigned int ** remaining_folds_to)
+{
+ /* Returns the count of the number of code points that fold to the input
+ * 'cp' (besides itself).
+ *
+ * If the return is 0, there is nothing else that folds to it, and
+ * '*first_folds_to' is set to 0, and '*remaining_folds_to' is set to NULL.
+ *
+ * If the return is 1, '*first_folds_to' is set to the single code point,
+ * and '*remaining_folds_to' is set to NULL.
+ *
+ * Otherwise, '*first_folds_to' is set to a code point, and
+ * '*remaining_fold_to' is set to an array that contains the others. The
+ * length of this array is the returned count minus 1.
+ *
+ * The reason for this convolution is to avoid having to deal with
+ * allocating and freeing memory. The lists are already constructed, so
+ * the return can point to them, but single code points aren't, so would
+ * need to be constructed if we didn't employ something like this API */
+
+ /* 'index' is guaranteed to be non-negative, as this is an inversion map
+ * that covers all possible inputs. See [perl #133365] */
+ SSize_t index = _invlist_search(PL_utf8_foldclosures, cp);
+ int base = _Perl_IVCF_invmap[index];
+
+ PERL_ARGS_ASSERT__INVERSE_FOLDS;
+
+ if (base == 0) { /* No fold */
+ *first_folds_to = 0;
+ *remaining_folds_to = NULL;
+ return 0;
+ }
+
+#ifndef HAS_IVCF_AUX_TABLES /* This Unicode version only has 1-1 folds */
+
+ assert(base > 0);
+
+#else
+
+ if (UNLIKELY(base < 0)) { /* Folds to more than one character */
+
+ /* The data structure is set up so that the absolute value of 'base' is
+ * an index into a table of pointers to arrays, with the array
+ * corresponding to the index being the list of code points that fold
+ * to 'cp', and the parallel array containing the length of the list
+ * array */
+ *first_folds_to = IVCF_AUX_TABLE_ptrs[-base][0];
+ *remaining_folds_to = IVCF_AUX_TABLE_ptrs[-base] + 1; /* +1 excludes
+ *first_folds_to
+ */
+ return IVCF_AUX_TABLE_lengths[-base];
+ }
+
+#endif
+
+ /* Only the single code point. This works like 'fc(G) = G - A + a' */
+ *first_folds_to = base + cp - invlist_array(PL_utf8_foldclosures)[index];
+ *remaining_folds_to = NULL;
+ return 1;
+}
+
STATIC UV
S_check_locale_boundary_crossing(pTHX_ const U8* const p, const UV result,
U8* const ustrp, STRLEN *lenp)
if (*e == NULL) {
utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
- *e = p + UTF8SKIP(p);
+
+ /* strnlen() makes this function safe for the common case of
+ * NUL-terminated strings */
+ *e = p + my_strnlen((char *) p, UTF8SKIP(p));
/* For mathoms.c calls, we use the function name we know is stored
* there. It could be part of a larger path */
return utf8n_flags;
}
+STATIC UV
+S_turkic_fc(pTHX_ const U8 * const p, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the foldcase of the input UTF-8 encoded sequence from
+ * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic foldcased
+ * sequence, and the entire sequence will be stored in *ustrp. ustrp will
+ * contain *lenp bytes
+ *
+ * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and LATIN SMALL LETTER
+ * DOTLESS I */
+
+ PERL_ARGS_ASSERT_TURKIC_FC;
+ assert(e > p);
+
+ if (UNLIKELY(*p == 'I')) {
+ *lenp = 2;
+ ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ return LATIN_SMALL_LETTER_DOTLESS_I;
+ }
+
+ if (UNLIKELY(memBEGINs(p, e - p,
+ LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8)))
+ {
+ *lenp = 1;
+ *ustrp = 'i';
+ return 'i';
+ }
+
+ return 0;
+}
+
+STATIC UV
+S_turkic_lc(pTHX_ const U8 * const p0, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the lowercase of the input UTF-8 encoded sequence from
+ * p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic lowercased
+ * sequence, and the entire sequence will be stored in *ustrp. ustrp will
+ * contain *lenp bytes */
+
+ PERL_ARGS_ASSERT_TURKIC_LC;
+ assert(e > p0);
+
+ /* A 'I' requires context as to what to do */
+ if (UNLIKELY(*p0 == 'I')) {
+ const U8 * p = p0 + 1;
+
+ /* According to the Unicode SpecialCasing.txt file, a capital 'I'
+ * modified by a dot above lowercases to 'i' even in turkic locales. */
+ while (p < e) {
+ UV cp;
+
+ if (memBEGINs(p, e - p, COMBINING_DOT_ABOVE_UTF8)) {
+ ustrp[0] = 'i';
+ *lenp = 1;
+ return 'i';
+ }
+
+ /* For the dot above to modify the 'I', it must be part of a
+ * combining sequence immediately following the 'I', and no other
+ * modifier with a ccc of 230 may intervene */
+ cp = utf8_to_uvchr_buf(p, e, NULL);
+ if (! _invlist_contains_cp(PL_CCC_non0_non230, cp)) {
+ break;
+ }
+
+ /* Here the combining sequence continues */
+ p += UTF8SKIP(p);
+ }
+ }
+
+ /* In all other cases the lc is the same as the fold */
+ return turkic_fc(p0, e, ustrp, lenp);
+}
+
+STATIC UV
+S_turkic_uc(pTHX_ const U8 * const p, const U8 * const e,
+ U8 * ustrp, STRLEN *lenp)
+{
+ /* Returns 0 if the upper or title-case of the input UTF-8 encoded sequence
+ * from p0..e-1 according to Turkic rules is the same as for non-Turkic.
+ * Otherwise, it returns the first code point of the Turkic upper or
+ * title-cased sequence, and the entire sequence will be stored in *ustrp.
+ * ustrp will contain *lenp bytes
+ *
+ * Turkic differs only from non-Turkic in that 'i' and LATIN CAPITAL LETTER
+ * I WITH DOT ABOVE form a case pair, as do 'I' and and LATIN SMALL LETTER
+ * DOTLESS I */
+
+ PERL_ARGS_ASSERT_TURKIC_UC;
+ assert(e > p);
+
+ if (*p == 'i') {
+ *lenp = 2;
+ ustrp[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ ustrp[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE;
+ }
+
+ if (memBEGINs(p, e - p, LATIN_SMALL_LETTER_DOTLESS_I_UTF8)) {
+ *lenp = 1;
+ *ustrp = 'I';
+ return 'I';
+ }
+
+ return 0;
+}
+
/* The process for changing the case is essentially the same for the four case
* change types, except there are complications for folding. Otherwise the
* difference is only which case to change to. To make sure that they all do
* the input code point calculated from the UTF-8. The fold code needs to
* realize all this and take it from there.
*
+ * To deal with Turkic locales, the function specified by the parameter
+ * 'turkic' is called when appropriate.
+ *
* If you read the two macros as sequential, it's easier to understand what's
* going on. */
#define CASE_CHANGE_BODY_START(locale_flags, LC_L1_change_macro, L1_func, \
- L1_func_extra_param) \
+ L1_func_extra_param, turkic) \
\
if (flags & (locale_flags)) { \
_CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
- /* Treat a UTF-8 locale as not being in locale at all */ \
if (IN_UTF8_CTYPE_LOCALE) { \
+ if (UNLIKELY(PL_in_utf8_turkic_locale)) { \
+ UV ret = turkic(p, e, ustrp, lenp); \
+ if (ret) return ret; \
+ } \
+ \
+ /* Otherwise, treat a UTF-8 locale as not being in locale at \
+ * all */ \
flags &= ~(locale_flags); \
} \
} \
} \
} \
else if UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(p, e) { \
+ U8 c = EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)); \
if (flags & (locale_flags)) { \
- result = LC_L1_change_macro(EIGHT_BIT_UTF8_TO_NATIVE(*p, \
- *(p+1))); \
+ result = LC_L1_change_macro(c); \
} \
else { \
- return L1_func(EIGHT_BIT_UTF8_TO_NATIVE(*p, *(p+1)), \
- ustrp, lenp, L1_func_extra_param); \
+ return L1_func(c, ustrp, lenp, L1_func_extra_param); \
} \
} \
else { /* malformed UTF-8 or ord above 255 */ \
/* ~0 makes anything non-zero in 'flags' mean we are using locale rules */
/* 2nd char of uc(U+DF) is 'S' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S');
+ CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 'S',
+ turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_UPPER_CASE);
}
PERL_ARGS_ASSERT__TO_UTF8_TITLE_FLAGS;
/* 2nd char of ucfirst(U+DF) is 's' */
- CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's');
+ CASE_CHANGE_BODY_START(~0, toUPPER_LC, _to_upper_title_latin1, 's',
+ turkic_uc);
CASE_CHANGE_BODY_END (~0, CALL_TITLE_CASE);
}
PERL_ARGS_ASSERT__TO_UTF8_LOWER_FLAGS;
- CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */)
+ CASE_CHANGE_BODY_START(~0, toLOWER_LC, to_lower_latin1, 0 /* 0 is dummy */,
+ turkic_lc);
CASE_CHANGE_BODY_END (~0, CALL_LOWER_CASE)
}
assert(p != ustrp); /* Otherwise overwrites */
CASE_CHANGE_BODY_START(FOLD_FLAGS_LOCALE, toFOLD_LC, _to_fold_latin1,
- ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)));
+ ((flags) & (FOLD_FLAGS_FULL | FOLD_FLAGS_NOMIX_ASCII)),
+ turkic_fc);
result = CALL_FOLD_CASE(result, p, ustrp, lenp, flags & FOLD_FLAGS_FULL);
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
- if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
+ if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
}
else
#endif
- if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
+ if (memBEGINs((char *) p, e - p, LONG_S_T))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
* 255/256 boundary which is forbidden under /l, and so the code
* wouldn't catch that they are equivalent (which they are only in
* this release) */
- else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
+ else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
* works. */
*lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
- Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
ustrp, *lenp, U8);
return LATIN_SMALL_LETTER_LONG_S;
Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
I32 minbits, I32 none)
{
- PERL_ARGS_ASSERT_SWASH_INIT;
-
/* Returns a copy of a swash initiated by the called function. This is the
* public interface, and returning a copy prevents others from doing
- * mischief on the original */
-
- return newSVsv(_core_swash_init(pkg, name, listsv, minbits, none,
- NULL, NULL));
-}
-
-SV*
-Perl__core_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv,
- I32 minbits, I32 none, SV* invlist,
- U8* const flags_p)
-{
+ * mischief on the original. The only remaining use of this is in tr/// */
/*NOTE NOTE NOTE - If you want to use "return" in this routine you MUST
* use the following define */
-#define CORE_SWASH_INIT_RETURN(x) \
+#define SWASH_INIT_RETURN(x) \
PL_curpm= old_PL_curpm; \
- return x
+ return newSVsv(x)
/* Initialize and return a swash, creating it if necessary. It does this
- * by calling utf8_heavy.pl in the general case. The returned value may be
- * the swash's inversion list instead if the input parameters allow it.
- * Which is returned should be immaterial to callers, as the only
- * operations permitted on a swash, swash_fetch(), _get_swash_invlist(),
- * and swash_to_invlist() handle both these transparently.
- *
- * This interface should only be used by functions that won't destroy or
- * adversely change the swash, as doing so affects all other uses of the
- * swash in the program; the general public should use 'Perl_swash_init'
- * instead.
+ * by calling utf8_heavy.pl in the general case.
*
* pkg is the name of the package that <name> should be in.
- * name is the name of the swash to find. Typically it is a Unicode
- * property name, including user-defined ones
+ * name is the name of the swash to find.
* listsv is a string to initialize the swash with. It must be of the form
* documented as the subroutine return value in
* L<perlunicode/User-Defined Character Properties>
* minbits is the number of bits required to represent each data element.
- * It is '1' for binary properties.
* none I (khw) do not understand this one, but it is used only in tr///.
- * invlist is an inversion list to initialize the swash with (or NULL)
- * flags_p if non-NULL is the address of various input and output flag bits
- * to the routine, as follows: ('I' means is input to the routine;
- * 'O' means output from the routine. Only flags marked O are
- * meaningful on return.)
- * _CORE_SWASH_INIT_USER_DEFINED_PROPERTY indicates if the swash
- * came from a user-defined property. (I O)
- * _CORE_SWASH_INIT_RETURN_IF_UNDEF indicates that instead of croaking
- * when the swash cannot be located, to simply return NULL. (I)
- * _CORE_SWASH_INIT_ACCEPT_INVLIST indicates that the caller will accept a
- * return of an inversion list instead of a swash hash if this routine
- * thinks that would result in faster execution of swash_fetch() later
- * on. (I)
*
- * Thus there are three possible inputs to find the swash: <name>,
- * <listsv>, and <invlist>. At least one must be specified. The result
+ * Thus there are two possible inputs to find the swash: <name> and
+ * <listsv>. At least one must be specified. The result
* will be the union of the specified ones, although <listsv>'s various
* actions can intersect, etc. what <name> gives. To avoid going out to
* disk at all, <invlist> should specify completely what the swash should
* have, and <listsv> should be &PL_sv_undef and <name> should be "".
- *
- * <invlist> is only valid for binary properties */
+ */
PMOP *old_PL_curpm= PL_curpm; /* save away the old PL_curpm */
SV* retval = &PL_sv_undef;
- HV* swash_hv = NULL;
- const bool use_invlist= (flags_p && *flags_p & _CORE_SWASH_INIT_ACCEPT_INVLIST);
- assert(listsv != &PL_sv_undef || strNE(name, "") || invlist);
- assert(! invlist || minbits == 1);
+ PERL_ARGS_ASSERT_SWASH_INIT;
+
+ assert(listsv != &PL_sv_undef || strNE(name, ""));
PL_curpm= NULL; /* reset PL_curpm so that we dont get confused between the
regex that triggered the swash init and the swash init
SV* errsv_save;
GV *method;
- PERL_ARGS_ASSERT__CORE_SWASH_INIT;
PUSHSTACKi(PERLSI_MAGIC);
ENTER;
if (IN_PERL_COMPILETIME) {
CopHINTS_set(PL_curcop, PL_hints);
}
- if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
- if (SvPOK(retval)) {
-
- /* If caller wants to handle missing properties, let them */
- if (flags_p && *flags_p & _CORE_SWASH_INIT_RETURN_IF_UNDEF) {
- CORE_SWASH_INIT_RETURN(NULL);
- }
- Perl_croak(aTHX_
- "Can't find Unicode property definition \"%" SVf "\"",
- SVfARG(retval));
- NOT_REACHED; /* NOTREACHED */
- }
- }
} /* End of calling the module to find the swash */
- /* If this operation fetched a swash, and we will need it later, get it */
- if (retval != &PL_sv_undef
- && (minbits == 1 || (flags_p
- && ! (*flags_p
- & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY))))
- {
- swash_hv = MUTABLE_HV(SvRV(retval));
-
- /* If we don't already know that there is a user-defined component to
- * this swash, and the user has indicated they wish to know if there is
- * one (by passing <flags_p>), find out */
- if (flags_p && ! (*flags_p & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY)) {
- SV** user_defined = hv_fetchs(swash_hv, "USER_DEFINED", FALSE);
- if (user_defined && SvUV(*user_defined)) {
- *flags_p |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
- }
- }
- }
-
- /* Make sure there is an inversion list for binary properties */
- if (minbits == 1) {
- SV** swash_invlistsvp = NULL;
- SV* swash_invlist = NULL;
- bool invlist_in_swash_is_valid = FALSE;
- bool swash_invlist_unclaimed = FALSE; /* whether swash_invlist has
- an unclaimed reference count */
-
- /* If this operation fetched a swash, get its already existing
- * inversion list, or create one for it */
-
- if (swash_hv) {
- swash_invlistsvp = hv_fetchs(swash_hv, "V", FALSE);
- if (swash_invlistsvp) {
- swash_invlist = *swash_invlistsvp;
- invlist_in_swash_is_valid = TRUE;
- }
- else {
- swash_invlist = _swash_to_invlist(retval);
- swash_invlist_unclaimed = TRUE;
- }
- }
-
- /* If an inversion list was passed in, have to include it */
- if (invlist) {
-
- /* Any fetched swash will by now have an inversion list in it;
- * otherwise <swash_invlist> will be NULL, indicating that we
- * didn't fetch a swash */
- if (swash_invlist) {
-
- /* Add the passed-in inversion list, which invalidates the one
- * already stored in the swash */
- invlist_in_swash_is_valid = FALSE;
- SvREADONLY_off(swash_invlist); /* Turned on again below */
- _invlist_union(invlist, swash_invlist, &swash_invlist);
- }
- else {
-
- /* Here, there is no swash already. Set up a minimal one, if
- * we are going to return a swash */
- if (! use_invlist) {
- swash_hv = newHV();
- retval = newRV_noinc(MUTABLE_SV(swash_hv));
- }
- swash_invlist = invlist;
- }
- }
-
- /* Here, we have computed the union of all the passed-in data. It may
- * be that there was an inversion list in the swash which didn't get
- * touched; otherwise save the computed one */
- if (! invlist_in_swash_is_valid && ! use_invlist) {
- if (! hv_stores(MUTABLE_HV(SvRV(retval)), "V", swash_invlist))
- {
- Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
- }
- /* We just stole a reference count. */
- if (swash_invlist_unclaimed) swash_invlist_unclaimed = FALSE;
- else SvREFCNT_inc_simple_void_NN(swash_invlist);
- }
-
- /* The result is immutable. Forbid attempts to change it. */
- SvREADONLY_on(swash_invlist);
-
- if (use_invlist) {
- SvREFCNT_dec(retval);
- if (!swash_invlist_unclaimed)
- SvREFCNT_inc_simple_void_NN(swash_invlist);
- retval = newRV_noinc(swash_invlist);
- }
- }
-
- CORE_SWASH_INIT_RETURN(retval);
-#undef CORE_SWASH_INIT_RETURN
+ SWASH_INIT_RETURN(retval);
+#undef SWASH_INIT_RETURN
}
S_swatch_get(pTHX_ SV* swash, UV start, UV span)
{
SV *swatch;
- U8 *l, *lend, *x, *xend, *s, *send;
+ U8 *l, *lend, *x, *xend, *s;
STRLEN lcur, xcur, scur;
HV *const hv = MUTABLE_HV(SvRV(swash));
- SV** const invlistsvp = hv_fetchs(hv, "V", FALSE);
SV** listsvp = NULL; /* The string containing the main body of the table */
SV** extssvp = NULL;
- SV** invert_it_svp = NULL;
U8* typestr = NULL;
- STRLEN bits;
+ STRLEN bits = 0;
STRLEN octets; /* if bits == 1, then octets == 0 */
UV none;
UV end = start + span;
- if (invlistsvp == NULL) {
SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
listsvp = hv_fetchs(hv, "LIST", FALSE);
- invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
bits = SvUV(*bitssvp);
none = SvUV(*nonesvp);
typestr = (U8*)SvPV_nolen(*typesvp);
- }
- else {
- bits = 1;
- none = 0;
- }
octets = bits >> 3; /* if bits == 1, then octets == 0 */
PERL_ARGS_ASSERT_SWATCH_GET;
- if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
+ if (bits != 8 && bits != 16 && bits != 32) {
Perl_croak(aTHX_ "panic: swatch_get doesn't expect bits %" UVuf,
(UV)bits);
}
SvCUR_set(swatch, scur);
s = (U8*)SvPVX(swatch);
- if (invlistsvp) { /* If has an inversion list set up use that */
- _invlist_populate_swatch(*invlistsvp, start, end, s);
- return swatch;
- }
-
/* read $swash->{LIST} */
l = (U8*)SvPV(*listsvp, lcur);
lend = l + lcur;
while (l < lend) {
- UV min, max, val, upper;
+ UV min = 0, max = 0, val = 0, upper;
l = swash_scan_list_line(l, lend, &min, &max, &val,
cBOOL(octets), typestr);
if (l > lend) {
++val;
}
}
- else { /* bits == 1, then val should be ignored */
- UV key;
- if (min < start)
- min = start;
-
- for (key = min; key <= upper; key++) {
- const STRLEN offset = (STRLEN)(key - start);
- s[offset >> 3] |= 1 << (offset & 7);
- }
- }
} /* while */
- /* Invert if the data says it should be. Assumes that bits == 1 */
- if (invert_it_svp && SvUV(*invert_it_svp)) {
-
- /* Unicode properties should come with all bits above PERL_UNICODE_MAX
- * be 0, and their inversion should also be 0, as we don't succeed any
- * Unicode property matches for non-Unicode code points */
- if (start <= PERL_UNICODE_MAX) {
-
- /* The code below assumes that we never cross the
- * Unicode/above-Unicode boundary in a range, as otherwise we would
- * have to figure out where to stop flipping the bits. Since this
- * boundary is divisible by a large power of 2, and swatches comes
- * in small powers of 2, this should be a valid assumption */
- assert(start + span - 1 <= PERL_UNICODE_MAX);
-
- send = s + scur;
- while (s < send) {
- *s = ~(*s);
- s++;
- }
- }
- }
-
- /* read $swash->{EXTRAS}
- * This code also copied to swash_to_invlist() below */
+ /* read $swash->{EXTRAS} */
x = (U8*)SvPV(*extssvp, xcur);
xend = x + xcur;
while (x < xend) {
Perl_croak(aTHX_ "panic: swatch_get got improper swatch");
s = (U8*)SvPV(swatch, slen);
- if (bits == 1 && otherbits == 1) {
- if (slen != olen)
- Perl_croak(aTHX_ "panic: swatch_get found swatch length "
- "mismatch, slen=%" UVuf ", olen=%" UVuf,
- (UV)slen, (UV)olen);
-
- switch (opc) {
- case '+':
- while (slen--)
- *s++ |= *o++;
- break;
- case '!':
- while (slen--)
- *s++ |= ~*o++;
- break;
- case '-':
- while (slen--)
- *s++ &= ~*o++;
- break;
- case '&':
- while (slen--)
- *s++ &= *o++;
- break;
- default:
- break;
- }
- }
- else {
+ {
STRLEN otheroctets = otherbits >> 3;
STRLEN offset = 0;
U8* const send = s + slen;
*s++ = (U8)((otherval >> 8) & 0xff);
*s++ = (U8)( otherval & 0xff);
}
- }
+ }
}
sv_free(other); /* through with it! */
} /* while */
return swatch;
}
-HV*
-Perl__swash_inversion_hash(pTHX)
-{
-
- /* Subject to change or removal. For use only in regcomp.c and regexec.c
- * Can't be used on a property that is subject to user override, as it
- * relies on the value of SPECIALS in the swash which would be set by
- * utf8_heavy.pl to the hash in the non-overriden file, and hence is not set
- * for overridden properties
- *
- * Returns a hash which is the inversion and closure of a swash mapping.
- * For example, consider the input lines:
- * 004B 006B
- * 004C 006C
- * 212A 006B
- *
- * The returned hash would have two keys, the UTF-8 for 006B and the UTF-8 for
- * 006C. The value for each key is an array. For 006C, the array would
- * have two elements, the UTF-8 for itself, and for 004C. For 006B, there
- * would be three elements in its array, the UTF-8 for 006B, 004B and 212A.
- *
- * Note that there are no elements in the hash for 004B, 004C, 212A. The
- * keys are only code points that are folded-to, so it isn't a full closure.
- *
- * Essentially, for any code point, it gives all the code points that map to
- * it, or the list of 'froms' for that point.
- *
- * Currently it ignores any additions or deletions from other swashes,
- * looking at just the main body of the swash, and if there are SPECIALS
- * in the swash, at that hash
- *
- * The specials hash can be extra code points, and most likely consists of
- * maps from single code points to multiple ones (each expressed as a string
- * of UTF-8 characters). This function currently returns only 1-1 mappings.
- * However consider this possible input in the specials hash:
- * "\xEF\xAC\x85" => "\x{0073}\x{0074}", # U+FB05 => 0073 0074
- * "\xEF\xAC\x86" => "\x{0073}\x{0074}", # U+FB06 => 0073 0074
- *
- * Both FB05 and FB06 map to the same multi-char sequence, which we don't
- * currently handle. But it also means that FB05 and FB06 are equivalent in
- * a 1-1 mapping which we should handle, and this relationship may not be in
- * the main table. Therefore this function examines all the multi-char
- * sequences and adds the 1-1 mappings that come out of that.
- *
- * XXX This function was originally intended to be multipurpose, but its
- * only use is quite likely to remain for constructing the inversion of
- * the CaseFolding (//i) property. If it were more general purpose for
- * regex patterns, it would have to do the FB05/FB06 game for simple folds,
- * because certain folds are prohibited under /iaa and /il. As an example,
- * in Unicode 3.0.1 both U+0130 and U+0131 fold to 'i', and hence are both
- * equivalent under /i. But under /iaa and /il, the folds to 'i' are
- * prohibited, so we would not figure out that they fold to each other.
- * Code could be written to automatically figure this out, similar to the
- * code that does this for multi-character folds, but this is the only case
- * where something like this is ever likely to happen, as all the single
- * char folds to the 0-255 range are now quite settled. Instead there is a
- * little special code that is compiled only for this Unicode version. This
- * is smaller and didn't require much coding time to do. But this makes
- * this routine strongly tied to being used just for CaseFolding. If ever
- * it should be generalized, this would have to be fixed */
-
- U8 *l, *lend;
- STRLEN lcur;
- SV * swash = _core_swash_init("utf8", "ToCf", &PL_sv_undef, 4, 0, NULL, NULL);
- HV *const hv = MUTABLE_HV(SvRV(swash));
-
- /* The string containing the main body of the table. This will have its
- * assertion fail if the swash has been converted to its inversion list */
- SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
-
- SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
- SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
- SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
- /*SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);*/
- const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
- const STRLEN bits = SvUV(*bitssvp);
- const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
- const UV none = SvUV(*nonesvp);
- SV **specials_p = hv_fetchs(hv, "SPECIALS", 0);
-
- HV* ret = newHV();
-
- /* Must have at least 8 bits to get the mappings */
- if (bits != 8 && bits != 16 && bits != 32) {
- Perl_croak(aTHX_ "panic: swash_inversion_hash doesn't expect bits %"
- UVuf, (UV)bits);
- }
-
- if (specials_p) { /* It might be "special" (sometimes, but not always, a
- mapping to more than one character */
-
- /* Construct an inverse mapping hash for the specials */
- HV * const specials_hv = MUTABLE_HV(SvRV(*specials_p));
- HV * specials_inverse = newHV();
- char *char_from; /* the lhs of the map */
- I32 from_len; /* its byte length */
- char *char_to; /* the rhs of the map */
- I32 to_len; /* its byte length */
- SV *sv_to; /* and in a sv */
- AV* from_list; /* list of things that map to each 'to' */
-
- hv_iterinit(specials_hv);
-
- /* The keys are the characters (in UTF-8) that map to the corresponding
- * UTF-8 string value. Iterate through the list creating the inverse
- * list. */
- while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
- SV** listp;
- if (! SvPOK(sv_to)) {
- Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
- "unexpectedly is not a string, flags=%lu",
- (unsigned long)SvFLAGS(sv_to));
- }
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %" UVXf ", First char of to is %" UVXf "\n", valid_utf8_to_uvchr((U8*) char_from, 0), valid_utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
-
- /* Each key in the inverse list is a mapped-to value, and the key's
- * hash value is a list of the strings (each in UTF-8) that map to
- * it. Those strings are all one character long */
- if ((listp = hv_fetch(specials_inverse,
- SvPVX(sv_to),
- SvCUR(sv_to), 0)))
- {
- from_list = (AV*) *listp;
- }
- else { /* No entry yet for it: create one */
- from_list = newAV();
- if (! hv_store(specials_inverse,
- SvPVX(sv_to),
- SvCUR(sv_to),
- (SV*) from_list, 0))
- {
- Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
- }
- }
-
- /* Here have the list associated with this 'to' (perhaps newly
- * created and empty). Just add to it. Note that we ASSUME that
- * the input is guaranteed to not have duplications, so we don't
- * check for that. Duplications just slow down execution time. */
- av_push(from_list, newSVpvn_utf8(char_from, from_len, TRUE));
- }
-
- /* Here, 'specials_inverse' contains the inverse mapping. Go through
- * it looking for cases like the FB05/FB06 examples above. There would
- * be an entry in the hash like
- * 'st' => [ FB05, FB06 ]
- * In this example we will create two lists that get stored in the
- * returned hash, 'ret':
- * FB05 => [ FB05, FB06 ]
- * FB06 => [ FB05, FB06 ]
- *
- * Note that there is nothing to do if the array only has one element.
- * (In the normal 1-1 case handled below, we don't have to worry about
- * two lists, as everything gets tied to the single list that is
- * generated for the single character 'to'. But here, we are omitting
- * that list, ('st' in the example), so must have multiple lists.) */
- while ((from_list = (AV *) hv_iternextsv(specials_inverse,
- &char_to, &to_len)))
- {
- if (av_tindex_skip_len_mg(from_list) > 0) {
- SSize_t i;
-
- /* We iterate over all combinations of i,j to place each code
- * point on each list */
- for (i = 0; i <= av_tindex_skip_len_mg(from_list); i++) {
- SSize_t j;
- AV* i_list = newAV();
- SV** entryp = av_fetch(from_list, i, FALSE);
- if (entryp == NULL) {
- Perl_croak(aTHX_ "panic: av_fetch() unexpectedly"
- " failed");
- }
- if (hv_fetch(ret, SvPVX(*entryp), SvCUR(*entryp), FALSE)) {
- Perl_croak(aTHX_ "panic: unexpected entry for %s",
- SvPVX(*entryp));
- }
- if (! hv_store(ret, SvPVX(*entryp), SvCUR(*entryp),
- (SV*) i_list, FALSE))
- {
- Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
- }
-
- /* For DEBUG_U: UV u = valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0);*/
- for (j = 0; j <= av_tindex_skip_len_mg(from_list); j++) {
- entryp = av_fetch(from_list, j, FALSE);
- if (entryp == NULL) {
- Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
- }
-
- /* When i==j this adds itself to the list */
- av_push(i_list, newSVuv(utf8_to_uvchr_buf(
- (U8*) SvPVX(*entryp),
- (U8*) SvPVX(*entryp) + SvCUR(*entryp),
- 0)));
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, valid_utf8_to_uvchr((U8*) SvPVX(*entryp), 0), u));*/
- }
- }
- }
- }
- SvREFCNT_dec(specials_inverse); /* done with it */
- } /* End of specials */
-
- /* read $swash->{LIST} */
-
-#if UNICODE_MAJOR_VERSION == 3 \
- && UNICODE_DOT_VERSION == 0 \
- && UNICODE_DOT_DOT_VERSION == 1
-
- /* For this version only U+130 and U+131 are equivalent under qr//i. Add a
- * rule so that things work under /iaa and /il */
-
- SV * mod_listsv = sv_mortalcopy(*listsvp);
- sv_catpv(mod_listsv, "130\t130\t131\n");
- l = (U8*)SvPV(mod_listsv, lcur);
-
-#else
-
- l = (U8*)SvPV(*listsvp, lcur);
-
-#endif
-
- lend = l + lcur;
-
- /* Go through each input line */
- while (l < lend) {
- UV min, max, val;
- UV inverse;
- l = swash_scan_list_line(l, lend, &min, &max, &val,
- cBOOL(octets), typestr);
- if (l > lend) {
- break;
- }
-
- /* Each element in the range is to be inverted */
- for (inverse = min; inverse <= max; inverse++) {
- AV* list;
- SV** listp;
- IV i;
- bool found_key = FALSE;
- bool found_inverse = FALSE;
-
- /* The key is the inverse mapping */
- char key[UTF8_MAXBYTES+1];
- char* key_end = (char *) uvchr_to_utf8((U8*) key, val);
- STRLEN key_len = key_end - key;
-
- /* Get the list for the map */
- if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
- list = (AV*) *listp;
- }
- else { /* No entry yet for it: create one */
- list = newAV();
- if (! hv_store(ret, key, key_len, (SV*) list, FALSE)) {
- Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed");
- }
- }
-
- /* Look through list to see if this inverse mapping already is
- * listed, or if there is a mapping to itself already */
- for (i = 0; i <= av_tindex_skip_len_mg(list); i++) {
- SV** entryp = av_fetch(list, i, FALSE);
- SV* entry;
- UV uv;
- if (entryp == NULL) {
- Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
- }
- entry = *entryp;
- uv = SvUV(entry);
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "list for %" UVXf " contains %" UVXf "\n", val, uv));*/
- if (uv == val) {
- found_key = TRUE;
- }
- if (uv == inverse) {
- found_inverse = TRUE;
- }
-
- /* No need to continue searching if found everything we are
- * looking for */
- if (found_key && found_inverse) {
- break;
- }
- }
-
- /* Make sure there is a mapping to itself on the list */
- if (! found_key) {
- av_push(list, newSVuv(val));
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, val, val));*/
- }
-
-
- /* Simply add the value to the list */
- if (! found_inverse) {
- av_push(list, newSVuv(inverse));
- /*DEBUG_U(PerlIO_printf(Perl_debug_log, "%s: %d: Adding %" UVXf " to list for %" UVXf "\n", __FILE__, __LINE__, inverse, val));*/
- }
-
- /* swatch_get() increments the value of val for each element in the
- * range. That makes more compact tables possible. You can
- * express the capitalization, for example, of all consecutive
- * letters with a single line: 0061\t007A\t0041 This maps 0061 to
- * 0041, 0062 to 0042, etc. I (khw) have never understood 'none',
- * and it's not documented; it appears to be used only in
- * implementing tr//; I copied the semantics from swatch_get(), just
- * in case */
- if (!none || val < none) {
- ++val;
- }
- }
- }
-
- SvREFCNT_dec(swash);
-
- return ret;
-}
-
-SV*
-Perl__swash_to_invlist(pTHX_ SV* const swash)
-{
-
- /* Subject to change or removal. For use only in one place in regcomp.c.
- * Ownership is given to one reference count in the returned SV* */
-
- U8 *l, *lend;
- char *loc;
- STRLEN lcur;
- HV *const hv = MUTABLE_HV(SvRV(swash));
- UV elements = 0; /* Number of elements in the inversion list */
- U8 empty[] = "";
- SV** listsvp;
- SV** typesvp;
- SV** bitssvp;
- SV** extssvp;
- SV** invert_it_svp;
-
- U8* typestr;
- STRLEN bits;
- STRLEN octets; /* if bits == 1, then octets == 0 */
- U8 *x, *xend;
- STRLEN xcur;
-
- SV* invlist;
-
- PERL_ARGS_ASSERT__SWASH_TO_INVLIST;
-
- /* If not a hash, it must be the swash's inversion list instead */
- if (SvTYPE(hv) != SVt_PVHV) {
- return SvREFCNT_inc_simple_NN((SV*) hv);
- }
-
- /* The string containing the main body of the table */
- listsvp = hv_fetchs(hv, "LIST", FALSE);
- typesvp = hv_fetchs(hv, "TYPE", FALSE);
- bitssvp = hv_fetchs(hv, "BITS", FALSE);
- extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
- invert_it_svp = hv_fetchs(hv, "INVERT_IT", FALSE);
-
- typestr = (U8*)SvPV_nolen(*typesvp);
- bits = SvUV(*bitssvp);
- octets = bits >> 3; /* if bits == 1, then octets == 0 */
-
- /* read $swash->{LIST} */
- if (SvPOK(*listsvp)) {
- l = (U8*)SvPV(*listsvp, lcur);
- }
- else {
- /* LIST legitimately doesn't contain a string during compilation phases
- * of Perl itself, before the Unicode tables are generated. In this
- * case, just fake things up by creating an empty list */
- l = empty;
- lcur = 0;
- }
- loc = (char *) l;
- lend = l + lcur;
-
- if (*l == 'V') { /* Inversion list format */
- const char *after_atou = (char *) lend;
- UV element0;
- UV* other_elements_ptr;
-
- /* The first number is a count of the rest */
- l++;
- if (!grok_atoUV((const char *)l, &elements, &after_atou)) {
- Perl_croak(aTHX_ "panic: Expecting a valid count of elements"
- " at start of inversion list");
- }
- if (elements == 0) {
- invlist = _new_invlist(0);
- }
- else {
- l = (U8 *) after_atou;
-
- /* Get the 0th element, which is needed to setup the inversion list
- * */
- while (isSPACE(*l)) l++;
- if (!grok_atoUV((const char *)l, &element0, &after_atou)) {
- Perl_croak(aTHX_ "panic: Expecting a valid 0th element for"
- " inversion list");
- }
- l = (U8 *) after_atou;
- invlist = _setup_canned_invlist(elements, element0,
- &other_elements_ptr);
- elements--;
-
- /* Then just populate the rest of the input */
- while (elements-- > 0) {
- if (l > lend) {
- Perl_croak(aTHX_ "panic: Expecting %" UVuf " more"
- " elements than available", elements);
- }
- while (isSPACE(*l)) l++;
- if (!grok_atoUV((const char *)l, other_elements_ptr++,
- &after_atou))
- {
- Perl_croak(aTHX_ "panic: Expecting a valid element"
- " in inversion list");
- }
- l = (U8 *) after_atou;
- }
- }
- }
- else {
-
- /* Scan the input to count the number of lines to preallocate array
- * size based on worst possible case, which is each line in the input
- * creates 2 elements in the inversion list: 1) the beginning of a
- * range in the list; 2) the beginning of a range not in the list. */
- while ((loc = (char *) memchr(loc, '\n', lend - (U8 *) loc)) != NULL) {
- elements += 2;
- loc++;
- }
-
- /* If the ending is somehow corrupt and isn't a new line, add another
- * element for the final range that isn't in the inversion list */
- if (! (*lend == '\n'
- || (*lend == '\0' && (lcur == 0 || *(lend - 1) == '\n'))))
- {
- elements++;
- }
-
- invlist = _new_invlist(elements);
-
- /* Now go through the input again, adding each range to the list */
- while (l < lend) {
- UV start, end;
- UV val; /* Not used by this function */
-
- l = swash_scan_list_line(l, lend, &start, &end, &val,
- cBOOL(octets), typestr);
-
- if (l > lend) {
- break;
- }
-
- invlist = _add_range_to_invlist(invlist, start, end);
- }
- }
-
- /* Invert if the data says it should be */
- if (invert_it_svp && SvUV(*invert_it_svp)) {
- _invlist_invert(invlist);
- }
-
- /* This code is copied from swatch_get()
- * read $swash->{EXTRAS} */
- x = (U8*)SvPV(*extssvp, xcur);
- xend = x + xcur;
- while (x < xend) {
- STRLEN namelen;
- U8 *namestr;
- SV** othersvp;
- HV* otherhv;
- STRLEN otherbits;
- SV **otherbitssvp, *other;
- U8 *nl;
-
- const U8 opc = *x++;
- if (opc == '\n')
- continue;
-
- nl = (U8*)memchr(x, '\n', xend - x);
-
- if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
- if (nl) {
- x = nl + 1; /* 1 is length of "\n" */
- continue;
- }
- else {
- x = xend; /* to EXTRAS' end at which \n is not found */
- break;
- }
- }
-
- namestr = x;
- if (nl) {
- namelen = nl - namestr;
- x = nl + 1;
- }
- else {
- namelen = xend - namestr;
- x = xend;
- }
-
- othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
- otherhv = MUTABLE_HV(SvRV(*othersvp));
- otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
- otherbits = (STRLEN)SvUV(*otherbitssvp);
-
- if (bits != otherbits || bits != 1) {
- Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
- "properties, bits=%" UVuf ", otherbits=%" UVuf,
- (UV)bits, (UV)otherbits);
- }
-
- /* The "other" swatch must be destroyed after. */
- other = _swash_to_invlist((SV *)*othersvp);
-
- /* End of code copied from swatch_get() */
- switch (opc) {
- case '+':
- _invlist_union(invlist, other, &invlist);
- break;
- case '!':
- _invlist_union_maybe_complement_2nd(invlist, other, TRUE, &invlist);
- break;
- case '-':
- _invlist_subtract(invlist, other, &invlist);
- break;
- case '&':
- _invlist_intersection(invlist, other, &invlist);
- break;
- default:
- break;
- }
- sv_free(other); /* through with it! */
- }
-
- SvREADONLY_on(invlist);
- return invlist;
-}
-
-SV*
-Perl__get_swash_invlist(pTHX_ SV* const swash)
-{
- SV** ptr;
-
- PERL_ARGS_ASSERT__GET_SWASH_INVLIST;
-
- if (! SvROK(swash)) {
- return NULL;
- }
-
- /* If it really isn't a hash, it isn't really swash; must be an inversion
- * list */
- if (SvTYPE(SvRV(swash)) != SVt_PVHV) {
- return SvRV(swash);
- }
-
- ptr = hv_fetchs(MUTABLE_HV(SvRV(swash)), "V", FALSE);
- if (! ptr) {
- return NULL;
- }
-
- return *ptr;
-}
-
bool
Perl_check_utf8_print(pTHX_ const U8* s, const STRLEN len)
{
* that effect. However, if the caller knows what
* it's doing, it can pass this flag to indicate that,
* and the assertion is skipped.
- * FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ * to s2, and s2 doesn't have to be UTF-8 encoded.
+ * This introduces an asymmetry to save a few branches
+ * in a loop. Currently, this is not a problem, as
+ * never are both inputs pre-folded. Simply call this
+ * function with the pre-folded one as the second
+ * string.
* FOLDEQ_S2_FOLDS_SANE
*/
I32
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
- assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S1_FOLDS_SANE))
- || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+ assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+ && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
if (flags & FOLDEQ_LOCALE) {
if (IN_UTF8_CTYPE_LOCALE) {
- flags &= ~FOLDEQ_LOCALE;
+ if (UNLIKELY(PL_in_utf8_turkic_locale)) {
+ flags_for_folder |= FOLD_FLAGS_LOCALE;
+ }
+ else {
+ flags &= ~FOLDEQ_LOCALE;
+ }
}
else {
flags_for_folder |= FOLD_FLAGS_LOCALE;
}
}
+ if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+ flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+ }
if (pe1) {
e1 = *(U8**)pe1;
if (n2 == 0) { /* Same for s2 */
if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
- f2 = (U8 *) p2;
- assert(u2);
- n2 = UTF8SKIP(f2);
+
+ /* Point to the already-folded character. But for non-UTF-8
+ * variants, convert to UTF-8 for the algorithm below */
+ if (UTF8_IS_INVARIANT(*p2)) {
+ f2 = (U8 *) p2;
+ n2 = 1;
+ }
+ else if (u2) {
+ f2 = (U8 *) p2;
+ n2 = UTF8SKIP(f2);
+ }
+ else {
+ foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+ foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+ f2 = foldbuf2;
+ n2 = 2;
+ }
}
else {
if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
}
/*
+=for apidoc utf8_to_uvchr
+
+Returns the native code point of the first character in the string C<s>
+which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
+length, in bytes, of that character.
+
+Some, but not all, UTF-8 malformations are detected, and in fact, some
+malformed input could cause reading beyond the end of the input buffer, which
+is why this function is deprecated. Use L</utf8_to_uvchr_buf> instead.
+
+If C<s> points to one of the detected malformations, and UTF8 warnings are
+enabled, zero is returned and C<*retlen> is set (if C<retlen> isn't
+C<NULL>) to -1. If those warnings are off, the computed value if well-defined (or
+the Unicode REPLACEMENT CHARACTER, if not) is silently returned, and C<*retlen>
+is set (if C<retlen> isn't NULL) so that (S<C<s> + C<*retlen>>) is the
+next possible position in C<s> that could begin a non-malformed character.
+See L</utf8n_to_uvchr> for details on when the REPLACEMENT CHARACTER is returned.
+
+=cut
+*/
+
+UV
+Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
+{
+ PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
+
+ /* This function is unsafe if malformed UTF-8 input is given it, which is
+ * why the function is deprecated. If the first byte of the input
+ * indicates that there are more bytes remaining in the sequence that forms
+ * the character than there are in the input buffer, it can read past the
+ * end. But we can make it safe if the input string happens to be
+ * NUL-terminated, as many strings in Perl are, by refusing to read past a
+ * NUL. A NUL indicates the start of the next character anyway. If the
+ * input isn't NUL-terminated, the function remains unsafe, as it always
+ * has been.
+ *
+ * An initial NUL has to be handled separately, but all ASCIIs can be
+ * handled the same way, speeding up this common case */
+
+ if (UTF8_IS_INVARIANT(*s)) { /* Assumes 's' contains at least 1 byte */
+ return (UV) *s;
+ }
+
+ return utf8_to_uvchr_buf(s,
+ s + my_strnlen((char *) s, UTF8SKIP(s)),
+ retlen);
+}
+
+/*
* ex: set ts=8 sts=4 sw=4 et:
*/