"Use of code point 0x%" UVXf " is not allowed; the"
" permissible max is 0x%" UVXf;
-#define MAX_EXTERNALLY_LEGAL_CP ((UV) (IV_MAX))
-
/*
=head1 Unicode Support
These are various utility functions for manipulating UTF8-encoded
* performance hit on these high EBCDIC code points. */
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
- if (UNLIKELY(uv > MAX_EXTERNALLY_LEGAL_CP)) {
- Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_EXTERNALLY_LEGAL_CP);
+ if (UNLIKELY(uv > MAX_LEGAL_CP)) {
+ Perl_croak(aTHX_ cp_above_legal_max, uv, MAX_LEGAL_CP);
}
if ( (flags & UNICODE_WARN_SUPER)
|| ( (flags & UNICODE_WARN_PERL_EXTENDED)
=item C<UTF8_GOT_NON_CONTINUATION>
The input sequence was malformed in that a non-continuation type byte was found
-in a position where only a continuation type one should be.
+in a position where only a continuation type one should be. See also
+L</C<UTF8_GOT_SHORT>>.
=item C<UTF8_GOT_OVERFLOW>
a complete sequence. In other words, the input is for a partial character
sequence.
+
+C<UTF8_GOT_SHORT> and C<UTF8_GOT_NON_CONTINUATION> both indicate a too short
+sequence. The difference is that C<UTF8_GOT_NON_CONTINUATION> indicates always
+that there is an error, while C<UTF8_GOT_SHORT> means that an incomplete
+sequence was looked at. If no other flags are present, it means that the
+sequence was valid as far as it went. Depending on the application, this could
+mean one of three things:
+
+=over
+
+=item *
+
+The C<curlen> length parameter passed in was too small, and the function was
+prevented from examining all the necessary bytes.
+
+=item *
+
+The buffer being looked at is based on reading data, and the data received so
+far stopped in the middle of a character, so that the next read will
+read the remainder of this character. (It is up to the caller to deal with the
+split bytes somehow.)
+
+=item *
+
+This is a real error, and the partial sequence is all we're going to get.
+
+=back
+
=item C<UTF8_GOT_SUPER>
The input sequence was malformed in that it is for a non-Unicode code point;
PERL_ARGS_ASSERT_BYTES_TO_UTF8;
PERL_UNUSED_CONTEXT;
- Newx(d, (*lenp) * 2 + 1, U8);
+ /* 1 for each byte + 1 for each byte that expands to two, + trailing NUL */
+ Newx(d, (*lenp) + variant_under_utf8_count(s, send) + 1, U8);
dst = d;
while (s < send) {
*d = '\0';
*lenp = d-dst;
- /* Trim unused space */
- Renew(dst, *lenp + 1, U8);
-
return dst;
}
if (*p == '_')
return TRUE;
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_idstart);
+ return is_utf8_common(p, PL_utf8_idstart);
}
bool
}
PERL_STATIC_INLINE bool
-S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
- const char *const swashname, SV* const invlist)
+S_is_utf8_common(pTHX_ const U8 *const p, SV* const invlist)
{
/* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p> is in the swash indicated by <swashname>. <swash>
- * contains a pointer to where the swash indicated by <swashname>
- * is to be stored; which this routine will do, so that future calls will
- * look at <*swash> and only generate a swash if it is not null. <invlist>
- * is NULL or an inversion list that defines the swash. If not null, it
- * saves time during initialization of the swash.
+ * starts at <p> is in the inversion list indicated by <invlist>.
*
* Note that it is assumed that the buffer length of <p> is enough to
* contain all the bytes that comprise the character. Thus, <*p> should
* have been checked before this call for mal-formedness enough to assure
- * that. */
+ * that. This function, does make sure to not look past any NUL, so it is
+ * safe to use on C, NUL-terminated, strings */
+ STRLEN len = my_strnlen((char *) p, UTF8SKIP(p));
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
* as far as there being enough bytes available in it to accommodate the
* character without reading beyond the end, and pass that number on to the
* validating routine */
- if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
- _force_out_malformed_utf8_message(p, p + UTF8SKIP(p),
- _UTF8_NO_CONFIDENCE_IN_CURLEN,
+ if (! isUTF8_CHAR(p, p + len)) {
+ _force_out_malformed_utf8_message(p, p + len, _UTF8_NO_CONFIDENCE_IN_CURLEN,
1 /* Die */ );
NOT_REACHED; /* NOTREACHED */
}
- if (invlist) {
- return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
- }
-
- assert(swash);
-
- if (!*swash) {
- U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- *swash = _core_swash_init("utf8",
-
- /* Only use the name if there is no inversion
- * list; otherwise will go out to disk */
- (invlist) ? "" : swashname,
-
- &PL_sv_undef, 1, 0, invlist, &flags);
- }
-
- return swash_fetch(*swash, p, TRUE) != 0;
+ return is_utf8_common_with_len(p, p + len, invlist);
}
PERL_STATIC_INLINE bool
S_is_utf8_common_with_len(pTHX_ const U8 *const p, const U8 * const e,
- SV **swash, const char *const swashname,
SV* const invlist)
{
/* returns a boolean giving whether or not the UTF8-encoded character that
- * starts at <p>, and extending no further than <e - 1> is in the swash
- * indicated by <swashname>. <swash> contains a pointer to where the swash
- * indicated by <swashname> is to be stored; which this routine will do, so
- * that future calls will look at <*swash> and only generate a swash if it
- * is not null. <invlist> is NULL or an inversion list that defines the
- * swash. If not null, it saves time during initialization of the swash.
- */
+ * starts at <p>, and extending no further than <e - 1> is in the inversion
+ * list <invlist>. */
+
+ UV cp = utf8n_to_uvchr(p, e - p, NULL, 0);
PERL_ARGS_ASSERT_IS_UTF8_COMMON_WITH_LEN;
- if (! isUTF8_CHAR(p, e)) {
+ if (cp == 0 && (p >= e || *p != '\0')) {
_force_out_malformed_utf8_message(p, e, 0, 1);
NOT_REACHED; /* NOTREACHED */
}
- if (invlist) {
- return _invlist_contains_cp(invlist, valid_utf8_to_uvchr(p, NULL));
- }
-
- assert(swash);
-
- if (!*swash) {
- U8 flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
- *swash = _core_swash_init("utf8",
-
- /* Only use the name if there is no inversion
- * list; otherwise will go out to disk */
- (invlist) ? "" : swashname,
-
- &PL_sv_undef, 1, 0, invlist, &flags);
- }
-
- return swash_fetch(*swash, p, TRUE) != 0;
+ assert(invlist);
+ return _invlist_contains_cp(invlist, cp);
}
STATIC void
if (instr(file, "mathoms.c")) {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s()"
+ "In %s, line %d, starting in Perl v5.32, %s()"
" will be removed. Avoid this message by"
" converting to use %s().\n",
file, line, name, alternative);
}
else {
Perl_warner(aTHX_ WARN_DEPRECATED,
- "In %s, line %d, starting in Perl v5.30, %s() will"
+ "In %s, line %d, starting in Perl v5.32, %s() will"
" require an additional parameter. Avoid this"
" message by converting to use %s().\n",
file, line, name, alternative);
case _CC_GRAPH:
case _CC_CASED:
- return is_utf8_common(p,
- NULL,
- "This is buggy if this gets used",
- PL_XPosix_ptrs[classnum]);
+ return is_utf8_common(p, PL_XPosix_ptrs[classnum]);
case _CC_SPACE:
return is_XPERLSPACE_high(p);
case _CC_VERTSPACE:
return is_VERTWS_high(p);
case _CC_IDFIRST:
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idstart);
+ return is_utf8_common(p, PL_utf8_perl_idstart);
case _CC_IDCONT:
- return is_utf8_common(p, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idcont);
+ return is_utf8_common(p, PL_utf8_perl_idcont);
}
}
{
PERL_ARGS_ASSERT__IS_UTF8_FOO_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_XPosix_ptrs[classnum]);
+ return is_utf8_common_with_len(p, e, PL_XPosix_ptrs[classnum]);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDSTART_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idstart);
+ return is_utf8_common_with_len(p, e, PL_utf8_perl_idstart);
}
bool
if (*p == '_')
return TRUE;
- return is_utf8_common(p, &PL_utf8_xidstart, "XIdStart", NULL);
+ return is_utf8_common(p, PL_utf8_xidstart);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_PERL_IDCONT_WITH_LEN;
- return is_utf8_common_with_len(p, e, NULL,
- "This is buggy if this gets used",
- PL_utf8_perl_idcont);
+ return is_utf8_common_with_len(p, e, PL_utf8_perl_idcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_IDCONT;
- return is_utf8_common(p, &PL_utf8_idcont, "IdContinue", NULL);
+ return is_utf8_common(p, PL_utf8_idcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_XIDCONT;
- return is_utf8_common(p, &PL_utf8_xidcont, "XIdContinue", NULL);
+ return is_utf8_common(p, PL_utf8_xidcont);
}
bool
{
PERL_ARGS_ASSERT__IS_UTF8_MARK;
- return is_utf8_common(p, NULL, "IsM", PL_utf8_mark);
+ return is_utf8_common(p, PL_utf8_mark);
}
STATIC UV
}
if (UNLIKELY(UNICODE_IS_SUPER(uv1))) {
- if (UNLIKELY(uv1 > MAX_EXTERNALLY_LEGAL_CP)) {
+ if (UNLIKELY(uv1 > MAX_LEGAL_CP)) {
Perl_croak(aTHX_ cp_above_legal_max, uv1,
- MAX_EXTERNALLY_LEGAL_CP);
+ MAX_LEGAL_CP);
}
if (ckWARN_d(WARN_NON_UNICODE)) {
const char* desc = (PL_op) ? OP_DESC(PL_op) : normal;
if (*e == NULL) {
utf8n_flags = _UTF8_NO_CONFIDENCE_IN_CURLEN;
- *e = p + UTF8SKIP(p);
+
+ /* strnlen() makes this function safe for the common case of
+ * NUL-terminated strings */
+ *e = p + my_strnlen((char *) p, UTF8SKIP(p));
/* For mathoms.c calls, we use the function name we know is stored
* there. It could be part of a larger path */
/* Special case these two characters, as what normally gets
* returned under locale doesn't work */
- if (memEQs((char *) p, UTF8SKIP(p), CAP_SHARP_S))
+ if (memBEGINs((char *) p, e - p, CAP_SHARP_S))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
}
else
#endif
- if (memEQs((char *) p, UTF8SKIP(p), LONG_S_T))
+ if (memBEGINs((char *) p, e - p, LONG_S_T))
{
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
* 255/256 boundary which is forbidden under /l, and so the code
* wouldn't catch that they are equivalent (which they are only in
* this release) */
- else if (memEQs((char *) p, UTF8SKIP(p), DOTTED_I)) {
+ else if (memBEGINs((char *) p, e - p, DOTTED_I)) {
/* diag_listed_as: Can't do %s("%s") on non-UTF-8 locale; resolved to "%s". */
Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
"Can't do fc(\"\\x{0130}\") on non-UTF-8 locale; "
* works. */
*lenp = 2 * sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 2;
- Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
+ Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 LATIN_SMALL_LETTER_LONG_S_UTF8,
ustrp, *lenp, U8);
return LATIN_SMALL_LETTER_LONG_S;
* that effect. However, if the caller knows what
* it's doing, it can pass this flag to indicate that,
* and the assertion is skipped.
- * FOLDEQ_S2_ALREADY_FOLDED Similarly.
+ * FOLDEQ_S2_ALREADY_FOLDED Similar to FOLDEQ_S1_ALREADY_FOLDED, but applies
+ * to s2, and s2 doesn't have to be UTF-8 encoded.
+ * This introduces an asymmetry to save a few branches
+ * in a loop. Currently, this is not a problem, as
+ * never are both inputs pre-folded. Simply call this
+ * function with the pre-folded one as the second
+ * string.
* FOLDEQ_S2_FOLDS_SANE
*/
I32
PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
- assert( ! ((flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
- && (((flags & FOLDEQ_S1_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S1_FOLDS_SANE))
- || ((flags & FOLDEQ_S2_ALREADY_FOLDED)
- && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
+ assert( ! ( (flags & (FOLDEQ_UTF8_NOMIX_ASCII | FOLDEQ_LOCALE))
+ && (( (flags & FOLDEQ_S1_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S1_FOLDS_SANE))
+ || ( (flags & FOLDEQ_S2_ALREADY_FOLDED)
+ && !(flags & FOLDEQ_S2_FOLDS_SANE)))));
/* The algorithm is to trial the folds without regard to the flags on
* the first line of the above assert(), and then see if the result
* violates them. This means that the inputs can't be pre-folded to a
flags_for_folder |= FOLD_FLAGS_LOCALE;
}
}
+ if (flags & FOLDEQ_UTF8_NOMIX_ASCII) {
+ flags_for_folder |= FOLD_FLAGS_NOMIX_ASCII;
+ }
if (pe1) {
e1 = *(U8**)pe1;
if (n2 == 0) { /* Same for s2 */
if (flags & FOLDEQ_S2_ALREADY_FOLDED) {
- f2 = (U8 *) p2;
- assert(u2);
- n2 = UTF8SKIP(f2);
+
+ /* Point to the already-folded character. But for non-UTF-8
+ * variants, convert to UTF-8 for the algorithm below */
+ if (UTF8_IS_INVARIANT(*p2)) {
+ f2 = (U8 *) p2;
+ n2 = 1;
+ }
+ else if (u2) {
+ f2 = (U8 *) p2;
+ n2 = UTF8SKIP(f2);
+ }
+ else {
+ foldbuf2[0] = UTF8_EIGHT_BIT_HI(*p2);
+ foldbuf2[1] = UTF8_EIGHT_BIT_LO(*p2);
+ f2 = foldbuf2;
+ n2 = 2;
+ }
}
else {
if (isASCII(*p2) && ! (flags & FOLDEQ_LOCALE)) {
}
return utf8_to_uvchr_buf(s,
- s + my_strnlen((char *) s, UTF8_MAXBYTES),
- retlen);
+ s + my_strnlen((char *) s, UTF8SKIP(s)),
+ retlen);
}
/*