/*
-Tests if some arbitrary number of bytes begins in a valid UTF-8
+Tests if the first C<len> bytes of string C<s> form a valid UTF-8
character. Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character. The actual number of bytes in the UTF-8 character
+UTF-8 character. The number of bytes in the UTF-8 character
will be returned if it is valid, otherwise 0.
This is the "slow" version as opposed to the "fast" version which is
PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
if (UTF8_IS_INVARIANT(u))
- return 1;
+ return len == 1;
if (!UTF8_IS_START(u))
return 0;
}
/*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>. C<buf_end> should point to one position beyond the end of the
+buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+ STRLEN len;
+
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+ if (buf_end <= buf) {
+ return 0;
+ }
+
+ len = buf_end - buf;
+ if (len > UTF8SKIP(buf)) {
+ len = UTF8SKIP(buf);
+ }
+
+#ifdef IS_UTF8_CHAR
+ if (IS_UTF8_CHAR_FAST(len))
+ return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+ return is_utf8_char_slow(buf, len);
+}
+
+/*
=for apidoc is_utf8_char
+DEPRECATED!
+
Tests if some arbitrary number of bytes begins in a valid UTF-8
character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
character is a valid UTF-8 character. The actual number of bytes in the UTF-8
character will be returned if it is valid, otherwise 0.
-WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or
-UTF8SKIP(s) bytes.
+This function is deprecated due to the possibility that malformed input could
+cause reading beyond the end of the input buffer. Use C<is_utf8_char_buf>
+instead.
=cut */
+
STRLEN
Perl_is_utf8_char(const U8 *s)
{
- const STRLEN len = UTF8SKIP(s);
-
PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
- if (IS_UTF8_CHAR_FAST(len))
- return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
- return is_utf8_char_slow(s, len);
+
+ /* Assumes we have enough space, which is why this is deprecated */
+ return is_utf8_char_buf(s, s + UTF8SKIP(s));
}
* LENP will be set to the length in bytes of the string of changed characters
*
* The functions return the ordinal of the first character in the string of OUTP */
-#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUpper")
-#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTitle")
-#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLower")
+#define CALL_UPPER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_toupper, "ToUc", "utf8::ToSpecUc")
+#define CALL_TITLE_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_totitle, "ToTc", "utf8::ToSpecTc")
+#define CALL_LOWER_CASE(INP, OUTP, LENP) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tolower, "ToLc", "utf8::ToSpecLc")
/* This additionally has the input parameter SPECIALS, which if non-zero will
* cause this to use the SPECIALS hash for folding (meaning get full case
* folding); otherwise, when zero, this implies a simple case fold */
-#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecFold" : NULL)
+#define CALL_FOLD_CASE(INP, OUTP, LENP, SPECIALS) Perl_to_utf8_case(aTHX_ INP, OUTP, LENP, &PL_utf8_tofold, "ToCf", (SPECIALS) ? "utf8::ToSpecCf" : NULL)
UV
Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
return CALL_FOLD_CASE(p, p, lenp, flags);
}
-/* for now these all assume no locale info available for Unicode > 255 */
+/* for now these all assume no locale info available for Unicode > 255; and
+ * the corresponding macros in handy.h (like isALNUM_LC_uvchr) should have been
+ * called instead, so that these don't get called for < 255 */
bool
Perl_is_uni_alnum_lc(pTHX_ UV c)
S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
const char *const swashname)
{
+ /* returns a boolean giving whether or not the UTF8-encoded character that
+ * starts at <p> is in the swash indicated by <swashname>. <swash>
+ * contains a pointer to where the swash indicated by <swashname>
+ * is to be stored; which this routine will do, so that future calls will
+ * look at <*swash> and only generate a swash if it is not null
+ *
+ * Note that it is assumed that the buffer length of <p> is enough to
+ * contain all the bytes that comprise the character. Thus, <*p> should
+ * have been checked before this call for mal-formedness enough to assure
+ * that. */
+
dVAR;
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
- if (!is_utf8_char(p))
+ /* The API should have included a length for the UTF-8 character in <p>,
+ * but it doesn't. We therefor assume that p has been validated at least
+ * as far as there being enough bytes available in it to accommodate the
+ * character without reading beyond the end, and pass that number on to the
+ * validating routine */
+ if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
if (!*swash)
*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
}
+bool
+Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+{
+ /* For exclusive use of pp_quotemeta() */
+
+ dVAR;
+
+ PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
+
+ return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
+}
+
/*
=for apidoc to_utf8_case
}
/* Note:
- * Returns a "swash" which is a hash described in utf8.c:S_swash_fetch().
+ * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
* C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
* For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
*/
SV** swash_invlistsvp = NULL;
SV* swash_invlist = NULL;
bool invlist_in_swash_is_valid = FALSE;
- HV* swash_hv;
+ HV* swash_hv = NULL;
/* If this operation fetched a swash, get its already existing
* inversion list or create one for it */
if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
|| (slen << 3) < needents)
- Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
+ Perl_croak(aTHX_ "panic: swash_fetch got improper swatch, "
+ "svp=%p, tmps=%p, slen=%"UVuf", needents=%"UVuf,
+ svp, tmps, (UV)slen, (UV)needents);
}
PL_last_swash_hv = hv;
off <<= 2;
return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
}
- Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
+ Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width, "
+ "slen=%"UVuf", needents=%"UVuf, (UV)slen, (UV)needents);
NORETURN_FUNCTION_END;
}
if (wants_value) {
if (isBLANK(*l)) {
++l;
- flags = PERL_SCAN_SILENT_ILLDIGIT
- | PERL_SCAN_DISALLOW_PREFIX
- | PERL_SCAN_SILENT_NON_PORTABLE;
- numlen = lend - l;
- *val = grok_hex((char *)l, &numlen, &flags, NULL);
- if (numlen)
- l += numlen;
- else
- *val = 0;
+
+ /* The ToLc, etc table mappings are not in hex, and must be
+ * corrected by adding the code point to them */
+ if (typeto) {
+ char *after_strtol = (char *) lend;
+ *val = Strtol((char *)l, &after_strtol, 10);
+ l = (U8 *) after_strtol;
+ }
+ else { /* Other tables are in hex, and are the correct result
+ without tweaking */
+ flags = PERL_SCAN_SILENT_ILLDIGIT
+ | PERL_SCAN_DISALLOW_PREFIX
+ | PERL_SCAN_SILENT_NON_PORTABLE;
+ numlen = lend - l;
+ *val = grok_hex((char *)l, &numlen, &flags, NULL);
+ if (numlen)
+ l += numlen;
+ else
+ *val = 0;
+ }
}
else {
*val = 0;
HV *const hv = MUTABLE_HV(SvRV(swash));
SV** const invlistsvp = hv_fetchs(hv, "INVLIST", FALSE);
- SV** listsvp; /* The string containing the main body of the table */
- SV** extssvp;
- SV** invert_it_svp;
- U8* typestr;
+ SV** listsvp = NULL; /* The string containing the main body of the table */
+ SV** extssvp = NULL;
+ SV** invert_it_svp = NULL;
+ U8* typestr = NULL;
STRLEN bits;
STRLEN octets; /* if bits == 1, then octets == 0 */
UV none;
otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
otherbits = (STRLEN)SvUV(*otherbitssvp);
if (bits < otherbits)
- Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch");
+ Perl_croak(aTHX_ "panic: swatch_get found swatch size mismatch, "
+ "bits=%"UVuf", otherbits=%"UVuf, (UV)bits, (UV)otherbits);
/* The "other" swatch must be destroyed after. */
other = swatch_get(*othersvp, start, span);
s = (U8*)SvPV(swatch, slen);
if (bits == 1 && otherbits == 1) {
if (slen != olen)
- Perl_croak(aTHX_ "panic: swatch_get found swatch length mismatch");
+ Perl_croak(aTHX_ "panic: swatch_get found swatch length "
+ "mismatch, slen=%"UVuf", olen=%"UVuf,
+ (UV)slen, (UV)olen);
switch (opc) {
case '+':
while ((sv_to = hv_iternextsv(specials_hv, &char_from, &from_len))) {
SV** listp;
if (! SvPOK(sv_to)) {
- Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() unexpectedly is not a string");
+ Perl_croak(aTHX_ "panic: value returned from hv_iternextsv() "
+ "unexpectedly is not a string, flags=%lu",
+ (unsigned long)SvFLAGS(sv_to));
}
/*DEBUG_U(PerlIO_printf(Perl_debug_log, "Found mapping from %"UVXf", First char of to is %"UVXf"\n", utf8_to_uvchr((U8*) char_from, 0), utf8_to_uvchr((U8*) SvPVX(sv_to), 0)));*/
otherbits = (STRLEN)SvUV(*otherbitssvp);
if (bits != otherbits || bits != 1) {
- Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean properties");
+ Perl_croak(aTHX_ "panic: _swash_to_invlist only operates on boolean "
+ "properties, bits=%"UVuf", otherbits=%"UVuf,
+ (UV)bits, (UV)otherbits);
}
/* The "other" swatch must be destroyed after. */
=for apidoc uvchr_to_utf8
Adds the UTF-8 representation of the Native code point C<uv> to the end
-of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
+of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
bytes available. The return value is the pointer to the byte after the
end of the new character. In other words,