/*
-Tests if some arbitrary number of bytes begins in a valid UTF-8
+Tests if the first C<len> bytes of string C<s> form a valid UTF-8
character. Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character. The actual number of bytes in the UTF-8 character
+UTF-8 character. The number of bytes in the UTF-8 character
will be returned if it is valid, otherwise 0.
This is the "slow" version as opposed to the "fast" version which is
PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
if (UTF8_IS_INVARIANT(u))
- return 1;
+ return len == 1;
if (!UTF8_IS_START(u))
return 0;
}
/*
+=for apidoc is_utf8_char_buf
+
+Returns the number of bytes that comprise the first UTF-8 encoded character in
+buffer C<buf>. C<buf_end> should point to one position beyond the end of the
+buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
+encoded character.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut */
+
+STRLEN
+Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
+{
+
+ STRLEN len;
+
+ PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
+
+ if (buf_end <= buf) {
+ return 0;
+ }
+
+ len = buf_end - buf;
+ if (len > UTF8SKIP(buf)) {
+ len = UTF8SKIP(buf);
+ }
+
+#ifdef IS_UTF8_CHAR
+ if (IS_UTF8_CHAR_FAST(len))
+ return IS_UTF8_CHAR(buf, len) ? len : 0;
+#endif /* #ifdef IS_UTF8_CHAR */
+ return is_utf8_char_slow(buf, len);
+}
+
+/*
=for apidoc is_utf8_char
+DEPRECATED!
+
Tests if some arbitrary number of bytes begins in a valid UTF-8
character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
character is a valid UTF-8 character. The actual number of bytes in the UTF-8
character will be returned if it is valid, otherwise 0.
-WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or
-UTF8SKIP(s) bytes.
+This function is deprecated due to the possibility that malformed input could
+cause reading beyond the end of the input buffer. Use C<is_utf8_char_buf>
+instead.
=cut */
+
STRLEN
Perl_is_utf8_char(const U8 *s)
{
- const STRLEN len = UTF8SKIP(s);
-
PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-#ifdef IS_UTF8_CHAR
- if (IS_UTF8_CHAR_FAST(len))
- return IS_UTF8_CHAR(s, len) ? len : 0;
-#endif /* #ifdef IS_UTF8_CHAR */
- return is_utf8_char_slow(s, len);
+
+ /* Assumes we have enough space, which is why this is deprecated */
+ return is_utf8_char_buf(s, s + UTF8SKIP(s));
}
PERL_ARGS_ASSERT_IS_UTF8_COMMON;
- if (!is_utf8_char(p))
+ /* The API should have included a length for the UTF-8 character in <p>,
+ * but it doesn't. We therefor assume that p has been validated at least
+ * as far as there being enough bytes available in it to accommodate the
+ * character without reading beyond the end, and pass that number on to the
+ * validating routine */
+ if (!is_utf8_char_buf(p, p + UTF8SKIP(p)))
return FALSE;
if (!*swash)
*swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
}
+bool
+Perl__is_utf8_quotemeta(pTHX_ const U8 *p)
+{
+ /* For exclusive use of pp_quotemeta() */
+
+ dVAR;
+
+ PERL_ARGS_ASSERT__IS_UTF8_QUOTEMETA;
+
+ return is_utf8_common(p, &PL_utf8_quotemeta, "_Perl_Quotemeta");
+}
+
/*
=for apidoc to_utf8_case