ApR |I32 |is_lvalue_sub
: Used in cop.h
XopR |I32 |was_lvalue_sub
-iRn |STRLEN |_is_utf8_char_slow|NN const U8 *s|const STRLEN len
+iRn |STRLEN |_is_utf8_char_slow|NN const U8 *s|NN const U8 *e
ADMpPR |U32 |to_uni_upper_lc|U32 c
ADMpPR |U32 |to_uni_title_lc|U32 c
ADMpPR |U32 |to_uni_lower_lc|U32 c
}
/*
-Tests if the first C<len> bytes of string C<s> form a valid UTF-8
-character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a
-valid UTF-8 character. The number of bytes in the UTF-8 character
-will be returned if it is valid, otherwise 0.
-
-This is the "slow" version as opposed to the "fast" version which is
-the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
-difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
-or less you should use the IS_UTF8_CHAR(), for lengths of five or more
-you should use the _slow(). In practice this means that the _slow()
-will be used very rarely, since the maximum Unicode code point (as of
-Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
-the "Perl extended UTF-8" (e.g, the infamous 'v-strings') will encode into
-five bytes or more.
+A helper function for the macro isUTF8_CHAR(), which should be used instead of
+this function. The macro will handle smaller code points directly saving time,
+using this function as a fall-back for higher code points.
+Tests if the first bytes of string C<s> form a valid UTF-8 character. 0 is
+returned if the bytes starting at C<s> up to but not including C<e> do not form a
+complete well-formed UTF-8 character; otherwise the number of bytes in the
+character is returned.
+Note that an INVARIANT (i.e. ASCII on non-EBCDIC) character is a valid UTF-8
+character.
=cut */
PERL_STATIC_INLINE STRLEN
-S__is_utf8_char_slow(const U8 *s, const STRLEN len)
+S__is_utf8_char_slow(const U8 *s, const U8 *e)
{
dTHX; /* The function called below requires thread context */
PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW;
- utf8n_to_uvchr(s, len, &actual_len, UTF8_CHECK_ONLY);
+ assert(e >= s);
+ utf8n_to_uvchr(s, e - s, &actual_len, UTF8_CHECK_ONLY);
return (actual_len == (STRLEN) -1) ? 0 : actual_len;
}
have to guess. The API function C<is_utf8_string> can help; it'll tell
you if a string contains only valid UTF-8 characters. However, it can't
do the work for you. On a character-by-character basis,
-C<is_utf8_char_buf>
+C<isUTF8_CHAR>
will tell you whether the current character in a string is valid UTF-8.
=head2 How does UTF-8 represent Unicode characters?
=item *
-C<is_utf8_char_buf(buf, buf_end)> returns true if the pointer points to
+C<isUTF8_CHAR(buf, buf_end)> returns true if the pointer points to
a valid UTF-8 character.
=item *
#define PERL_ARGS_ASSERT__IS_UTF8_FOO \
assert(p)
-PERL_STATIC_INLINE STRLEN S__is_utf8_char_slow(const U8 *s, const STRLEN len)
+PERL_STATIC_INLINE STRLEN S__is_utf8_char_slow(const U8 *s, const U8 *e)
__attribute__warn_unused_result__
- __attribute__nonnull__(1);
+ __attribute__nonnull__(1)
+ __attribute__nonnull__(2);
#define PERL_ARGS_ASSERT__IS_UTF8_CHAR_SLOW \
- assert(s)
+ assert(s); assert(e)
PERL_CALLCONV bool Perl__is_utf8_mark(pTHX_ const U8 *p)
__attribute__warn_unused_result__
/*
=for apidoc is_utf8_char_buf
-Returns the number of bytes that comprise the first UTF-8 encoded character in
-buffer C<buf>. C<buf_end> should point to one position beyond the end of the
-buffer. 0 is returned if C<buf> does not point to a complete, valid UTF-8
-encoded character.
-
-Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
-machines) is a valid UTF-8 character.
+This is identical to the macro isUTF8_CHAR.
=cut */
Perl_is_utf8_char_buf(const U8 *buf, const U8* buf_end)
{
- STRLEN len;
-
PERL_ARGS_ASSERT_IS_UTF8_CHAR_BUF;
- if (buf_end <= buf) {
- return 0;
- }
-
- len = buf_end - buf;
- if (len > UTF8SKIP(buf)) {
- len = UTF8SKIP(buf);
- }
-
- if (IS_UTF8_CHAR_FAST(len))
- return IS_UTF8_CHAR(buf, len) ? len : 0;
- return _is_utf8_char_slow(buf, len);
+ return isUTF8_CHAR(buf, buf_end);
}
/*
PERL_ARGS_ASSERT_IS_UTF8_CHAR;
/* Assumes we have enough space, which is why this is deprecated */
- return is_utf8_char_buf(s, s + UTF8SKIP(s));
+ return isUTF8_CHAR(s, s + UTF8SKIP(s));
}
PERL_ARGS_ASSERT_IS_UTF8_STRING;
while (x < send) {
- /* Inline the easy bits of is_utf8_char() here for speed... */
- if (UTF8_IS_INVARIANT(*x)) {
- x++;
- }
- else {
- /* ... and call is_utf8_char() only if really needed. */
- const STRLEN c = UTF8SKIP(x);
- const U8* const next_char_ptr = x + c;
-
- if (next_char_ptr > send) {
- return FALSE;
- }
-
- if (IS_UTF8_CHAR_FAST(c)) {
- if (!IS_UTF8_CHAR(x, c))
- return FALSE;
- }
- else if (! _is_utf8_char_slow(x, c)) {
- return FALSE;
- }
- x = next_char_ptr;
- }
+ STRLEN len = isUTF8_CHAR(x, send);
+ if (UNLIKELY(! len)) {
+ return FALSE;
+ }
+ x += len;
}
return TRUE;
{
const U8* const send = s + (len ? len : strlen((const char *)s));
const U8* x = s;
- STRLEN c;
STRLEN outlen = 0;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
while (x < send) {
- const U8* next_char_ptr;
-
- /* Inline the easy bits of is_utf8_char() here for speed... */
- if (UTF8_IS_INVARIANT(*x))
- next_char_ptr = x + 1;
- else {
- /* ... and call is_utf8_char() only if really needed. */
- c = UTF8SKIP(x);
- next_char_ptr = c + x;
- if (next_char_ptr > send) {
- goto out;
- }
- if (IS_UTF8_CHAR_FAST(c)) {
- if (!IS_UTF8_CHAR(x, c))
- c = 0;
- } else
- c = _is_utf8_char_slow(x, c);
- if (!c)
- goto out;
- }
- x = next_char_ptr;
- outlen++;
+ STRLEN len = isUTF8_CHAR(x, send);
+ if (UNLIKELY(! len)) {
+ goto out;
+ }
+ x += len;
+ outlen++;
}
out:
* as far as there being enough bytes available in it to accommodate the
* character without reading beyond the end, and pass that number on to the
* validating routine */
- if (! is_utf8_char_buf(p, p + UTF8SKIP(p))) {
+ if (! isUTF8_CHAR(p, p + UTF8SKIP(p))) {
if (ckWARN_d(WARN_UTF8)) {
Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED,WARN_UTF8),
"Passing malformed UTF-8 to \"%s\" is deprecated", swashname);
((end) > (input) + 1) && \
toFOLD((input)[0]) == 's' && \
toFOLD((input)[1]) == 's')
+
#define SHARP_S_SKIP 2
/* If you want to exclude surrogates, and beyond legal Unicode, see the blame
* log for earlier versions which gave details for these */
-/* regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to these
+/* A helper macro for isUTF8_CHAR, so use that one, and not this one. This is
+ * retained solely for backwards compatibility and may be deprecated and
+ * removed in a future Perl version.
+ *
+ * regen/regcharclass.pl generates is_UTF8_CHAR_utf8() macros for up to this
* number of bytes. So this has to be coordinated with that file */
#ifdef EBCDIC
# define IS_UTF8_CHAR_FAST(n) ((n) <= 3)
#endif
#ifndef EBCDIC
-/* This was generated by regen/regcharclass.pl, and then moved here. The lines
- * that generated it were then commented out. This was done solely because it
- * takes on the order of 10 minutes to generate, and is never going to change.
- * The EBCDIC equivalent hasn't been commented out in regcharclass.pl, so it
- * should generate and run the correct stuff */
+/* A helper macro for isUTF8_CHAR, so use that one instead of this. This was
+ * generated by regen/regcharclass.pl, and then moved here. The lines that
+ * generated it were then commented out. This was done solely because it takes
+ * on the order of 10 minutes to generate, and is never going to change, unless
+ * the generated code is improved.
+ *
+ * The EBCDIC versions have been cut to not cover all of legal Unicode, so
+ * they don't take too long to generate.  There is a separate one for each
+ * code page, so they are in regcharclass.h instead of here */
/*
UTF8_CHAR: Matches utf8 from 1 to 4 bytes
: 0 )
#endif
-/* IS_UTF8_CHAR(p) is strictly speaking wrong (not UTF-8) because it
- * (1) allows UTF-8 encoded UTF-16 surrogates
- * (2) it allows code points past U+10FFFF.
- * The Perl_is_utf8_char() full "slow" code will handle the Perl
- * "extended UTF-8". */
-#define IS_UTF8_CHAR(p, n) (is_UTF8_CHAR_utf8_safe(p, (p) + (n)) == n)
-
+/*
+ * =for apidoc isUTF8_CHAR
+ *
+ * Returns the number of bytes beginning at C<s> which form a legal UTF-8 (or
+ * UTF-EBCDIC) encoded character, looking no further than C<e - s> bytes into
+ * C<s>. Returns 0 if the sequence starting at C<s> through C<e - 1> is not
+ * well-formed UTF-8.
+
+Note that an INVARIANT character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+This is a macro which evaluates both C<s> and C<e> more than once, so neither
+argument should be an expression with side effects. */
+
+/* Every use of 's' and 'e' is parenthesized so that an argument such as
+ * 'p + 1' is dereferenced and compared as a whole, not as '*p + 1' */
+#define isUTF8_CHAR(s, e) (((e) <= (s)) \
+ ? 0 \
+ : (UTF8_IS_INVARIANT(*(s))) \
+ ? 1 \
+ : (((e) - (s)) < UTF8SKIP(s)) \
+ ? 0 \
+ : (IS_UTF8_CHAR_FAST(UTF8SKIP(s))) \
+ ? is_UTF8_CHAR_utf8_safe((s), (e)) \
+ : _is_utf8_char_slow((s), (e)))
+
+/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
+ * retained solely for backwards compatibility.
+ *
+ * Evaluates to true when isUTF8_CHAR() reports that the character starting at
+ * 'p' is exactly 'n' bytes long, looking no further than 'p + n'. Both
+ * arguments are parenthesized in the expansion so that compound expressions
+ * passed for them keep their intended grouping; they are evaluated more than
+ * once, so should not have side effects. */
+#define IS_UTF8_CHAR(p, n) (isUTF8_CHAR((p), (p) + (n)) == (n))
#endif /* H_UTF8 */