PERL_STATIC_INLINE struct regexp *
S_ReANY(const REGEXP * const re)
{
+ XPV* const p = (XPV*)SvANY(re);
assert(isREGEXP(re));
- return re->sv_u.svu_rx;
+ return SvTYPE(re) == SVt_PVLV ? p->xpv_len_u.xpvlenu_rx
+ : (struct regexp *)p;
}
/* ------------------------------- sv.h ------------------------------- */
C<L</is_c9strict_utf8_string_loclen>>.
=cut
+
+*/
+
+#define is_utf8_invariant_string(s, len) \
+ is_utf8_invariant_string_loc(s, len, NULL)
+
+/*
+=for apidoc is_utf8_invariant_string_loc
+
+Like C<L</is_utf8_invariant_string>> but upon failure, stores the location of
+the first UTF-8 variant character in the C<ep> pointer; if all characters are
+UTF-8 invariant, this function does not change the contents of C<*ep>.
+
+=cut
+
*/
PERL_STATIC_INLINE bool
-S_is_utf8_invariant_string(const U8* const s, const STRLEN len)
+S_is_utf8_invariant_string_loc(const U8* const s, STRLEN len, const U8 ** ep)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
+ const U8* send;
const U8* x = s;
- PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING;
+ PERL_ARGS_ASSERT_IS_UTF8_INVARIANT_STRING_LOC;
+
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
+
+ send = s + len;
+
+#ifndef EBCDIC
+ /* Try to get the widest word on this platform */
+# ifdef HAS_LONG_LONG
+# define PERL_WORDCAST unsigned long long
+# define PERL_WORDSIZE LONGLONGSIZE
+# else
+# define PERL_WORDCAST UV
+# define PERL_WORDSIZE UVSIZE
+# endif
+
+# if PERL_WORDSIZE == 4
+# define PERL_VARIANTS_WORD_MASK 0x80808080
+# define PERL_WORD_BOUNDARY_MASK 0x3
+# elif PERL_WORDSIZE == 8
+# define PERL_VARIANTS_WORD_MASK UINT64_C(0x8080808080808080)
+# define PERL_WORD_BOUNDARY_MASK 0x7
+# else
+# error Unexpected word size
+# endif
+
+ /* Process per-byte until reach word boundary. XXX This loop could be
+ * eliminated if we knew that this platform had fast unaligned reads */
+ while (x < send && (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK)) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
- for (; x < send; ++x) {
- if (!UTF8_IS_INVARIANT(*x))
- return FALSE;
+ return FALSE;
+ }
+ x++;
+ }
+
+ /* Process per-word as long as we have at least a full word left */
+ while (x + PERL_WORDSIZE <= send) {
+ if ((* (PERL_WORDCAST *) x) & PERL_VARIANTS_WORD_MASK) {
+
+ /* Found a variant. Just return if caller doesn't want its exact
+ * position */
+ if (! ep) {
+ return FALSE;
+ }
+
+ /* Otherwise fall into final loop to find which byte it is */
+ break;
+ }
+ x += PERL_WORDSIZE;
+ }
+
+# undef PERL_WORDCAST
+# undef PERL_WORDSIZE
+# undef PERL_WORD_BOUNDARY_MASK
+# undef PERL_VARIANTS_WORD_MASK
+#endif
+
+ /* Process per-byte */
+ while (x < send) {
+ if (! UTF8_IS_INVARIANT(*x)) {
+ if (ep) {
+ *ep = x;
+ }
+
+ return FALSE;
+ }
+
+ x++;
}
return TRUE;
See also
C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loclen>>,
C<L</is_utf8_fixed_width_buf_flags>>,
=cut
*/
+#define is_utf8_string(s, len) is_utf8_string_loclen(s, len, NULL, NULL)
+
+#if defined(PERL_CORE) || defined (PERL_EXT)
+
+/*
+=for apidoc is_utf8_non_invariant_string
+
+Returns TRUE if L<perlapi/is_utf8_invariant_string> returns FALSE for the first
+C<len> bytes of the string C<s>, but they are, nonetheless, legal Perl-extended
+UTF-8; otherwise returns FALSE.
+
+A TRUE return means that at least one code point represented by the sequence
+either is a wide character not representable as a single byte, or the
+representation differs depending on whether the sequence is encoded in UTF-8 or
+not.
+
+See also
+C<L<perlapi/is_utf8_invariant_string>>,
+C<L<perlapi/is_utf8_string>>
+
+=cut
+
+This is commonly used to determine if a SV's UTF-8 flag should be turned on.
+It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
+it otherwise contains invalid UTF-8.
+
+It is an internal function because khw thinks that XS code shouldn't be working
+at this low a level. A valid use case could change that.
+
+*/
+
PERL_STATIC_INLINE bool
-Perl_is_utf8_string(const U8 *s, const STRLEN len)
+S_is_utf8_non_invariant_string(const U8* const s, STRLEN len)
{
- /* This is now marked pure in embed.fnc, because isUTF8_CHAR now is pure.
- * Be aware of possible changes to that */
-
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
+ const U8 * first_variant;
- PERL_ARGS_ASSERT_IS_UTF8_STRING;
+ PERL_ARGS_ASSERT_IS_UTF8_NON_INVARIANT_STRING;
- while (x < send) {
- const STRLEN cur_len = isUTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- return FALSE;
- }
- x += cur_len;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ return FALSE;
}
- return TRUE;
+ return is_utf8_string(first_variant, len - (first_variant - s));
}
+#endif
+
/*
=for apidoc is_strict_utf8_string
See also
C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
C<L</is_utf8_string>>,
C<L</is_utf8_string_flags>>,
C<L</is_utf8_string_loc>>,
=cut
*/
-PERL_STATIC_INLINE bool
-S_is_strict_utf8_string(const U8 *s, const STRLEN len)
-{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
-
- PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING;
-
- while (x < send) {
- const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- return FALSE;
- }
- x += cur_len;
- }
-
- return TRUE;
-}
+#define is_strict_utf8_string(s, len) is_strict_utf8_string_loclen(s, len, NULL, NULL)
/*
=for apidoc is_c9strict_utf8_string
See also
C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
C<L</is_utf8_string>>,
C<L</is_utf8_string_flags>>,
C<L</is_utf8_string_loc>>,
=cut
*/
-PERL_STATIC_INLINE bool
-S_is_c9strict_utf8_string(const U8 *s, const STRLEN len)
-{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
-
- PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING;
-
- while (x < send) {
- const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- return FALSE;
- }
- x += cur_len;
- }
-
- return TRUE;
-}
-
-/* The above 3 functions could have been moved into the more general one just
- * below, and made #defines that call it with the right 'flags'. They are
- * currently kept separate to increase their chances of getting inlined */
+#define is_c9strict_utf8_string(s, len) is_c9strict_utf8_string_loclen(s, len, NULL, 0)
/*
=for apidoc is_utf8_string_flags
See also
C<L</is_utf8_invariant_string>>,
+C<L</is_utf8_invariant_string_loc>>,
C<L</is_utf8_string>>,
C<L</is_utf8_string_loc>>,
C<L</is_utf8_string_loc_flags>>,
*/
PERL_STATIC_INLINE bool
-S_is_utf8_string_flags(const U8 *s, const STRLEN len, const U32 flags)
+S_is_utf8_string_flags(const U8 *s, STRLEN len, const U32 flags)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_DISALLOW_ABOVE_31_BIT)));
+ |UTF8_DISALLOW_PERL_EXTENDED)));
+
+ if (len == 0) {
+ len = strlen((const char *)s);
+ }
if (flags == 0) {
return is_utf8_string(s, len);
}
- if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
{
return is_strict_utf8_string(s, len);
}
- if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
{
return is_c9strict_utf8_string(s, len);
}
- while (x < send) {
- STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
- if (UNLIKELY(! cur_len)) {
- return FALSE;
+ if (! is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+
+ while (x < send) {
+ STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ return FALSE;
+ }
+ x += cur_len;
}
- x += cur_len;
}
return TRUE;
*/
PERL_STATIC_INLINE bool
-Perl_is_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
- while (x < send) {
- const STRLEN cur_len = isUTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- break;
- }
- x += cur_len;
- outlen++;
+ if (len == 0) {
+ len = strlen((const char *) s);
}
- if (el)
- *el = outlen;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
- if (ep) {
- *ep = x;
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
}
- return (x == send);
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isUTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
}
/*
*/
PERL_STATIC_INLINE bool
-S_is_strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_STRICT_UTF8_STRING_LOCLEN;
- while (x < send) {
- const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- break;
- }
- x += cur_len;
- outlen++;
+ if (len == 0) {
+ len = strlen((const char *) s);
}
- if (el)
- *el = outlen;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
+ }
- if (ep) {
- *ep = x;
+ return TRUE;
}
- return (x == send);
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isSTRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
}
/*
*/
PERL_STATIC_INLINE bool
-S_is_c9strict_utf8_string_loclen(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el)
+S_is_c9strict_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_C9STRICT_UTF8_STRING_LOCLEN;
- while (x < send) {
- const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
- if (UNLIKELY(! cur_len)) {
- break;
- }
- x += cur_len;
- outlen++;
+ if (len == 0) {
+ len = strlen((const char *) s);
}
- if (el)
- *el = outlen;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
- if (ep) {
- *ep = x;
+ if (ep) {
+ *ep = s + len;
+ }
+
+ return TRUE;
}
- return (x == send);
+ {
+ const U8* const send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isC9_STRICT_UTF8_CHAR(x, send);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
+
+ if (ep) {
+ *ep = x;
+ }
+
+ return (x == send);
+ }
}
/*
*/
PERL_STATIC_INLINE bool
-S_is_utf8_string_loclen_flags(const U8 *s, const STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
+S_is_utf8_string_loclen_flags(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el, const U32 flags)
{
- const U8* const send = s + (len ? len : strlen((const char *)s));
- const U8* x = s;
- STRLEN outlen = 0;
+ const U8 * first_variant;
PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_DISALLOW_ABOVE_31_BIT)));
+ |UTF8_DISALLOW_PERL_EXTENDED)));
+
+ if (len == 0) {
+ len = strlen((const char *) s);
+ }
if (flags == 0) {
return is_utf8_string_loclen(s, len, ep, el);
}
- if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
== UTF8_DISALLOW_ILLEGAL_INTERCHANGE)
{
return is_strict_utf8_string_loclen(s, len, ep, el);
}
- if ((flags & ~UTF8_DISALLOW_ABOVE_31_BIT)
+ if ((flags & ~UTF8_DISALLOW_PERL_EXTENDED)
== UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE)
{
return is_c9strict_utf8_string_loclen(s, len, ep, el);
}
- while (x < send) {
- const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
- if (UNLIKELY(! cur_len)) {
- break;
+ if (is_utf8_invariant_string_loc(s, len, &first_variant)) {
+ if (el)
+ *el = len;
+
+ if (ep) {
+ *ep = s + len;
}
- x += cur_len;
- outlen++;
+
+ return TRUE;
}
- if (el)
- *el = outlen;
+ {
+ const U8* send = s + len;
+ const U8* x = first_variant;
+ STRLEN outlen = first_variant - s;
+
+ while (x < send) {
+ const STRLEN cur_len = isUTF8_CHAR_flags(x, send, flags);
+ if (UNLIKELY(! cur_len)) {
+ break;
+ }
+ x += cur_len;
+ outlen++;
+ }
+
+ if (el)
+ *el = outlen;
- if (ep) {
- *ep = x;
- }
+ if (ep) {
+ *ep = x;
+ }
- return (x == send);
+ return (x == send);
+ }
}
/*
PERL_ARGS_ASSERT_IS_UTF8_VALID_PARTIAL_CHAR_FLAGS;
assert(0 == (flags & ~(UTF8_DISALLOW_ILLEGAL_INTERCHANGE
- |UTF8_DISALLOW_ABOVE_31_BIT)));
+ |UTF8_DISALLOW_PERL_EXTENDED)));
if (s >= e || s + UTF8SKIP(s) <= e) {
return FALSE;
PERL_STATIC_INLINE bool
S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
- const STRLEN len,
+ STRLEN len,
const U8 **ep,
STRLEN *el,
const U32 flags)
return 1;
}
-I32
+PERL_STATIC_INLINE I32
Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
{
/* Compare non-utf8 using Unicode (Latin1) semantics. Does not work on
=cut
*/
-I32
+PERL_STATIC_INLINE I32
Perl_foldEQ_locale(const char *s1, const char *s2, I32 len)
{
dVAR;
return 1;
}
+#if ! defined (HAS_MEMRCHR) && (defined(PERL_CORE) || defined(PERL_EXT))
+
+PERL_STATIC_INLINE void *
+S_my_memrchr(const char * s, const char c, const STRLEN len)
+{
+ /* memrchr(), since many platforms lack it */
+
+ const char * t = s + len - 1;
+
+ PERL_ARGS_ASSERT_MY_MEMRCHR;
+
+ while (t >= s) {
+ if (*t == c) {
+ return (void *) t;
+ }
+ t--;
+ }
+
+ return NULL;
+}
+
+#endif
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/