else {
anded_flags = ANYOF_FLAGS(and_with)
&( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER);
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
}
}
if (OP(or_with) != ANYOFD) {
ored_flags
|= ANYOF_FLAGS(or_with)
- & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
+ & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
}
}
* by the time we reach here */
assert(! (ANYOF_FLAGS(ssc)
& ~( ANYOF_COMMON_FLAGS
- |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)));
+ |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
if (end == UV_MAX && start <= NUM_ANYOF_CODE_POINTS) {
ANYOF_FLAGS(node) |= ANYOF_MATCHES_ALL_ABOVE_BITMAP;
}
- else if (end >= NUM_ANYOF_CODE_POINTS) {
- ANYOF_FLAGS(node) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
- }
/* Quit if are above what we should change */
if (start >= NUM_ANYOF_CODE_POINTS) {
optimizable = FALSE; /* Will have to leave this an
ANYOF node */
- /* We don't know yet, so have to assume that the
- * property could match something in the upper Latin1
- * range, hence something that isn't utf8. Note that
- * this would cause things in <depends_list> to match
- * inappropriately, except that any \p{}, including
- * this one forces Unicode semantics, which means there
- * is no <depends_list> */
- ANYOF_FLAGS(ret)
- |= ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES;
+ /* We don't know yet what this matches, so have to flag
+ * it */
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
else {
else {
cp_list = has_upper_latin1_only_utf8_matches;
}
- ANYOF_FLAGS(ret) |= ANYOF_HAS_UTF8_NONBITMAP_MATCHES;
+ ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP;
}
/* If there is a swash and more than one element, we can't use the swash in
if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
assert(! (ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)));
+ & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP));
ARG_SET(node, ANYOF_ONLY_HAS_BITMAP);
}
else {
AV * const av = newAV();
SV *rv;
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
- assert(ANYOF_FLAGS(node)
- & (ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES|ANYOF_LOC_FOLD));
-
if (data && data->count) {
const U32 n = ARG(node);
}
}
- if ((flags & (ANYOF_MATCHES_ALL_ABOVE_BITMAP
- |ANYOF_HAS_UTF8_NONBITMAP_MATCHES
- |ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES
- |ANYOF_LOC_FOLD)))
+ if ((flags
+ & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP
+ |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
+ |ANYOF_LOC_FOLD)))
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
if (*s == '\n') {
const char * const t = ++s;
- if (flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) {
- sv_catpvs(sv, "{outside bitmap}");
- }
- else {
- sv_catpvs(sv, "{utf8}");
+ if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) {
+ if (OP(o) == ANYOFD) {
+ sv_catpvs(sv, "{utf8}");
+ }
+ else {
+ sv_catpvs(sv, "{outside bitmap}");
+ }
}
if (byte_output) {
#define ANYOF_LOC_FOLD 0x04
/* If set, means to warn if runtime locale isn't a UTF-8 one. Only under /l.
- * If set, none of INVERT, LOC_FOLD, POSIXL, HAS_NONBITMAP_NON_UTF8_MATCHES can
+ * If set, none of INVERT, LOC_FOLD, POSIXL,
+ * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP can
* be set. Can be in an SSC */
#define ANYOF_LOC_REQ_UTF8 0x08
* Can be in an SSC */
#define ANYOF_MATCHES_ALL_ABOVE_BITMAP 0x10
-/* If set, the node can match something outside the bitmap that isn't in utf8;
- * never set under /d nor in an SSC */
-#define ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES 0x20
+/* Spare: 0x20 */
-/* Are there things outside the bitmap that will match only if the target
- * string is encoded in UTF-8? (This is not set if ANYOF_ABOVE_BITMAP_ALL is
- * set). Can be in SSC */
-#define ANYOF_HAS_UTF8_NONBITMAP_MATCHES 0x40
+/* Shared bit:
+ * Under /d it means the ANYOFD node matches more things if the target
+ * string is encoded in UTF-8; any such things will be non-ASCII
+ * characters that are < 256, and can be accessed via the swash.
+ * When not under /d, it means the ANYOF node contains a user-defined
+ * property that wasn't yet defined at the time the regex was compiled,
+ * and so must be looked up at runtime, by creating a swash.
+ * (These uses are mutually exclusive because a user-defined property is
+ * specified by \p{}, and \p{} implies /u which deselects /d). The long macro
+ * name is to make sure that you are cautioned about its shared nature. Only
+ * the non-/d meaning can be in an SSC */
+#define ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP 0x40
/* Shared bit:
* Under /d it means the ANYOFD node matches all non-ASCII Latin1
/* These are the flags that apply to both regular ANYOF nodes and synthetic
* start class nodes during construction of the SSC. During finalization of
* the SSC, other of the flags may get added to it */
-#define ANYOF_COMMON_FLAGS ( ANYOF_HAS_UTF8_NONBITMAP_MATCHES \
- |ANYOF_LOC_REQ_UTF8)
+#define ANYOF_COMMON_FLAGS ANYOF_LOC_REQ_UTF8
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
{
match = TRUE; /* Everything above the bitmap matches */
}
- else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
- || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
- || ((flags & ANYOF_LOC_FOLD)
- && IN_UTF8_CTYPE_LOCALE
- && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
+ /* Here doesn't match everything above the bitmap. If there is
+ * some information available beyond the bitmap, we may find a
+ * match in it. If so, this is most likely because the code point
+ * is outside the bitmap range. But rarely, it could be because of
+ * some other reason. If so, various flags are set to indicate
+ * this possibility. On ANYOFD nodes, there may be matches that
+ * happen only when the target string is UTF-8; or for other node
+ * types, because runtime lookup is needed, regardless of the
+ * UTF-8ness of the target string. Finally, under /il, there may
+ * be some matches only possible if the locale is a UTF-8 one. */
+ else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
+ && ( c >= NUM_ANYOF_CODE_POINTS
+ || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+ && ( UNLIKELY(OP(n) != ANYOFD)
+ || (utf8_target && ! isASCII_uni(c)
+# if NUM_ANYOF_CODE_POINTS > 256
+ && c < 256
+# endif
+ )))
+ || (( flags & ANYOF_LOC_FOLD)
+ && IN_UTF8_CTYPE_LOCALE)))
{
SV* only_utf8_locale = NULL;
SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,