/* This is the stuff that used to live in regexp.h that was truly
private to the engine itself. It now lives here. */
-
-
typedef struct regexp_internal {
int name_list_idx; /* Optional data index of an array of paren names */
union {
#define PREGf_ANCH_MBOL 0x00000400
#define PREGf_ANCH_SBOL 0x00000800
#define PREGf_ANCH_GPOS 0x00001000
+#define PREGf_RECURSE_SEEN 0x00002000
#define PREGf_ANCH \
( PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | PREGf_ANCH_MBOL )
#define REG_MAGIC 0234
-#define SIZE_ONLY (RExC_emit == (regnode *) & RExC_emit_dummy)
+#define SIZE_ONLY cBOOL(RExC_emit == (regnode *) & RExC_emit_dummy)
#define PASS1 SIZE_ONLY
#define PASS2 (! SIZE_ONLY)
* 2) A subset of item 1) is if all possible code points outside the bitmap
* match. This is a common occurrence when the class is complemented,
* like /[^ij]/. Therefore a bit is reserved to indicate this,
- * ANYOF_MATCHES_ALL_ABOVE_BITMAP. If it became necessary, this flag could
- * be replaced by using the normal swash mechanism, but with a performance
- * penalty.
+ * rather than having an expensive swash created,
+ * ANYOF_MATCHES_ALL_ABOVE_BITMAP.
* 3) Under /d rules, it can happen that code points that are in the upper
* latin1 range (\x80-\xFF or their equivalents on EBCDIC platforms) match
* only if the runtime target string being matched against is UTF-8. For
* example /[\w[:punct:]]/d. This happens only for posix classes (with a
- * couple of exceptions, like \d), and all such ones also have
- * above-bitmap matches. Thus, 3) implies 1) as well. Note that /d rules
- * are no longer encouraged; 'use 5.14' or higher deselects them. But a
- * flag is required so that they can be properly handled. But it can be a
- * shared flag: see 5) below.
- * 4) Also under /d rules, something like /[\Wfoo] will match everything in
+ * couple of exceptions, like \d where it doesn't happen), and all such
+ * ones also have above-bitmap matches. Thus, 3) implies 1) as well.
+ * Note that /d rules are no longer encouraged; 'use 5.14' or higher
+ * deselects them. But a flag is required so that they can be properly
+ * handled. But it can be a shared flag: see 5) below.
+ * 4) Also under /d rules, something like /[\Wfoo]/ will match everything in
* the \x80-\xFF range, unless the string being matched against is UTF-8.
* A swash could be created for this case, but this is relatively common,
* and it turns out that it's all or nothing: if any one of these code
* traditionally not worried too much about its performance. And this
* condition requires the ANYOFL_FOLD flag to be set, so testing for
* that flag would be sufficient to rule out most cases of this. So it is
- * unclear if this should have a flag or not. But, one is currently
- * allocated for this purpose, ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE (and
- * the text below indicates how to share it, should another bit be
- * needed).
+ * unclear if this should have a flag or not. But, this flag can be
+ * shared with another, so it doesn't occupy extra space.
+ *
+ * At the moment, there is one spare bit, but this could be increased by
+ * various tricks.
*
- * At the moment, there are no spare bits, but this could be changed by various
- * tricks.
+ * If just one more bit is needed, at this writing it seems to khw that the
+ * best choice would be to make ANYOF_MATCHES_ALL_ABOVE_BITMAP not a flag, but
+ * something like
*
- * Note that item ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE is not independent of
- * the ANYOFL_FOLD flag below. Also, the ANYOFL_UTF8_LOCALE_REQD flag is set
- * only if both these aren't. We can therefore share
- * ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE with ANYOFL_UTF8_LOCALE_REQD, so what
- * the shared flag means depends on the ANYOFL_FOLD flag.
+ * #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2)
*
- * Beyond that, note that the information may be conveyed by creating new
- * regnode types. This is not the best solution, as shown later in this
- * paragraph, but it is something that is feasible. We could have a regnode
- * for ANYOF_INVERT, for example. A complication of this is that the regexec.c
- * REGINCLASS macro assumes that it can just use the bitmap if no flags are
- * set. This would have to be changed to add extra tests for the node type, or
- * a special flag reserved that means unspecified special handling, and then the
- * node-type would be used internally to sort that out. So we could gain a bit
- * by having an ANYOF_SPECIAL flag, and a node type for INVERT, and another for
- * POSIXL, and still another for INVERT_POSIXL. This example illustrates one
- * problem with this, a combinatorial explosion of node types. The one node
- * type khw can think of that doesn't have this explosion issue is
- * ANYOFL_UTF8_LOCALE_REQD. This flag is a natural candidate for being a separate
- * node type because it is a specialization of the current ANYOFL, and because
- * no other ANYOFL-only flags are set when it is; also most of its uses are
- * actually outside the reginclass() function, so this could be done with no
- * performance penalty. But since it can be shared, as noted above, it doesn't
- * take up space anyway. Another issue when turning a flag into a node type, is
- * that a SSC may use that flag -- not just a regular ANYOF[DL]?. In the case
- * of ANYOFL_UTF8_LOCALE_REQD, the only likely problem is accurately settting the
- * SSC node-type to the new one, which would likely involve S_ssc_or and
- * S_ssc_and, and not how the SSC currently gets set to ANYOFL.
+ * and access it through the ARG like ANYOF_ONLY_HAS_BITMAP is. This flag is
+ * used by all ANYOF node types, and it could be used to avoid calling the
+ * handler function, as the macro REGINCLASS in regexec.c does now for other
+ * cases.
*
- * Another possibility is to instead rename the ANYOF_POSIXL flag to be
- * ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits beyond what a
- * regular one does. That's what it effectively means now, with the extra
- * space all for the POSIX class flags. But those classes actually only occupy
- * 30 bits, so 2 of the locale flags could be moved to that extra space. The
- * downside of this is that ANYOFL nodes with whichever of the flags get moved
- * would have to have the extra space always allocated.
+ * Another possibility is to instead (or additionally) rename the ANYOF_POSIXL
+ * flag to be ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits
+ * beyond what a regular one does. That's what it effectively means now, with
+ * the extra space all for the POSIX class flags. But those classes actually
+ * only occupy 30 bits, so the ANYOFL_FOLD and
+ * ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD flags could be moved
+ * to that extra space. The 30 bits in the extra word would indicate if a
+ * posix class should be looked up or not. The downside of this is that ANYOFL
+ * nodes with folding would always have to have the extra space allocated, even
+ * if they didn't use the 30 posix bits. There isn't an SSC problem as all
+ * SSCs are this large anyway.
*
* One could completely remove ANYOFL_LARGE and make all ANYOFL nodes large.
- * The 30 bits in the extra word would indicate if a posix class should be
- * looked up or not. There isn't an SSC problem as all SSCs are this large
- * anyway, and the SSC could be set to this node type. REGINCLASS would have
- * to be modified so that if the node type were this, it would call
- * reginclass(), as the flag bit that indicates to do this now would be gone.
- * If 2 locale flags are moved to the larger structure, this would free up a
- * total of 4 bits. If this were done, we could create an ANYOF_INVERT
- * node-type without a combinatorial explosion, getting us to 5 bits. And,
- * keep in mind that ANYOF_MATCHES_ALL_ABOVE_BITMAP is solely for performance,
- * so could be removed. The other performance-related flags are shareable with
- * flags that are required.
+ * REGINCLASS would have to be modified so that if the node type were this, it
+ * would call reginclass(), as the flag bit that indicates to do this now would
+ * be gone.
*
- * Several flags are not used in synthetic start class (SSC) nodes, so could be
+ * All told, 5 bits could be available for other uses if all of the above were
+ * done.
+ *
+ * Some flags are not used in synthetic start class (SSC) nodes, so could be
* shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING
* now. */
* then. Only set under /l; never in an SSC */
#define ANYOFL_FOLD 0x04
-/* If set, ANYOFL_FOLD is also set, and there are potential matches that
- * will be valid only if the locale is a UTF-8 one. */
-#define ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE 0x08
-
-/* If set, means to warn if runtime locale isn't a UTF-8 one. Only under /l.
- * If set, none of INVERT, LOC_FOLD, POSIXL,
- * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP can
- * be set. Can be in an SSC */
-#define ANYOFL_UTF8_LOCALE_REQD 0x10
+/* Shared bit set only with ANYOFL and SSC nodes:
+ * If ANYOFL_FOLD is set, this means there are potential matches valid
+ * only if the locale is a UTF-8 one.
+ * If ANYOFL_FOLD is NOT set, this means to warn if the runtime locale
+ * isn't a UTF-8 one (and the generated node assumes a UTF-8 locale).
+ * None of INVERT, POSIXL,
+ * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
+ * can be set. */
+#define ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD 0x08
+
+/* Convenience macros for teasing apart the meanings when reading the above bit
+ * */
+#define ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) \
+ ((flags & ( ANYOFL_FOLD /* Both bits are set */ \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \
+ == ( ANYOFL_FOLD \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD))
+
+#define ANYOFL_UTF8_LOCALE_REQD(flags) \
+ ((flags & ( ANYOFL_FOLD /* Only REQD bit is set */ \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \
+ == ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)
+
+/* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x10 */
/* If set, the node matches every code point NUM_ANYOF_CODE_POINTS and above.
* Can be in an SSC */
* are cautioned about its shared nature */
#define ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 0x80
-#define ANYOF_FLAGS_ALL (0xff)
+#define ANYOF_FLAGS_ALL (0xff & ~0x10)
#define ANYOF_LOCALE_FLAGS (ANYOFL_FOLD | ANYOF_MATCHES_POSIXL)
/* These are the flags that apply to both regular ANYOF nodes and synthetic
* start class nodes during construction of the SSC. During finalization of
* the SSC, other of the flags may get added to it */
-#define ANYOF_COMMON_FLAGS ANYOFL_UTF8_LOCALE_REQD
+#define ANYOF_COMMON_FLAGS 0
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
#define ANYOF_POSIXL_AND(source, dest) STMT_START { (dest)->classflags &= (source)->classflags ; } STMT_END
-#define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
-#define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap)
-#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[(((U8)(c)) >> 3) & 31])
+#define ANYOF_BITMAP_ZERO(ret) Zero(((regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
+#define ANYOF_BITMAP(p) ((regnode_charclass*)(p))->bitmap
+#define ANYOF_BITMAP_BYTE(p, c) BITMAP_BYTE(ANYOF_BITMAP(p), c)
#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c))
#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c))
#define ANYOF_BITMAP_TEST(p, c) cBOOL(ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c))
#define ANYOF_BITMAP_CLEARALL(p) \
Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE)
-#define ANYOF_SKIP EXTRA_SIZE(struct regnode_charclass)
+#define ANYOF_SKIP EXTRA_SIZE(regnode_charclass)
#define ANYOF_POSIXL_SKIP EXTRA_SIZE(regnode_charclass_posixl)
/*
#define REG_CUTGROUP_SEEN 0x00000100
#define REG_RUN_ON_COMMENT_SEEN 0x00000200
#define REG_UNFOLDED_MULTI_SEEN 0x00000400
-#define REG_GOSTART_SEEN 0x00000800
+/* spare */
#define REG_UNBOUNDED_QUANTIFIER_SEEN 0x00001000
three different sets... */
#define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap)
-#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)(c)) >> 3) & 31])
+#define TRIE_BITMAP_BYTE(p, c) BITMAP_BYTE(TRIE_BITMAP(p), c)
#define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c))
#define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c))
#define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
#define IS_TRIE_AC(op) ((op)>=AHOCORASICK)
-#define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31])
+#define BITMAP_BYTE(p, c) (( (U8*) p) [ ( ( (UV) (c)) >> 3) ] )
#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
/* these defines assume uniquecharcount is the correct variable, and state may be evaluated twice */
/* get_sv() can return NULL during global destruction. */
#define GET_RE_DEBUG_FLAGS DEBUG_r({ \
SV * re_debug_flags_sv = NULL; \
- re_debug_flags_sv = get_sv(RE_DEBUG_FLAGS, 1); \
+ re_debug_flags_sv = PL_curcop ? get_sv(RE_DEBUG_FLAGS, GV_ADD) : NULL; \
if (re_debug_flags_sv) { \
if (!SvIOK(re_debug_flags_sv)) \
sv_setuv(re_debug_flags_sv, RE_DEBUG_COMPILE_DUMP | RE_DEBUG_EXECUTE_MASK ); \
PERL_UNUSED_VAR(re_debug_flags); GET_RE_DEBUG_FLAGS;
#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \
- const char * const rpv = \
- pv_pretty((dsv), (pv), (l), (m), \
- PL_colors[(c1)],PL_colors[(c2)], \
+ const char * const rpv = \
+ pv_pretty((dsv), (pv), (l), \
+ (PL_dump_re_max_len) ? PL_dump_re_max_len : (m), \
+ PL_colors[(c1)],PL_colors[(c2)], \
PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \
const int rlen = SvCUR(dsv)
-#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \
- const char * const rpv = \
- pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \
- PL_colors[(c1)],PL_colors[(c2)], \
+#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \
+ const char * const rpv = \
+ pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), \
+ (PL_dump_re_max_len) ? PL_dump_re_max_len : (m), \
+ PL_colors[(c1)],PL_colors[(c2)], \
PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) )
#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) \
const char * const rpv = \
- pv_pretty((dsv), (pv), (l), (m), \
- PL_colors[0], PL_colors[1], \
+ pv_pretty((dsv), (pv), (l), \
+ (PL_dump_re_max_len) ? PL_dump_re_max_len : (m), \
+ PL_colors[0], PL_colors[1], \
( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_ESCAPE_NONASCII | PERL_PV_PRETTY_ELLIPSES | \
((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \
)
typedef enum {
TRADITIONAL_BOUND = _CC_WORDCHAR,
GCB_BOUND,
+ LB_BOUND,
SB_BOUND,
WB_BOUND
} bound_type;