#endif
/*
- * The "internal use only" fields in regexp.h are present to pass info from
- * compile to execute that permits the execute phase to run lots faster on
- * simple cases. They are:
- *
- * regstart sv that must begin a match; NULL if none obvious
- * reganch is the match anchored (at beginning-of-line only)?
- * regmust string (pointer into program) that match must include, or NULL
- * [regmust changed to SV* for bminstr()--law]
- * regmlen length of regmust string
- * [regmlen not used currently]
- *
- * Regstart and reganch permit very fast decisions on suitable starting points
- * for a match, cutting down the work a lot. Regmust permits fast rejection
- * of lines that cannot possibly match. The regmust tests are costly enough
- * that pregcomp() supplies a regmust only if the r.e. contains something
- * potentially expensive (at present, the only such thing detected is * or +
- * at the start of the r.e., which can involve a lot of backup). Regmlen is
- * supplied because the test in pregexec() needs it and pregcomp() is computing
- * it anyway.
- * [regmust is now supplied always. The tests that use regmust have a
- * heuristic that disables the test if it usually matches.]
- *
- * [In fact, we now use regmust in many cases to locate where the search
- * starts in the string, so if regback is >= 0, the regmust search is never
- * wasted effort. The regback variable says how many characters back from
- * where regmust matched is the earliest possible start of the match.
- * For instance, /[a-z].foo/ has a regmust of 'foo' and a regback of 2.]
- */
-
-/*
* Structure for regexp "program". This is essentially a linear encoding
* of a nondeterministic finite-state machine (aka syntax charts or
* "railroad normal form" in parsing technology). Each node is an opcode
private to the engine itself. It now lives here. */
typedef struct regexp_internal {
- int name_list_idx; /* Optional data index of an array of paren names */
union {
U32 *offsets; /* offset annotations 20001228 MJD
data about mapping the program to the
data that the regops need. Often the ARG field of
a regop is an index into this structure */
struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */
+ int name_list_idx; /* Optional data index of an array of paren names */
regnode program[1]; /* Unwarranted chumminess with compiler. */
} regexp_internal;
char string[1];
};
+struct regnode_lstring { /* Constructed this way to keep the string aligned. */
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ U32 str_len; /* Only 18 bits allowed before would overflow 'next_off' */
+ char string[1];
+};
+
+struct regnode_anyofhs { /* Constructed this way to keep the string aligned. */
+ U8 str_len;
+ U8 type;
+ U16 next_off;
+ U32 arg1; /* set by set_ANYOF_arg() */
+ char string[1];
+};
+
/* Argument bearing node - workhorse,
arg1 is often for the data field */
struct regnode_1 {
U32 arg1;
};
+/* Node whose argument is 'void *', a pointer to void. This needs to be used
+ * very carefully in situations where pointers won't become invalid because of,
+ * say re-mallocs */
+struct regnode_p {
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ void * arg1;
+};
+
/* Similar to a regnode_1 but with an extra signed argument */
struct regnode_2L {
U8 flags;
U16 arg2;
};
-/* This give the number of code points that can be in the bitmap of an ANYOF
- * node. The shift number must currently be one of: 8..12. It can't be less
- * than 8 (256) because some code relies on it being at least that. Above 12
- * (4096), and you start running into warnings that some data structure widths
- * have been exceeded, though the test suite as of this writing still passes
- * for up through 16, which is as high as anyone would ever want to go,
- * encompassing all of the Unicode BMP, and thus including all the economically
- * important world scripts. At 12 most of them are: including Arabic,
- * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
- * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
- * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * above 12.) Be sure to benchmark before changing, as larger sizes do
- * significantly slow down the test suite */
-#define NUM_ANYOF_CODE_POINTS (1 << 8)
-
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */
/* Note that these form structs which are supersets of the next smaller one, by
#undef ARG2
#define ARG(p) ARG_VALUE(ARG_LOC(p))
+#define ARGp(p) ARG_VALUE(ARGp_LOC(p))
#define ARG1(p) ARG_VALUE(ARG1_LOC(p))
#define ARG2(p) ARG_VALUE(ARG2_LOC(p))
#define ARG2L(p) ARG_VALUE(ARG2L_LOC(p))
#define ARG_SET(p, val) ARG__SET(ARG_LOC(p), (val))
+#define ARGp_SET(p, val) ARG__SET(ARGp_LOC(p), (val))
#define ARG1_SET(p, val) ARG__SET(ARG1_LOC(p), (val))
#define ARG2_SET(p, val) ARG__SET(ARG2_LOC(p), (val))
#define ARG2L_SET(p, val) ARG__SET(ARG2L_LOC(p), (val))
#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \
regnode types. For some, it's the \
character set of the regnode */
+#define STR_LENs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \
+ ((struct regnode_string *)p)->str_len)
+#define STRINGs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_REQ8) \
+ ((struct regnode_string *)p)->string)
+#define OPERANDs(p) STRINGs(p)
+
+/* Long strings. Currently limited to length 18 bits, which handles a 262000
+ * byte string. The limiting factor is the 16 bit 'next_off' field, which
+ * points to the next regnode, so the furthest away it can be is 2**16. On
+ * most architectures, regnodes are 2**2 bytes long, so that yields 2**18
+ * bytes. Should a longer string be desired, we could increase it to 26 bits
+ * fairly easily, by changing this node to have longj type which causes the ARG
+ * field to be used for the link to the next regnode (although code would have
+ * to be changed to account for this), and then use a combination of the flags
+ * and next_off fields for the length. To get 34 bit length, also change the
+ * node to be an ARG2L, using the second 32 bit field for the length, and not
+ * using the flags nor next_off fields at all. One could have an llstring node
+ * and even an lllstring type. */
+#define STR_LENl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \
+ (((struct regnode_lstring *)p)->str_len))
+#define STRINGl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \
+ (((struct regnode_lstring *)p)->string))
+#define OPERANDl(p) STRINGl(p)
+
+#define STR_LEN(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \
+ ? STR_LENl(p) : STR_LENs(p))
+#define STRING(p) ((OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \
+ ? STRINGl(p) : STRINGs(p))
#define OPERAND(p) STRING(p)
-#define STR_LEN(p) (((struct regnode_string *)p)->str_len)
-#define STRING(p) (((struct regnode_string *)p)->string)
-
/* The number of (smallest) regnode equivalents that a string of length l bytes
* occupies */
#define STR_SZ(l) (((l) + sizeof(regnode) - 1) / sizeof(regnode))
/* The number of (smallest) regnode equivalents that the EXACTISH node 'p'
* occupies */
-#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1)
+#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p)) + 1 + regarglen[(p)->type])
#define setSTR_LEN(p,v) \
- ((struct regnode_string *)(p))->str_len = (v);
+ STMT_START{ \
+ if (OP(p) == LEXACT || OP(p) == LEXACT_REQ8) \
+ ((struct regnode_lstring *)(p))->str_len = (v); \
+ else \
+ ((struct regnode_string *)(p))->str_len = (v); \
+ } STMT_END
+
+#define ANYOFR_BASE_BITS 20
+#define ANYOFRbase(p) (ARG(p) & ((1 << ANYOFR_BASE_BITS) - 1))
+#define ANYOFRdelta(p) (ARG(p) >> ANYOFR_BASE_BITS)
#undef NODE_ALIGN
#undef ARG_LOC
#define NODE_ALIGN(node)
#define ARG_LOC(p) (((struct regnode_1 *)p)->arg1)
+#define ARGp_LOC(p) (((struct regnode_p *)p)->arg1)
#define ARG1_LOC(p) (((struct regnode_2 *)p)->arg1)
#define ARG2_LOC(p) (((struct regnode_2 *)p)->arg2)
#define ARG2L_LOC(p) (((struct regnode_2L *)p)->arg2)
#define NODE_STEP_REGNODE 1 /* sizeof(regnode)/sizeof(regnode) */
#define EXTRA_STEP_2ARGS EXTRA_SIZE(struct regnode_2)
-#define NODE_STEP_B 4
-
#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
* that have a longer argument */ \
(offset) += regarglen[op]; \
} STMT_END
+#define FILL_ADVANCE_NODE_ARGp(offset, op, arg) \
+ STMT_START { \
+ ARGp_SET(REGNODE_p(offset), arg); \
+ FILL_ADVANCE_NODE(offset, op); \
+ (offset) += regarglen[op]; \
+ } STMT_END
#define FILL_ADVANCE_NODE_2L_ARG(offset, op, arg1, arg2) \
STMT_START { \
ARG_SET(REGNODE_p(offset), arg1); \
* 2) A subset of item 1) is if all possible code points outside the bitmap
* match. This is a common occurrence when the class is complemented,
* like /[^ij]/. Therefore a bit is reserved to indicate this,
- * rather than having a more expensive inversion list created,
+ * rather than having an inversion list created,
* ANYOF_MATCHES_ALL_ABOVE_BITMAP.
* 3) Under /d rules, it can happen that code points that are in the upper
* latin1 range (\x80-\xFF or their equivalents on EBCDIC platforms) match
#define RE_DEBUG_EXECUTE_TRIE 0x000400
/* Extra */
-#define RE_DEBUG_EXTRA_MASK 0x1FF0000
+#define RE_DEBUG_EXTRA_MASK 0x3FF0000
#define RE_DEBUG_EXTRA_TRIE 0x0010000
#define RE_DEBUG_EXTRA_OFFSETS 0x0020000
#define RE_DEBUG_EXTRA_OFFDEBUG 0x0040000
#define RE_DEBUG_EXTRA_BUFFERS 0x0400000
#define RE_DEBUG_EXTRA_GPOS 0x0800000
#define RE_DEBUG_EXTRA_DUMP_PRE_OPTIMIZE 0x1000000
+#define RE_DEBUG_EXTRA_WILDCARD 0x2000000
/* combined */
#define RE_DEBUG_EXTRA_STACK 0x0280000
-#define RE_DEBUG_FLAG(x) (re_debug_flags & x)
+#define RE_DEBUG_FLAG(x) (re_debug_flags & (x))
/* Compile */
#define DEBUG_COMPILE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_MASK)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_MASK)) x )
#define DEBUG_PARSE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_PARSE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_PARSE)) x )
#define DEBUG_OPTIMISE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_OPTIMISE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE)) x )
#define DEBUG_DUMP_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_DUMP)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_DUMP)) x )
#define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_TRIE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_TRIE)) x )
#define DEBUG_FLAGS_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_FLAGS)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_FLAGS)) x )
#define DEBUG_TEST_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_TEST)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_TEST)) x )
/* Execute */
#define DEBUG_EXECUTE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_MASK)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXECUTE_MASK)) x )
#define DEBUG_INTUIT_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_INTUIT)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXECUTE_INTUIT)) x )
#define DEBUG_MATCH_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_MATCH)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXECUTE_MATCH)) x )
#define DEBUG_TRIE_EXECUTE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_TRIE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXECUTE_TRIE)) x )
/* Extra */
#define DEBUG_EXTRA_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_MASK)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_MASK)) x )
#define DEBUG_OFFSETS_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFSETS)) x )
#define DEBUG_STATE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_STATE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STATE)) x )
#define DEBUG_STACK_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_STACK)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) x )
#define DEBUG_BUFFERS_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_BUFFERS)) x )
#define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \
if (DEBUG_v_TEST || ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \
- (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)))) x )
+ RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE))) x )
#define MJD_OFFSET_DEBUG(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_OFFDEBUG)) \
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_OFFDEBUG)) \
Perl_warn_nocontext x )
#define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_TRIE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_TRIE)) x )
#define DEBUG_TRIE_EXECUTE_MORE_r(x) DEBUG_TRIE_EXECUTE_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_TRIE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_TRIE)) x )
#define DEBUG_TRIE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \
- | RE_DEBUG_EXECUTE_TRIE ))) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_COMPILE_TRIE \
+ | RE_DEBUG_EXECUTE_TRIE )) x )
#define DEBUG_GPOS_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_GPOS)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_GPOS)) x )
#define DEBUG_DUMP_PRE_OPTIMIZE_r(x) DEBUG_r( \
- if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_DUMP_PRE_OPTIMIZE)) x )
+ if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_DUMP_PRE_OPTIMIZE)) x )
/* initialization */
-/* get_sv() can return NULL during global destruction. */
-#define GET_RE_DEBUG_FLAGS DEBUG_r({ \
- SV * re_debug_flags_sv = NULL; \
- re_debug_flags_sv = PL_curcop ? get_sv(RE_DEBUG_FLAGS, GV_ADD) : NULL; \
- if (re_debug_flags_sv) { \
- if (!SvIOK(re_debug_flags_sv)) \
- sv_setuv(re_debug_flags_sv, RE_DEBUG_COMPILE_DUMP | RE_DEBUG_EXECUTE_MASK ); \
- re_debug_flags=SvIV(re_debug_flags_sv); \
- }\
-})
-
#ifdef DEBUGGING
-#define GET_RE_DEBUG_FLAGS_DECL volatile IV re_debug_flags = 0; \
- PERL_UNUSED_VAR(re_debug_flags); GET_RE_DEBUG_FLAGS;
+#define DECLARE_AND_GET_RE_DEBUG_FLAGS \
+ volatile IV re_debug_flags = 0; PERL_UNUSED_VAR(re_debug_flags); \
+ STMT_START { \
+ SV * re_debug_flags_sv = NULL; \
+ /* get_sv() can return NULL during global destruction. */ \
+ re_debug_flags_sv = PL_curcop ? get_sv(RE_DEBUG_FLAGS, GV_ADD) : NULL; \
+ if (re_debug_flags_sv) { \
+ if (!SvIOK(re_debug_flags_sv)) /* If doesnt exist set to default */\
+ sv_setuv(re_debug_flags_sv, \
+ /* These defaults should be kept in sync with re.pm */ \
+ RE_DEBUG_COMPILE_DUMP | RE_DEBUG_EXECUTE_MASK ); \
+ re_debug_flags=SvIV(re_debug_flags_sv); \
+ } \
+ } STMT_END
+
+#define isDEBUG_WILDCARD (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_WILDCARD))
#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \
const char * const rpv = \
#else /* if not DEBUGGING */
-#define GET_RE_DEBUG_FLAGS_DECL
+#define DECLARE_AND_GET_RE_DEBUG_FLAGS
#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2)
#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m)
#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m)
#define RE_SV_DUMPLEN(ItEm)
#define RE_SV_TAIL(ItEm)
+#define isDEBUG_WILDCARD 0
#endif /* DEBUG RELATED DEFINES */
WB_BOUND
} bound_type;
-/* This unpacks the FLAGS field of ANYOFHx nodes. The value it contains
+/* This unpacks the FLAGS field of ANYOF[HR]x nodes. The value it contains
* gives the strict lower bound for the UTF-8 start byte of any code point
* matchable by the node, and a loose upper bound as well.
*