* but allows patterns to get big without disasters.
*
* [The "next" pointer is always aligned on an even
- * boundary, and reads the offset directly as a short. Also, there is no
- * special test to reverse the sign of BACK pointers since the offset is
- * stored negative.]
+ * boundary, and reads the offset directly as a short.]
*/
/* This is the stuff that used to live in regexp.h that was truly
#define PREGf_GPOS_SEEN 0x00000100
#define PREGf_GPOS_FLOAT 0x00000200
-#define PREGf_ANCH_BOL 0x00000400
-#define PREGf_ANCH_MBOL 0x00000800
-#define PREGf_ANCH_SBOL 0x00001000
-#define PREGf_ANCH_GPOS 0x00002000
+#define PREGf_ANCH_MBOL 0x00000400
+#define PREGf_ANCH_SBOL 0x00000800
+#define PREGf_ANCH_GPOS 0x00001000
-#define PREGf_ANCH (PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | \
- PREGf_ANCH_MBOL | PREGf_ANCH_BOL )
+#define PREGf_ANCH \
+ ( PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | PREGf_ANCH_MBOL )
/* this is where the old regcomp.h started */
U16 arg2;
};
-#define NUM_ANYOF_CODE_POINTS 256
+/* This gives the number of code points that can be in the bitmap of an ANYOF
+ * node. The shift number must currently be one of: 8..12. It can't be less
+ * than 8 (256) because some code relies on it being at least that. Above 12
+ * (4096), and you start running into warnings that some data structure widths
+ * have been exceeded, though the test suite as of this writing still passes
+ * for up through 16, which is as high as anyone would ever want to go,
+ * encompassing all of the Unicode BMP, and thus including all the economically
+ * important world scripts. At 12 most of them are, including Arabic,
+ * Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
+ * Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
+ * the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
+ * above 12.) Be sure to benchmark before changing, as larger sizes do
+ * significantly slow down the test suite */
+#define NUM_ANYOF_CODE_POINTS (1 << 8)
#define ANYOF_BITMAP_SIZE (NUM_ANYOF_CODE_POINTS / 8) /* 8 bits/Byte */
U8 flags;
U8 type;
U16 next_off;
- U32 arg1;
+ U32 arg1; /* set by set_ANYOF_arg() */
char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */
};
U8 type;
U16 next_off;
U32 arg1;
- char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
+ char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */
U32 classflags; /* and run-time */
};
* extra SV*, used only during its construction and which is not used by
* regexec.c. Note that the 'next_off' field is unused, as the SSC stands
* alone, so there is never a next node. Also, there is no alignment issue,
- * becase these are declared or allocated as a complete unit so the compiler
+ * because these are declared or allocated as a complete unit so the compiler
* takes care of alignment. This is unlike the other regnodes which are
* allocated in terms of multiples of a single-argument regnode. SSC nodes can
* have a pointer field because there is no alignment issue, and because it is
U8 type;
U16 next_off;
U32 arg1;
- char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
- U32 classflags; /* and run-time */
+ char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time ... */
+ U32 classflags; /* ... and run-time */
/* Auxiliary, only used during construction; NULL afterwards: list of code
* points matched */
#define NEXT_OFF(p) ((p)->next_off)
#define NODE_ALIGN(node)
-#define NODE_ALIGN_FILL(node) ((node)->flags = 0xde) /* deadbeef */
+/* the following define was set to 0xde in 075abff3
+ * as part of some linting logic. I have set it to 0
+ * as otherwise in every place where we /might/ set flags
+ * we have to set it to 0 explicitly, which duplicates
+ * assignments and IMO adds an unacceptable level of
+ * surprise to working in the regex engine. If this
+ * is changed from 0 then at the very least make sure
+ * that SBOL for /^/ sets the flags to 0 explicitly.
+ * -- Yves */
+#define NODE_ALIGN_FILL(node) ((node)->flags = 0)
#define SIZE_ALIGN NODE_ALIGN
(ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
+#define FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2) \
+ STMT_START { \
+ ARG_SET(ptr, arg1); \
+ ARG2L_SET(ptr, arg2); \
+ FILL_ADVANCE_NODE(ptr, op); \
+ (ptr) += 2; \
+ } STMT_END
#define REG_MAGIC 0234
* probably better than that commit anyway. But it could be reinstated if we
* need a bit. The LOC flags are only for /l nodes; the reverted commit was
* only for /d, so there are no combinatorial issues. The LOC flag to use is
- * probably the POSIXL one.
+ * probably the POSIXL one. Now that there is an ANYOFL (locale) node, another
+ * option would be to make all of those include the POSIXL data structure,
+ * which would get rid of needing a separate POSIXL flag. But it would
+ * increase the size of all such nodes, so it's probably not as attractive as
+ * having an ANYOF_POSIXL node type. But if we did do it, note that not all 32
+ * bits of that extra space are used, one bit of that could be set aside for
+ * the LOC_FOLD flag, yielding yet another bit. This would require extra code
+ * for masking, so again not the most attractive solution.
+ *
* Several flags are not used in synthetic start class (SSC) nodes, so could be
* shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING
* now. */
#define ANYOF_NLOWER ((ANYOF_LOWER) + 1)
#define ANYOF_PRINT ((_CC_PRINT) * 2)
#define ANYOF_NPRINT ((ANYOF_PRINT) + 1)
-#define ANYOF_PSXSPC ((_CC_PSXSPC) * 2) /* POSIX space: \s plus the vertical tab */
-#define ANYOF_NPSXSPC ((ANYOF_PSXSPC) + 1)
#define ANYOF_PUNCT ((_CC_PUNCT) * 2)
#define ANYOF_NPUNCT ((ANYOF_PUNCT) + 1)
#define ANYOF_SPACE ((_CC_SPACE) * 2) /* \s */
/* Utility macros for the bitmap and classes of ANYOF */
-#define ANYOF_SIZE (sizeof(struct regnode_charclass))
-#define ANYOF_POSIXL_SIZE (sizeof(regnode_charclass_posixl))
-#define ANYOF_CLASS_SIZE ANYOF_POSIXL_SIZE
-
#define ANYOF_FLAGS(p) ((p)->flags)
#define ANYOF_BIT(c) (1U << ((c) & 7))
#define ANYOF_BITMAP_CLEARALL(p) \
Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE)
-#define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode))
-#define ANYOF_POSIXL_SKIP ((ANYOF_POSIXL_SIZE - 1)/sizeof(regnode))
-#define ANYOF_CLASS_SKIP ANYOF_POSIXL_SKIP
+#define ANYOF_SKIP EXTRA_SIZE(struct regnode_charclass)
+#define ANYOF_POSIXL_SKIP EXTRA_SIZE(regnode_charclass_posixl)
/*
* Utility definitions.
#define RE_DEBUG_COMPILE_TRIE 0x000004
#define RE_DEBUG_COMPILE_DUMP 0x000008
#define RE_DEBUG_COMPILE_FLAGS 0x000010
+#define RE_DEBUG_COMPILE_TEST 0x000020
/* Execute */
#define RE_DEBUG_EXECUTE_MASK 0x00FF00
if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x )
#define DEBUG_FLAGS_r(x) DEBUG_r( \
if (re_debug_flags & RE_DEBUG_COMPILE_FLAGS) x )
+#define DEBUG_TEST_r(x) DEBUG_r( \
+ if (re_debug_flags & RE_DEBUG_COMPILE_TEST) x )
/* Execute */
#define DEBUG_EXECUTE_r(x) DEBUG_r( \
if (re_debug_flags & RE_DEBUG_EXECUTE_MASK) x )
#endif /* DEBUG RELATED DEFINES */
+typedef enum {
+ TRADITIONAL_BOUND = _CC_WORDCHAR,
+ GCB_BOUND,
+ SB_BOUND,
+ WB_BOUND
+} bound_type;
+
/*
* Local variables:
* c-indentation-style: bsd