* License or the Artistic License, as specified in the README file.
*
*/
+
+#ifndef PERL_REGCOMP_H_
+#define PERL_REGCOMP_H_
+
#include "regcharclass.h"
/* Convert branch sequences to more efficient trie ops? */
/* This is the stuff that used to live in regexp.h that was truly
private to the engine itself. It now lives here. */
-
-
typedef struct regexp_internal {
int name_list_idx; /* Optional data index of an array of paren names */
union {
Used to make it easier to clone and free arbitrary
data that the regops need. Often the ARG field of
a regop is an index into this structure */
- struct reg_code_block *code_blocks;/* positions of literal (?{}) */
- int num_code_blocks; /* size of code_blocks[] */
+ struct reg_code_blocks *code_blocks;/* positions of literal (?{}) */
regnode program[1]; /* Unwarranted chumminess with compiler. */
} regexp_internal;
#define PREGf_ANCH_MBOL 0x00000400
#define PREGf_ANCH_SBOL 0x00000800
#define PREGf_ANCH_GPOS 0x00001000
+#define PREGf_RECURSE_SEEN 0x00002000
#define PREGf_ANCH \
( PREGf_ANCH_SBOL | PREGf_ANCH_GPOS | PREGf_ANCH_MBOL )
char string[1];
};
+struct regnode_lstring { /* Constructed this way to keep the string aligned. */
+ U8 flags;
+ U8 type;
+ U16 next_off;
+ U32 str_len; /* Only 18 bits allowed before would overflow 'next_off' (16 bits, counted in regnodes that are 4 bytes on most architectures) */
+ char string[1];
+};
+
/* Argument bearing node - workhorse,
arg1 is often for the data field */
struct regnode_1 {
* Cyrillic, Greek, Hebrew, Indian subcontinent, Latin, and Thai; but not Han,
* Japanese, nor Korean. (The regarglen structure in regnodes.h is a U8, and
* the trie types TRIEC and AHOCORASICKC are larger than U8 for shift values
- * below above 12.) Be sure to benchmark before changing, as larger sizes do
+ * above 12.) Be sure to benchmark before changing, as larger sizes do
* significantly slow down the test suite */
#define NUM_ANYOF_CODE_POINTS (1 << 8)
};
/* has runtime (locale) \d, \w, ..., [:posix:] classes */
-struct regnode_charclass_class {
+struct regnode_charclass_posixl {
U8 flags; /* ANYOF_MATCHES_POSIXL bit must go here */
U8 type;
U16 next_off;
Impose a limit of REG_INFTY on various pattern matching operations
to limit stack growth and to avoid "infinite" recursions.
*/
-/* The default size for REG_INFTY is I16_MAX, which is the same as
- SHORT_MAX (see perl.h). Unfortunately I16 isn't necessarily 16 bits
- (see handy.h). On the Cray C90, sizeof(short)==4 and hence I16_MAX is
- ((1<<31)-1), while on the Cray T90, sizeof(short)==8 and I16_MAX is
- ((1<<63)-1). To limit stack growth to reasonable sizes, supply a
+/* The default size for REG_INFTY is U16_MAX, which is the same as
+ USHORT_MAX (see perl.h). Unfortunately U16 isn't necessarily 16 bits
+ (see handy.h). On the Cray C90, sizeof(short)==4 and hence U16_MAX is
+ ((1<<32)-1), while on the Cray T90, sizeof(short)==8 and U16_MAX is
+ ((1<<64)-1). To limit stack growth to reasonable sizes, supply a
smaller default.
--Andy Dougherty 11 June 1998
*/
#if SHORTSIZE > 2
# ifndef REG_INFTY
-# define REG_INFTY ((1<<15)-1)
+# define REG_INFTY ((1<<16)-1)
# endif
#endif
#ifndef REG_INFTY
-# define REG_INFTY I16_MAX
+# define REG_INFTY U16_MAX
#endif
#define ARG_VALUE(arg) (arg)
#undef OP
#undef OPERAND
-#undef MASK
#undef STRING
#define OP(p) ((p)->type)
#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \
regnode types. For some, it's the \
character set of the regnode */
-#define OPERAND(p) (((struct regnode_string *)p)->string)
-#define MASK(p) ((char*)OPERAND(p))
-#define STR_LEN(p) (((struct regnode_string *)p)->str_len)
-#define STRING(p) (((struct regnode_string *)p)->string)
-#define STR_SZ(l) ((l + sizeof(regnode) - 1) / sizeof(regnode))
-#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p))+1)
+/* Length and text accessors for nodes laid out as 'struct regnode_string',
+ * i.e. any node that is not LEXACT or LEXACT_ONLY8 (asserted).  'p' is
+ * parenthesized inside the cast so that an expression argument is converted
+ * as a whole rather than just its first operand. */
+#define STR_LENs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_ONLY8) \
+ ((struct regnode_string *)(p))->str_len)
+#define STRINGs(p) (__ASSERT_(OP(p) != LEXACT && OP(p) != LEXACT_ONLY8) \
+ ((struct regnode_string *)(p))->string)
+#define OPERANDs(p) STRINGs(p)
+
+/* Long strings. Currently limited to length 18 bits, which handles a 262000
+ * byte string. The limiting factor is the 16 bit 'next_off' field, which
+ * points to the next regnode, so the furthest away it can be is 2**16. On
+ * most architectures, regnodes are 2**2 bytes long, so that yields 2**18
+ * bytes. Should a longer string be desired, we could increase it to 26 bits
+ * fairly easily, by changing this node to have longj type which causes the ARG
+ * field to be used for the link to the next regnode (although code would have
+ * to be changed to account for this), and then use a combination of the flags
+ * and next_off fields for the length. To get 34 bit length, also change the
+ * node to be an ARG2L, using the second 32 bit field for the length, and not
+ * using the flags nor next_off fields at all. One could have an llstring node
+ * and even an lllstring type. */
+#define STR_LENl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_ONLY8) \
+ (((struct regnode_lstring *)(p))->str_len)) /* 'p' parenthesized so an expression argument is cast as a whole */
+#define STRINGl(p) (__ASSERT_(OP(p) == LEXACT || OP(p) == LEXACT_ONLY8) \
+ (((struct regnode_lstring *)(p))->string))
+#define OPERANDl(p) STRINGl(p) /* Alias of STRINGl() */
+
+#define STR_LEN(p) ((OP(p) == LEXACT || OP(p) == LEXACT_ONLY8) \
+ ? STR_LENl(p) : STR_LENs(p))
+#define STRING(p) ((OP(p) == LEXACT || OP(p) == LEXACT_ONLY8) \
+ ? STRINGl(p) : STRINGs(p))
+#define OPERAND(p) STRING(p)
+
+/* The number of (smallest) regnode equivalents that a string of length l bytes
+ * occupies */
+#define STR_SZ(l) (((l) + sizeof(regnode) - 1) / sizeof(regnode))
+
+/* The number of (smallest) regnode equivalents that the EXACTISH node 'p'
+ * occupies */
+#define NODE_SZ_STR(p) (STR_SZ(STR_LEN(p)) + 1 + regarglen[(p)->type])
+
+#define setSTR_LEN(p,v) \
+ STMT_START{ \
+ if (OP(p) == LEXACT || OP(p) == LEXACT_ONLY8) \
+ ((struct regnode_lstring *)(p))->str_len = (v); \
+ else \
+ ((struct regnode_string *)(p))->str_len = (v); \
+ } STMT_END
#undef NODE_ALIGN
#undef ARG_LOC
#define NEXTOPER(p) ((p) + NODE_STEP_REGNODE)
#define PREVOPER(p) ((p) - NODE_STEP_REGNODE)
-#define FILL_ADVANCE_NODE(ptr, op) STMT_START { \
- (ptr)->type = op; (ptr)->next_off = 0; (ptr)++; } STMT_END
-#define FILL_ADVANCE_NODE_ARG(ptr, op, arg) STMT_START { \
- ARG_SET(ptr, arg); FILL_ADVANCE_NODE(ptr, op); (ptr) += 1; } STMT_END
-#define FILL_ADVANCE_NODE_2L_ARG(ptr, op, arg1, arg2) \
- STMT_START { \
- ARG_SET(ptr, arg1); \
- ARG2L_SET(ptr, arg2); \
- FILL_ADVANCE_NODE(ptr, op); \
- (ptr) += 2; \
- } STMT_END
+#define FILL_NODE(offset, op) \
+ STMT_START { \
+ OP(REGNODE_p(offset)) = op; \
+ NEXT_OFF(REGNODE_p(offset)) = 0; \
+ } STMT_END
+#define FILL_ADVANCE_NODE(offset, op) \
+ STMT_START { \
+ FILL_NODE(offset, op); \
+ (offset)++; \
+ } STMT_END
+#define FILL_ADVANCE_NODE_ARG(offset, op, arg) \
+ STMT_START { \
+ ARG_SET(REGNODE_p(offset), arg); \
+ FILL_ADVANCE_NODE(offset, op); \
+ /* This is used generically for other operations \
+ * that have a longer argument */ \
+ (offset) += regarglen[op]; \
+ } STMT_END
+#define FILL_ADVANCE_NODE_2L_ARG(offset, op, arg1, arg2) \
+ STMT_START { \
+ ARG_SET(REGNODE_p(offset), arg1); \
+ ARG2L_SET(REGNODE_p(offset), arg2); \
+ FILL_ADVANCE_NODE(offset, op); \
+ (offset) += 2; \
+ } STMT_END
#define REG_MAGIC 0234
-#define SIZE_ONLY (RExC_emit == (regnode *) & RExC_emit_dummy)
-#define PASS1 SIZE_ONLY
-#define PASS2 (! SIZE_ONLY)
-
/* An ANYOF node is basically a bitmap with the index being a code point. If
* the bit for that code point is 1, the code point matches; if 0, it doesn't
* match (complemented if inverted). There is an additional mechanism to deal
* never reach this high). */
#define ANYOF_ONLY_HAS_BITMAP ((U32) -1)
-/* When the bimap isn't completely sufficient for handling the ANYOF node,
+/* When the bitmap isn't completely sufficient for handling the ANYOF node,
* flags (in node->flags of the ANYOF node) get set to indicate this. These
* are perennially in short supply. Beyond several cases where warnings need
* to be raised under certain circumstances, currently, there are six cases
*
* 1) The bitmap has a compiled-in very finite size. So something else needs
* to be used to specify if a code point that is too large for the bitmap
- * actually matches. The mechanism currently is a swash or inversion
+ * actually matches. The mechanism currently is an inversion
* list. ANYOF_ONLY_HAS_BITMAP, described above, being TRUE indicates
* there are no matches of too-large code points. But if it is FALSE,
* then almost certainly there are matches too large for the bitmap. (The
* 2) A subset of item 1) is if all possible code points outside the bitmap
* match. This is a common occurrence when the class is complemented,
* like /[^ij]/. Therefore a bit is reserved to indicate this,
- * ANYOF_MATCHES_ALL_ABOVE_BITMAP. If it became necessary, this bit could
- * be replaced by using the normal swash mechanism, but with a performance
- * penalty.
+ * rather than having a more expensive inversion list created,
+ * ANYOF_MATCHES_ALL_ABOVE_BITMAP.
* 3) Under /d rules, it can happen that code points that are in the upper
* latin1 range (\x80-\xFF or their equivalents on EBCDIC platforms) match
* only if the runtime target string being matched against is UTF-8. For
* example /[\w[:punct:]]/d. This happens only for posix classes (with a
- * couple of exceptions, like \d), and all such ones also have
- * above-bitmap matches. Thus, 3) implies 1) as well. Note that /d rules
- * are no longer encouraged; 'use 5.14' or higher deselects them. But a
- * flag is required so that they can be properly handled. But it can be a
- * shared flag: see 5) below.
- * 4) Also under /d rules, something like /[\Wfoo] will match everything in
+ * couple of exceptions, like \d where it doesn't happen), and all such
+ * ones also have above-bitmap matches. Thus, 3) implies 1) as well.
+ * Note that /d rules are no longer encouraged; 'use 5.14' or higher
+ * deselects them. But a flag is required so that they can be properly
+ * handled. But it can be a shared flag: see 5) below.
+ * 4) Also under /d rules, something like /[\Wfoo]/ will match everything in
* the \x80-\xFF range, unless the string being matched against is UTF-8.
- * A swash could be created for this case, but this is relatively common,
- * and it turns out that it's all or nothing: if any one of these code
- * points matches, they all do. Hence a single bit suffices. We use a
- * shared bit that doesn't take up space by itself:
- * ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER.
- * This also implies 1), with one exception: [:^cntrl:].
+ * An inversion list could be created for this case, but this is
+ * relatively common, and it turns out that it's all or nothing: if any
+ * one of these code points matches, they all do. Hence a single bit
+ * suffices. We use a shared flag that doesn't take up space by itself:
+ * ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER. This
+ * also implies 1), with one exception: [:^cntrl:].
* 5) A user-defined \p{} property may not have been defined by the time the
* regex is compiled. In this case, we don't know until runtime what it
* will match, so we have to assume it could match anything, including
* is a better way to accomplish what this feature does. This case also
* implies 1).
* ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
- * is the shared bit.
+ * is the shared flag.
* 6) /[foo]/il may have folds that are only valid if the runtime locale is a
* UTF-8 one. These are quite rare, so it would be good to avoid the
* expense of looking for them. But /l matching is slow anyway, and we've
- * traditionally not worried to much about its performance. And this
- * condition requires the ANYOF_LOC_FOLD flag to be set, so testing for
+ * traditionally not worried too much about its performance. And this
+ * condition requires the ANYOFL_FOLD flag to be set, so testing for
* that flag would be sufficient to rule out most cases of this. So it is
- * unclear if this should have a flag or not. But, one is currently
- * allocated for this purpose, ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES (and the
- * text below indicates how to share it, should another bit be needed).
+ * unclear if this should have a flag or not. But, this flag can be
+ * shared with another, so it doesn't occupy extra space.
+ *
+ * At the moment, there is one spare bit, but this could be increased by
+ * various tricks:
*
- * At the moment, there are no spare bits, but this could be changed by various
- * tricks. Notice that item 6) is not independent of the ANYOF_LOC_FOLD flag
- * below. Also, the ANYOF_LOC_REQ_UTF8 flag is set only if both these aren't.
- * We can therefore use a 2-bit field to represent these 3 flags, as follows:
- * 00 => ANYOF_LOC_REQ_UTF8
- * 01 => no folding
- * 10 => ANYOF_LOC_FOLD alone
- * 11 => ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES
+ * If just one more bit is needed, as of this writing it seems to khw that the
+ * best choice would be to make ANYOF_MATCHES_ALL_ABOVE_BITMAP not a flag, but
+ * something like
*
- * Beyond that, note that the information may be conveyed by creating new
- * regnode types. This is not the best solution, as shown later in this
- * paragraph, but it is something that is feasible. We could have a regnode
- * for ANYOF_INVERT, for example. A complication of this is that the regexec.c
- * REGINCLASS macro assumes that it can just use the bitmap if no flags are
- * set. This would have to be changed to add extra tests for the node type, or
- * a special bit reserved that means unspecified special handling, and then the
- * node-type would be used internally to sort that out. So we could gain a bit
- * by having an ANYOF_SPECIAL bit, and a node type for INVERT, and another for
- * POSIXL, and still another for INVERT_POSIXL. This example illustrates one
- * problem with this, a combinatorial explosion of node types. The one node
- * type khw can think of that doesn't have this explosion issue is
- * ANYOF_LOC_REQ_UTF8, but you'd do this only if you haven't done the 2-bit
- * field trick above. This bit is a natural candidate for being a separate
- * node type because it is a specialization of the current ANYOFL, and because
- * no other ANYOFL-only bits are set when it is; also most of its uses are
- * actually outside the reginclass() function, so this could be done with no
- * performance penalty. But again, the 2-bit field trick combines this bit so
- * it doesn't take up space anyway. Another issue when turning a bit into a
- * node type, is that a SSC may use that bit -- not just a regular ANYOF[DL]?.
- * In the case of ANYOF_LOC_REQ_UTF8, the only likely problem is accurately
- * settting the SSC node-type to the new one, which would likely involve
- * S_ssc_or and S_ssc_and, and not how the SSC currently gets set to ANYOFL.
+ * #define ANYOF_MATCHES_ALL_ABOVE_BITMAP ((U32) -2)
*
- * Another possibility is to instead rename the ANYOF_POSIXL bit to be
- * ANYOFL_LARGE, to mean that the ANYOF node has an extra 32 bits beyond what a
- * regular one does. That's what it effectively means now, with the extra
- * space all for the POSIX class bits. But those classes actually only occupy
- * 30 bits, so the 2-bit field or 2 of the locale bits could be moved to that
- * extra space. The downside of this is that ANYOFL nodes with whichever of
- * the bits get moved would have to have the extra space always allocated.
+ * and access it through the ARG like ANYOF_ONLY_HAS_BITMAP is. This flag is
+ * used by all ANYOF node types, and it could be used to avoid calling the
+ * handler function, as the macro REGINCLASS in regexec.c does now for other
+ * cases.
*
- * One could completely remove ANYOFL_LARGE and make all ANYOFL nodes large.
- * The 30 bits in the extra word would indicate if a posix class should be
- * looked up or not. There isn't an SSC problem as all SSCs are this large
- * anyway, and the SSC could be set to this node type. REGINCLASS would have
- * to be modified so that if the node type were this, it would call
- * reginclass(), as the flag bit that indicates to do this now would be gone.
- * If the 2-bit field is used and moved to the larger structure, this would
- * free up a total of 4 bits. If this were done, we could create an
- * ANYOF_INVERT node-type without a combinatorial explosion, getting us to 5
- * bits. And, keep in mind that ANYOF_MATCHES_ALL_ABOVE_BITMAP is solely for
- * performance, so could be removed. The other performance-related bits are
- * shareable with bits that are required.
+ * Another possibility is based on the fact that ANYOF_MATCHES_POSIXL is
+ * redundant with the node type ANYOFPOSIXL. That flag could be removed, but
+ * at the expense of extra code in regexec.c. The flag has been retained
+ * because it allows us to see if we need to call reginclass, or just use the
+ * bitmap in one test.
*
- * Several flags are not used in synthetic start class (SSC) nodes, so could be
+ * If this is done, an extension would be to make all ANYOFL nodes contain the
+ * extra 32 bits that ANYOFPOSIXL ones do. The posix flags only occupy 30
+ * bits, so the ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD flags
+ * and ANYOFL_FOLD could be moved to that extra space, but it would mean extra
+ * instructions, as there are currently places in the code that assume those
+ * two bits are zero.
+ *
+ * All told, 5 bits could be available for other uses if all of the above were
+ * done.
+ *
+ * Some flags are not used in synthetic start class (SSC) nodes, so could be
* shared should new flags be needed for SSCs, like SSC_MATCHES_EMPTY_STRING
* now. */
* time. However under locale, the actual folding varies depending on
* what the locale is at the time of execution, so it has to be deferred until
* then. Only set under /l; never in an SSC */
-#define ANYOF_LOC_FOLD 0x04
-
-/* If set, ANYOF_LOC_FOLD is also set, and there are potential matches that
- * will be valid only if the locale is a UTF-8 one. */
-#define ANYOF_ONLY_UTF8_LOC_FOLD_MATCHES 0x08
-
-/* If set, means to warn if runtime locale isn't a UTF-8 one. Only under /l.
- * If set, none of INVERT, LOC_FOLD, POSIXL,
- * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP can
- * be set. Can be in an SSC */
-#define ANYOF_LOC_REQ_UTF8 0x10
+#define ANYOFL_FOLD 0x04
+
+/* Shared bit set only with ANYOFL and SSC nodes:
+ * If ANYOFL_FOLD is set, this flag indicates there are potential matches
+ * valid only if the locale is a UTF-8 one.
+ * If ANYOFL_FOLD is NOT set, this flag means to warn if the runtime locale
+ * isn't a UTF-8 one (and the generated node assumes a UTF-8 locale).
+ * None of INVERT, POSIXL,
+ * ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP
+ * can be set. */
+#define ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD 0x08
+
+/* Convenience macros for teasing apart the meanings when reading the above bit
+ * */
+/* True iff both ANYOFL_FOLD and the shared bit are set in 'flags'.  'flags'
+ * is parenthesized so that an expression argument (e.g. 'a | b') is masked as
+ * a whole. */
+#define ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) \
+ (((flags) & ( ANYOFL_FOLD /* Both bits are set */ \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \
+ == ( ANYOFL_FOLD \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD))
+
+/* True iff the shared bit is set in 'flags' and ANYOFL_FOLD is clear. */
+#define ANYOFL_UTF8_LOCALE_REQD(flags) \
+ (((flags) & ( ANYOFL_FOLD /* Only REQD bit is set */ \
+ |ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)) \
+ == ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD)
+
+/* Spare: Be sure to change ANYOF_FLAGS_ALL if this gets used 0x10 */
/* If set, the node matches every code point NUM_ANYOF_CODE_POINTS and above.
* Can be in an SSC */
/* Shared bit:
* Under /d it means the ANYOFD node matches more things if the target
* string is encoded in UTF-8; any such things will be non-ASCII,
- * characters that are < 256, and can be accessed via the swash.
+ * characters that are < 256, and can be accessed via the inversion
+ * list.
* When not under /d, it means the ANYOF node contains a user-defined
* property that wasn't yet defined at the time the regex was compiled,
- * and so must be looked up at runtime, by creating a swash
+ * and so must be looked up at runtime, by creating an inversion list.
* (These uses are mutually exclusive because a user-defined property is
* specified by \p{}, and \p{} implies /u which deselects /d). The long macro
* name is to make sure that you are cautioned about its shared nature. Only
* are cautioned about its shared nature */
#define ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER 0x80
-#define ANYOF_FLAGS_ALL (0xff)
+#define ANYOF_FLAGS_ALL (0xff & ~0x10)
-#define ANYOF_LOCALE_FLAGS (ANYOF_LOC_FOLD | ANYOF_MATCHES_POSIXL)
+#define ANYOF_LOCALE_FLAGS (ANYOFL_FOLD | ANYOF_MATCHES_POSIXL)
/* These are the flags that apply to both regular ANYOF nodes and synthetic
* start class nodes during construction of the SSC. During finalization of
* the SSC, other of the flags may get added to it */
-#define ANYOF_COMMON_FLAGS ANYOF_LOC_REQ_UTF8
+#define ANYOF_COMMON_FLAGS 0
/* Character classes for node->classflags of ANYOF */
/* Should be synchronized with a table in regprop() */
#define ANYOF_BIT(c) (1U << ((c) & 7))
-#define ANYOF_POSIXL_SET(p, c) (((regnode_charclass_posixl*) (p))->classflags |= (1U << (c)))
-#define ANYOF_CLASS_SET(p, c) ANYOF_POSIXL_SET((p), (c))
+#define POSIXL_SET(field, c) ((field) |= (1U << (c)))
+#define ANYOF_POSIXL_SET(p, c) POSIXL_SET(((regnode_charclass_posixl*) (p))->classflags, (c))
+
+#define POSIXL_CLEAR(field, c) ((field) &= ~ (1U <<(c)))
+#define ANYOF_POSIXL_CLEAR(p, c) POSIXL_CLEAR(((regnode_charclass_posixl*) (p))->classflags, (c))
-#define ANYOF_POSIXL_CLEAR(p, c) (((regnode_charclass_posixl*) (p))->classflags &= ~ (1U <<(c)))
-#define ANYOF_CLASS_CLEAR(p, c) ANYOF_POSIXL_CLEAR((p), (c))
+#define POSIXL_TEST(field, c) ((field) & (1U << (c)))
+#define ANYOF_POSIXL_TEST(p, c) POSIXL_TEST(((regnode_charclass_posixl*) (p))->classflags, (c))
-#define ANYOF_POSIXL_TEST(p, c) (((regnode_charclass_posixl*) (p))->classflags & (1U << (c)))
-#define ANYOF_CLASS_TEST(p, c) ANYOF_POSIXL_TEST((p), (c))
+#define POSIXL_ZERO(field) STMT_START { (field) = 0; } STMT_END
+#define ANYOF_POSIXL_ZERO(ret) POSIXL_ZERO(((regnode_charclass_posixl*) (ret))->classflags)
-#define ANYOF_POSIXL_ZERO(ret) STMT_START { ((regnode_charclass_posixl*) (ret))->classflags = 0; } STMT_END
-#define ANYOF_CLASS_ZERO(ret) ANYOF_POSIXL_ZERO(ret)
+#define ANYOF_POSIXL_SET_TO_BITMAP(p, bits) \
+ STMT_START { \
+ ((regnode_charclass_posixl*) (p))->classflags = (bits); \
+ } STMT_END
/* Shifts a bit to get, eg. 0x4000_0000, then subtracts 1 to get 0x3FFF_FFFF */
#define ANYOF_POSIXL_SETALL(ret) STMT_START { ((regnode_charclass_posixl*) (ret))->classflags = ((1U << ((ANYOF_POSIXL_MAX) - 1))) - 1; } STMT_END
#define ANYOF_POSIXL_AND(source, dest) STMT_START { (dest)->classflags &= (source)->classflags ; } STMT_END
-#define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
-#define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap)
-#define ANYOF_BITMAP_BYTE(p, c) (ANYOF_BITMAP(p)[(((U8)(c)) >> 3) & 31])
+#define ANYOF_BITMAP_ZERO(ret) Zero(((regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char)
+#define ANYOF_BITMAP(p) ((regnode_charclass*)(p))->bitmap
+#define ANYOF_BITMAP_BYTE(p, c) BITMAP_BYTE(ANYOF_BITMAP(p), c)
#define ANYOF_BITMAP_SET(p, c) (ANYOF_BITMAP_BYTE(p, c) |= ANYOF_BIT(c))
#define ANYOF_BITMAP_CLEAR(p,c) (ANYOF_BITMAP_BYTE(p, c) &= ~ANYOF_BIT(c))
#define ANYOF_BITMAP_TEST(p, c) cBOOL(ANYOF_BITMAP_BYTE(p, c) & ANYOF_BIT(c))
#define ANYOF_BITMAP_CLEARALL(p) \
Zero (ANYOF_BITMAP(p), ANYOF_BITMAP_SIZE)
-#define ANYOF_SKIP EXTRA_SIZE(struct regnode_charclass)
-#define ANYOF_POSIXL_SKIP EXTRA_SIZE(regnode_charclass_posixl)
-
/*
* Utility definitions.
*/
# define UCHARAT(p) ((int)*(p)&CHARMASK)
#endif
+/* Number of regnode equivalents that 'guy' occupies beyond the size of the
+ * smallest regnode. */
#define EXTRA_SIZE(guy) ((sizeof(guy)-1)/sizeof(struct regnode))
#define REG_ZERO_LEN_SEEN 0x00000001
#define REG_CUTGROUP_SEEN 0x00000100
#define REG_RUN_ON_COMMENT_SEEN 0x00000200
#define REG_UNFOLDED_MULTI_SEEN 0x00000400
-#define REG_GOSTART_SEEN 0x00000800
+/* spare */
#define REG_UNBOUNDED_QUANTIFIER_SEEN 0x00001000
* l - start op for literal (?{EVAL}) item
* L - start op for literal (?{EVAL}) item, with separate CV (qr//)
* r - pointer to an embedded code-containing qr, e.g. /ab$qr/
- * s - swash for Unicode-style character class, and the multicharacter
- * strings resulting from casefolding the single-character entries
- * in the character class
+ * s - inversion list for Unicode-style character class, and the
+ * multicharacter strings resulting from casefolding the single-character
+ * entries in the character class
* t - trie struct
* u - trie struct's widecharmap (a HV, so can't share, must dup)
* also used for revcharmap and words under DEBUGGING
three different sets... */
#define TRIE_BITMAP(p) (((reg_trie_data *)(p))->bitmap)
-#define TRIE_BITMAP_BYTE(p, c) (TRIE_BITMAP(p)[(((U8)(c)) >> 3) & 31])
+#define TRIE_BITMAP_BYTE(p, c) BITMAP_BYTE(TRIE_BITMAP(p), c)
#define TRIE_BITMAP_SET(p, c) (TRIE_BITMAP_BYTE(p, c) |= ANYOF_BIT((U8)c))
#define TRIE_BITMAP_CLEAR(p,c) (TRIE_BITMAP_BYTE(p, c) &= ~ANYOF_BIT((U8)c))
#define TRIE_BITMAP_TEST(p, c) (TRIE_BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
#define IS_TRIE_AC(op) ((op)>=AHOCORASICK)
-#define BITMAP_BYTE(p, c) (((U8*)p)[(((U8)(c)) >> 3) & 31])
+/* The byte of bitmap 'p' that holds the bit for 'c' (index is 'c' >> 3).  No
+ * masking of the index is done here.  'p' is parenthesized so that an
+ * expression argument is cast to U8* as a whole. */
+#define BITMAP_BYTE(p, c) (( (U8*) (p)) [ ( ( (UV) (c)) >> 3) ] )
#define BITMAP_TEST(p, c) (BITMAP_BYTE(p, c) & ANYOF_BIT((U8)c))
/* these defines assume uniquecharcount is the correct variable, and state may be evaluated twice */
#define RE_TRIE_MAXBUF_NAME "\022E_TRIE_MAXBUF"
#define RE_DEBUG_FLAGS "\022E_DEBUG_FLAGS"
+#define RE_COMPILE_RECURSION_INIT 1000
+#define RE_COMPILE_RECURSION_LIMIT "\022E_COMPILE_RECURSION_LIMIT"
+
/*
RE_DEBUG_FLAGS is used to control what debug output is emitted
#define RE_DEBUG_EXECUTE_TRIE 0x000400
/* Extra */
-#define RE_DEBUG_EXTRA_MASK 0xFF0000
-#define RE_DEBUG_EXTRA_TRIE 0x010000
-#define RE_DEBUG_EXTRA_OFFSETS 0x020000
-#define RE_DEBUG_EXTRA_OFFDEBUG 0x040000
-#define RE_DEBUG_EXTRA_STATE 0x080000
-#define RE_DEBUG_EXTRA_OPTIMISE 0x100000
-#define RE_DEBUG_EXTRA_BUFFERS 0x400000
-#define RE_DEBUG_EXTRA_GPOS 0x800000
+#define RE_DEBUG_EXTRA_MASK 0x1FF0000
+#define RE_DEBUG_EXTRA_TRIE 0x0010000
+#define RE_DEBUG_EXTRA_OFFSETS 0x0020000
+#define RE_DEBUG_EXTRA_OFFDEBUG 0x0040000
+#define RE_DEBUG_EXTRA_STATE 0x0080000
+#define RE_DEBUG_EXTRA_OPTIMISE 0x0100000
+#define RE_DEBUG_EXTRA_BUFFERS 0x0400000
+#define RE_DEBUG_EXTRA_GPOS 0x0800000
+#define RE_DEBUG_EXTRA_DUMP_PRE_OPTIMIZE 0x1000000
/* combined */
-#define RE_DEBUG_EXTRA_STACK 0x280000
+#define RE_DEBUG_EXTRA_STACK 0x0280000
#define RE_DEBUG_FLAG(x) (re_debug_flags & x)
/* Compile */
#define DEBUG_COMPILE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_MASK) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_MASK)) x )
#define DEBUG_PARSE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_PARSE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_PARSE)) x )
#define DEBUG_OPTIMISE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_OPTIMISE) x )
-#define DEBUG_PARSE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_PARSE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_OPTIMISE)) x )
#define DEBUG_DUMP_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_DUMP) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_DUMP)) x )
#define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_TRIE)) x )
#define DEBUG_FLAGS_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_FLAGS) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_FLAGS)) x )
#define DEBUG_TEST_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_COMPILE_TEST) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_COMPILE_TEST)) x )
/* Execute */
#define DEBUG_EXECUTE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXECUTE_MASK) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_MASK)) x )
#define DEBUG_INTUIT_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXECUTE_INTUIT) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_INTUIT)) x )
#define DEBUG_MATCH_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXECUTE_MATCH) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_MATCH)) x )
#define DEBUG_TRIE_EXECUTE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXECUTE_TRIE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXECUTE_TRIE)) x )
/* Extra */
#define DEBUG_EXTRA_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_MASK) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_MASK)) x )
#define DEBUG_OFFSETS_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_OFFSETS)) x )
#define DEBUG_STATE_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_STATE)) x )
#define DEBUG_STACK_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_STACK)) x )
#define DEBUG_BUFFERS_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS)) x )
#define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \
- if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \
- (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x )
+ if (DEBUG_v_TEST || ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \
+ (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)))) x )
#define MJD_OFFSET_DEBUG(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_OFFDEBUG) \
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_OFFDEBUG)) \
Perl_warn_nocontext x )
#define DEBUG_TRIE_COMPILE_MORE_r(x) DEBUG_TRIE_COMPILE_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_TRIE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_TRIE)) x )
#define DEBUG_TRIE_EXECUTE_MORE_r(x) DEBUG_TRIE_EXECUTE_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_TRIE) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_TRIE)) x )
#define DEBUG_TRIE_r(x) DEBUG_r( \
- if (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \
- | RE_DEBUG_EXECUTE_TRIE )) x )
+ if (DEBUG_v_TEST || (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \
+ | RE_DEBUG_EXECUTE_TRIE ))) x )
#define DEBUG_GPOS_r(x) DEBUG_r( \
- if (re_debug_flags & RE_DEBUG_EXTRA_GPOS) x )
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_GPOS)) x )
+
+#define DEBUG_DUMP_PRE_OPTIMIZE_r(x) DEBUG_r( \
+ if (DEBUG_v_TEST || (re_debug_flags & RE_DEBUG_EXTRA_DUMP_PRE_OPTIMIZE)) x )
/* initialization */
/* get_sv() can return NULL during global destruction. */
#define GET_RE_DEBUG_FLAGS DEBUG_r({ \
SV * re_debug_flags_sv = NULL; \
- re_debug_flags_sv = get_sv(RE_DEBUG_FLAGS, 1); \
+ re_debug_flags_sv = PL_curcop ? get_sv(RE_DEBUG_FLAGS, GV_ADD) : NULL; \
if (re_debug_flags_sv) { \
if (!SvIOK(re_debug_flags_sv)) \
sv_setuv(re_debug_flags_sv, RE_DEBUG_COMPILE_DUMP | RE_DEBUG_EXECUTE_MASK ); \
#ifdef DEBUGGING
-#define GET_RE_DEBUG_FLAGS_DECL VOL IV re_debug_flags = 0; \
+#define GET_RE_DEBUG_FLAGS_DECL volatile IV re_debug_flags = 0; \
PERL_UNUSED_VAR(re_debug_flags); GET_RE_DEBUG_FLAGS;
-#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \
- const char * const rpv = \
- pv_pretty((dsv), (pv), (l), (m), \
- PL_colors[(c1)],PL_colors[(c2)], \
+#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \
+ const char * const rpv = \
+ pv_pretty((dsv), (pv), (l), (m), \
+ PL_colors[(c1)],PL_colors[(c2)], \
PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \
const int rlen = SvCUR(dsv)
-#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \
- const char * const rpv = \
- pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \
- PL_colors[(c1)],PL_colors[(c2)], \
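+/* This is currently unused in the core. NOTE(review): the expansion refers to
+ * 'c1' and 'c2', which are not parameters of this macro, so they would have to
+ * be in scope wherever it is used. */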
+#define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \
+ const char * const rpv = \
+ pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \
+ PL_colors[(c1)],PL_colors[(c2)], \
PERL_PV_ESCAPE_RE|PERL_PV_ESCAPE_NONASCII |((isuni) ? PERL_PV_ESCAPE_UNI : 0) )
-#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) \
- const char * const rpv = \
- pv_pretty((dsv), (pv), (l), (m), \
- PL_colors[0], PL_colors[1], \
+#define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) \
+ const char * const rpv = \
+ pv_pretty((dsv), (pv), (l), (m), \
+ PL_colors[0], PL_colors[1], \
( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_ESCAPE_NONASCII | PERL_PV_PRETTY_ELLIPSES | \
((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \
)
#endif /* DEBUG RELATED DEFINES */
+#define FIRST_NON_ASCII_DECIMAL_DIGIT 0x660 /* ARABIC_INDIC_DIGIT_ZERO */
+
typedef enum {
TRADITIONAL_BOUND = _CC_WORDCHAR,
GCB_BOUND,
+ LB_BOUND,
SB_BOUND,
WB_BOUND
} bound_type;
+/* This unpacks the FLAGS field of ANYOFHx nodes. The value it contains
+ * gives the strict lower bound for the UTF-8 start byte of any code point
+ * matchable by the node, and a loose upper bound as well.
+ *
+ * The low bound is stored in the upper 6 bits, plus 0xC0.
+ * The loose upper bound is determined from the lowest 2 bits and the low bound
+ * (called x) as follows:
+ *
+ * 11 The upper limit of the range can be as much as x + (EF - x) / 8
+ * 10 The upper limit of the range can be as much as x + (EF - x) / 4
+ * 01 The upper limit of the range can be as much as x + (EF - x) / 2
+ * 00 The upper limit of the range can be as much as EF
+ * (On EBCDIC platforms, F4 is used in place of EF.)
+ *
+ * For motivation of this design, see commit message in
+ * 3146c00a633e9cbed741e10146662fbcedfdb8d3 */
+#ifdef EBCDIC
+# define MAX_ANYOF_HRx_BYTE 0xF4
+#else
+# define MAX_ANYOF_HRx_BYTE 0xEF
+#endif
+#define LOWEST_ANYOF_HRx_BYTE(b) (((b) >> 2) + 0xC0)
+#define HIGHEST_ANYOF_HRx_BYTE(b) \
+ (LOWEST_ANYOF_HRx_BYTE(b) \
+ + ((MAX_ANYOF_HRx_BYTE - LOWEST_ANYOF_HRx_BYTE(b)) >> ((b) & 3)))
+
+#endif /* PERL_REGCOMP_H_ */
+
/*
* ex: set ts=8 sts=4 sw=4 et:
*/