/* Be really agressive about optimising patterns with trie sequences? */
#define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1
+/* Use old style unicode mappings for perl and posix character classes
+ *
+ * NOTE: Enabling this essentially breaks character class matching against unicode
+ * strings, so that POSIX char classes match when they shouldn't, and \d matches
+ * way more than 10 characters, and sometimes a charclass and its complement either
+ * both match or neither match.
+ * NOTE: Disabling this will cause various backwards compatibility issues to rear
+ * their head, and tests to fail. However it will make the charclass behaviour
+ * consistent regardless of internal string type, and make character class inversions
+ * consistent. The tests that fail in the regex engine are basically broken tests.
+ *
+ * Personally I think 5.12 should disable this for sure. It's a bit more debatable for
+ * 5.10, so for now I'm leaving it enabled.
+ * XXX: It is now enabled for 5.11/5.12
+ *
+ * -demerphq
+ */
+#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1
+
/* Should the optimiser take positive assertions into account? */
#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0
#define ANYOF_BITMAP_SIZE 32 /* 256 b/(8 b/B) */
-#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 40 (8*5) named classes */
+#define ANYOF_CLASSBITMAP_SIZE 4 /* up to 32 (8*4) named classes */
/* also used by trie */
struct regnode_charclass {
U8 flags;
U8 type;
U16 next_off;
- U32 arg1;
+ U32 arg1; /* used as ptr in S_regclass */
char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */
};
-struct regnode_charclass_class { /* has [[:blah:]] classes */
- U8 flags; /* should have ANYOF_CLASS here */
+/* has runtime (locale) \d, \w, ..., [:posix:] classes */
+struct regnode_charclass_class {
+ U8 flags; /* ANYOF_CLASS bit must go here */
U8 type;
U16 next_off;
- U32 arg1;
+ U32 arg1; /* used as ptr in S_regclass */
char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */
char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */
};
#undef STRING
#define OP(p) ((p)->type)
+#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \
+ regnode types */
#define OPERAND(p) (((struct regnode_string *)p)->string)
#define MASK(p) ((char*)OPERAND(p))
#define STR_LEN(p) (((struct regnode_string *)p)->str_len)
#define SIZE_ONLY (RExC_emit == &PL_regdummy)
+/* Flags for node->flags of several of the node types */
+#define USE_UNI 0x01
+
/* Flags for node->flags of ANYOF */
-#define ANYOF_CLASS 0x08 /* has [[:blah:]] classes */
-#define ANYOF_INVERT 0x04
-#define ANYOF_FOLD 0x02
#define ANYOF_LOCALE 0x01
+#define ANYOF_FOLD 0x02
+#define ANYOF_INVERT 0x04
-/* Used for regstclass only */
-#define ANYOF_EOS 0x10 /* Can match an empty string too */
-
-/* There is a character or a range past 0xff */
-#define ANYOF_UNICODE 0x20
-#define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */
+/* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ... */
+#define ANYOF_CLASS 0x08
+#define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */
-/* size of node is large (includes class pointer) */
-#define ANYOF_LARGE 0x80
+/* EOS used for regstclass only */
+#define ANYOF_EOS 0x10 /* Can match an empty string too */
-/* Are there any runtime flags on in this node? */
-#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f)
+#define ANYOF_UNICODE 0x20 /* Matches >= one thing past 0xff */
+#define ANYOF_UNICODE_ALL 0x40 /* Matches 0x100 - infinity */
#define ANYOF_FLAGS_ALL 0xff
#define ANYOF_NALNUM 1
#define ANYOF_SPACE 2 /* \s */
#define ANYOF_NSPACE 3
-#define ANYOF_DIGIT 4
+#define ANYOF_DIGIT 4 /* \d */
#define ANYOF_NDIGIT 5
-#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */
+#define ANYOF_ALNUMC 6 /* [[:alnum:]] isalnum(3), utf8::IsAlnum, ALNUMC */
#define ANYOF_NALNUMC 7
#define ANYOF_ALPHA 8
#define ANYOF_NALPHA 9
#include "regnodes.h"
#endif
-/* The following have no fixed length. U8 so we can do strchr() on it. */
-#ifndef DOINIT
-EXTCONST U8 PL_varies[];
-#else
-EXTCONST U8 PL_varies[] = {
- BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL,
- WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP,
- NREF, NREFF, NREFFL,
- 0
-};
-#endif
-
-/* The following always have a length of 1. U8 we can do strchr() on it. */
-/* (Note that length 1 means "one character" under UTF8, not "one octet".) */
-#ifndef DOINIT
-EXTCONST U8 PL_simple[];
-#else
-EXTCONST U8 PL_simple[] = {
- REG_ANY, SANY, CANY,
- ANYOF,
- ALNUM, ALNUML,
- NALNUM, NALNUML,
- SPACE, SPACEL,
- NSPACE, NSPACEL,
- DIGIT, NDIGIT,
- VERTWS, NVERTWS,
- HORIZWS, NHORIZWS,
- 0
-};
-#endif
-
#ifndef PLUGGABLE_RE_EXTENSION
#ifndef DOINIT
EXTCONST regexp_engine PL_core_reg_engine;
/* .what is a character array with one character for each member of .data
* The character describes the function of the corresponding .data item:
+ * a - AV for paren_name_list under DEBUGGING
* f - start-class data for regstclass optimization
* n - Root of op tree for (?{EVAL}) item
* o - Start op for (?{EVAL}) item
#define check_offset_max substrs->data[2].max_offset
#define check_end_shift substrs->data[2].end_shift
-
+#define RX_ANCHORED_SUBSTR(rx) (((struct regexp *)SvANY(rx))->anchored_substr)
+#define RX_ANCHORED_UTF8(rx) (((struct regexp *)SvANY(rx))->anchored_utf8)
+#define RX_FLOAT_SUBSTR(rx) (((struct regexp *)SvANY(rx))->float_substr)
+#define RX_FLOAT_UTF8(rx) (((struct regexp *)SvANY(rx))->float_utf8)
/* trie related stuff */
} trans;
};
+/* info per word; indexed by wordnum */
+typedef struct {
+ U16 prev; /* previous word in acceptance chain; e.g. in
+ * /zzz|abc|ab/ after matching the chars abc, the
+ * accepted word is #2, and the previous accepted
+ * word is #3 */
+ U32 len; /* how many chars long is this word? */
+ U32 accept; /* accept state for this word */
+} reg_trie_wordinfo;
typedef struct _reg_trie_state reg_trie_state;
reg_trie_state *states; /* state data */
reg_trie_trans *trans; /* array of transition elements */
char *bitmap; /* stclass bitmap */
- U32 *wordlen; /* array of lengths of words */
U16 *jump; /* optional 1 indexed array of offsets before tail
for the node following a given word. */
- U16 *nextword; /* optional 1 indexed array to support linked list
- of duplicate wordnums */
+ reg_trie_wordinfo *wordinfo; /* array of info per word */
U16 uniquecharcount; /* unique chars in trie (width of trans table) */
U32 startstate; /* initial state - used for common prefix optimisation */
STRLEN minlen; /* minimum length of words in trie - build/opt only? */
STRLEN maxlen; /* maximum length of words in trie - build/opt only? */
+ U32 prefixlen; /* #chars in common prefix */
U32 statecount; /* Build only - number of states in the states array
(including the unused zero state) */
U32 wordcount; /* Build only */
#define RE_DEBUG_EXTRA_STATE 0x080000
#define RE_DEBUG_EXTRA_OPTIMISE 0x100000
#define RE_DEBUG_EXTRA_BUFFERS 0x400000
+#define RE_DEBUG_EXTRA_GPOS 0x800000
/* combined */
#define RE_DEBUG_EXTRA_STACK 0x280000
#define DEBUG_TRIE_r(x) DEBUG_r( \
if (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \
| RE_DEBUG_EXECUTE_TRIE )) x )
+#define DEBUG_GPOS_r(x) DEBUG_r( \
+ if (re_debug_flags & RE_DEBUG_EXTRA_GPOS) x )
/* initialization */
-/* get_sv() can return NULL during global destruction. */
+/* get_sv() can return NULL during global destruction. re_debug_flags can get
+ * clobbered by a longjmp, so must be initialized */
#define GET_RE_DEBUG_FLAGS DEBUG_r({ \
SV * re_debug_flags_sv = NULL; \
+ re_debug_flags = 0; \
re_debug_flags_sv = get_sv(RE_DEBUG_FLAGS, 1); \
if (re_debug_flags_sv) { \
if (!SvIOK(re_debug_flags_sv)) \
#ifdef DEBUGGING
-#define GET_RE_DEBUG_FLAGS_DECL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS;
+#define GET_RE_DEBUG_FLAGS_DECL VOL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS;
#define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \
const char * const rpv = \