X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/486ec47ab73770ab60bf9cfb6d398a4371463266..41e1edf564ba7bc3c29e7e77dfc506fc40668b7a:/regcomp.h diff --git a/regcomp.h b/regcomp.h index c15d681..6826ae0 100644 --- a/regcomp.h +++ b/regcomp.h @@ -18,25 +18,6 @@ typedef OP OP_4tree; /* Will be redefined later. */ /* Be really aggressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 -/* Use old style unicode mappings for perl and posix character classes - * - * NOTE: Enabling this essentially breaks character class matching against unicode - * strings, so that POSIX char classes match when they shouldn't, and \d matches - * way more than 10 characters, and sometimes a charclass and its complement either - * both match or neither match. - * NOTE: Disabling this will cause various backwards compatibility issues to rear - * their head, and tests to fail. However it will make the charclass behaviour - * consistent regardless of internal string type, and make character class inversions - * consistent. The tests that fail in the regex engine are basically broken tests. - * - * Personally I think 5.12 should disable this for sure. Its a bit more debatable for - * 5.10, so for now im leaving it enabled. - * XXX: It is now enabled for 5.11/5.12 - * - * -demerphq - */ -#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1 - /* Should the optimiser take positive assertions into account? */ #define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0 @@ -272,8 +253,9 @@ struct regnode_charclass_class { #undef STRING #define OP(p) ((p)->type) -#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ - regnode types */ +#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ + regnode types. For some, it's the \ + character set of the regnode */ #define OPERAND(p) (((struct regnode_string *)p)->string) #define MASK(p) ((char*)OPERAND(p)) #define STR_LEN(p) (((struct regnode_string *)p)->str_len) @@ -309,12 +291,33 @@ struct regnode_charclass_class { #define SIZE_ONLY (RExC_emit == &PL_regdummy) -/* Flags for node->flags of several of the node types */ -#define USE_UNI 0x01 - -/* Flags for node->flags of ANYOF */ - -#define ANYOF_LOCALE 0x01 +/* If the bitmap doesn't fully represent what this ANYOF node can match, the + * ARG is set to this special value (since 0, 1, ... are legal, but will never + * reach this high). */ +#define ANYOF_NONBITMAP_EMPTY ((U32) -1) + +/* The information used to be stored as as combination of the ANYOF_UTF8 and + * ANYOF_NONBITMAP_NON_UTF8 bits in the flags field, but was moved out of there + * to free up a bit for other uses. This tries to hide the change from + * existing code as much as possible. Now, the data structure that goes in ARG + * is not allocated unless it is needed, and that is what is used to determine + * if there is something outside the bitmap. The code now assumes that if + * that structure exists, that any UTF-8 encoded string should be tried against + * it, but a non-UTF8-encoded string will be tried only if the + * ANYOF_NONBITMAP_NON_UTF8 bit is also set. */ +#define ANYOF_NONBITMAP(node) (ARG(node) != ANYOF_NONBITMAP_EMPTY) + +/* Flags for node->flags of ANYOF. These are in short supply, so some games + * are done to share them, as described below. If necessary, the ANYOF_LOCALE + * and ANYOF_CLASS bits could be shared with a space penalty for locale nodes, + * but this isn't quite so easy, as the optimizer also uses ANYOF_CLASS. + * Once the planned change to compile all the above-latin1 code points is done, + * then the UNICODE_ALL bit can be freed up, with a small performance penalty. + * If flags need to be added that are applicable to the synthetic start class + * only, with some work, they could be put in the next-node field, or in an + * unused bit of the classflags field. */ + +#define ANYOF_LOCALE 0x01 /* /l modifier */ /* The fold is calculated and stored in the bitmap where possible at compile * time. However there are two cases where it isn't possible. These share @@ -327,28 +330,48 @@ struct regnode_charclass_class { #define ANYOF_INVERT 0x04 -/* CLASS is never set unless LOCALE is too: has runtime \d, \w, [:posix:], ... */ +/* Set if this is a struct regnode_charclass_class vs a regnode_charclass. This + * is used for runtime \d, \w, [:posix:], ..., which are used only in locale + * and the optimizer's synthetic start class. Non-locale \d, etc are resolved + * at compile-time */ #define ANYOF_CLASS 0x08 #define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */ -/* Can match something outside the bitmap that is expressible only in utf8 */ -#define ANYOF_UTF8 0x10 +/* EOS, meaning that it can match an empty string too, is used for the + * synthetic start class only. */ +#define ANYOF_EOS 0x10 + +/* ? Is this node the synthetic start class (ssc). This bit is shared with + * ANYOF_EOS, as the latter is used only for the ssc, and then not used by + * regexec.c. And, the code is structured so that if it is set, the ssc is + * not used, so it is guaranteed to be 0 for the ssc by the time regexec.c + * gets executed, and 0 for a non-ssc ANYOF node, as it only ever gets set for + * a potential ssc candidate. Thus setting it to 1 after it has been + * determined that the ssc will be used is not ambiguous */ +#define ANYOF_IS_SYNTHETIC ANYOF_EOS /* Can match something outside the bitmap that isn't in utf8 */ #define ANYOF_NONBITMAP_NON_UTF8 0x20 -/* Set if the bitmap doesn't fully represent what this node can match */ -#define ANYOF_NONBITMAP (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8) -#define ANYOF_UNICODE ANYOF_NONBITMAP /* old name, for back compat */ - /* Matches every code point 0x100 and above*/ #define ANYOF_UNICODE_ALL 0x40 -/* EOS used for regstclass only */ -#define ANYOF_EOS 0x80 /* Can match an empty string too */ +/* Match all Latin1 characters that aren't ASCII when the target string is not + * in utf8. */ +#define ANYOF_NON_UTF8_LATIN1_ALL 0x80 #define ANYOF_FLAGS_ALL 0xff +/* These are the flags that ANYOF_INVERT being set or not doesn't affect + * whether they are operative or not. e.g., the node still has LOCALE + * regardless of being inverted; whereas ANYOF_UNICODE_ALL means something + * different if inverted */ +#define INVERSION_UNAFFECTED_FLAGS (ANYOF_LOCALE \ + |ANYOF_LOC_NONBITMAP_FOLD \ + |ANYOF_CLASS \ + |ANYOF_EOS \ + |ANYOF_NONBITMAP_NON_UTF8) + /* Character classes for node->classflags of ANYOF */ /* Should be synchronized with a table in regprop() */ /* 2n should pair with 2n+1 */ @@ -416,6 +439,8 @@ struct regnode_charclass_class { #define ANYOF_CLASS_TEST(p, c) (ANYOF_CLASS_BYTE(p, c) & ANYOF_BIT(c)) #define ANYOF_CLASS_ZERO(ret) Zero(((struct regnode_charclass_class*)(ret))->classflags, ANYOF_CLASSBITMAP_SIZE, char) +#define ANYOF_CLASS_SETALL(ret) \ + memset (((struct regnode_charclass_class*)(ret))->classflags, 255, ANYOF_CLASSBITMAP_SIZE) #define ANYOF_BITMAP_ZERO(ret) Zero(((struct regnode_charclass*)(ret))->bitmap, ANYOF_BITMAP_SIZE, char) #define ANYOF_BITMAP(p) (((struct regnode_charclass*)(p))->bitmap) @@ -435,28 +460,14 @@ struct regnode_charclass_class { #define ANYOF_SKIP ((ANYOF_SIZE - 1)/sizeof(regnode)) #define ANYOF_CLASS_SKIP ((ANYOF_CLASS_SIZE - 1)/sizeof(regnode)) -/* The class bit can be set to the locale one if necessary to save bits at the - * expense of having locale ANYOF nodes always have a class bit map, and hence - * take up extra space. This allows convenient changing it as development - * proceeds on this */ -#if ANYOF_CLASS == ANYOF_LOCALE -# undef ANYOF_CLASS_ADD_SKIP -# define ANYOF_ADD_LOC_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) - - /* Quicker way to see if there are actually any tests. This is because - * currently the set of tests can be empty even when the class bitmap is - * allocated */ -# if ANYOF_CLASSBITMAP_SIZE != 4 -# error ANYOF_CLASSBITMAP_SIZE is expected to be 4 -# endif -# define ANYOF_CLASS_TEST_ANY_SET(p) /* assumes sizeof(p) = 4 */ \ - memNE (((struct regnode_charclass_class*)(p))->classflags, \ - "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE) -#else -# define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) -# undef ANYOF_ADD_LOC_SKIP -# define ANYOF_CLASS_TEST_ANY_SET(p) (ANYOF_FLAGS(p) & ANYOF_CLASS) +#if ANYOF_CLASSBITMAP_SIZE != 4 +# error ANYOF_CLASSBITMAP_SIZE is expected to be 4 #endif +#define ANYOF_CLASS_TEST_ANY_SET(p) ((ANYOF_FLAGS(p) & ANYOF_CLASS) \ + && memNE (((struct regnode_charclass_class*)(p))->classflags, \ + "\0\0\0\0", ANYOF_CLASSBITMAP_SIZE)) +/*#define ANYOF_CLASS_ADD_SKIP (ANYOF_CLASS_SKIP - ANYOF_SKIP) + * */ /* @@ -825,7 +836,8 @@ re.pm, especially to the documentation. #ifdef DEBUGGING -#define GET_RE_DEBUG_FLAGS_DECL VOL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS; +#define GET_RE_DEBUG_FLAGS_DECL VOL IV re_debug_flags \ + __attribute__unused__ = 0; GET_RE_DEBUG_FLAGS; #define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \ const char * const rpv = \