X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1f1031fe96c14865e4f60fdd3a6a6ce073d190c1..4b844e06c99d9c2e251dde1c8abc47508b801786:/regcomp.h diff --git a/regcomp.h b/regcomp.h index 7df47d3..8f0b828 100644 --- a/regcomp.h +++ b/regcomp.h @@ -1,12 +1,13 @@ /* regcomp.h * * Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, - * 2000, 2001, 2002, 2003, 2005 by Larry Wall and others + * 2000, 2001, 2002, 2003, 2005, 2006, 2007, by Larry Wall and others * * You may distribute under the terms of either the GNU General Public * License or the Artistic License, as specified in the README file. * */ +#include "regcharclass.h" typedef OP OP_4tree; /* Will be redefined later. */ @@ -17,12 +18,36 @@ typedef OP OP_4tree; /* Will be redefined later. */ /* Be really agressive about optimising patterns with trie sequences? */ #define PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION 1 +/* Use old style unicode mappings for perl and posix character classes + * + * NOTE: Enabling this essentially breaks character class matching against unicode + * strings, so that POSIX char classes match when they shouldn't, and \d matches + * way more than 10 characters, and sometimes a charclass and its complement either + * both match or neither match. + * NOTE: Disabling this will cause various backwards compatibility issues to rear + * their head, and tests to fail. However it will make the charclass behaviour + * consistant regardless of internal string type, and make character class inversions + * consistant. The tests that fail in the regex engine are basically broken tests. + * + * Personally I think 5.12 should disable this for sure. Its a bit more debatable for + * 5.10, so for now im leaving it enabled. + * XXX: It is now enabled for 5.11/5.12 + * + * -demerphq + */ +#define PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS 1 + /* Should the optimiser take positive assertions into account? */ -#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 1 +#define PERL_ENABLE_POSITIVE_ASSERTION_STUDY 0 /* Not for production use: */ #define PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS 0 +/* Activate offsets code - set to if 1 to enable */ +#ifdef DEBUGGING +#define RE_TRACK_PATTERN_OFFSETS +#endif + /* Unless the next line is uncommented it is illegal to combine lazy matching with possessive matching. Frankly it doesn't make much sense to allow it as X*?+ matches nothing, X+?+ matches a single char only, @@ -96,21 +121,19 @@ typedef OP OP_4tree; /* Will be redefined later. */ /* This is the stuff that used to live in regexp.h that was truly private to the engine itself. It now lives here. */ -/* swap buffer for paren structs */ -typedef struct regexp_paren_ofs { - I32 *startp; - I32 *endp; -} regexp_paren_ofs; + typedef struct regexp_internal { -#ifdef DEBUGGING int name_list_idx; /* Optional data index of an array of paren names */ -#endif + union { + U32 *offsets; /* offset annotations 20001228 MJD + data about mapping the program to the + string - + offsets[0] is proglen when this is used + */ + U32 proglen; + } u; - U32 *offsets; /* offset annotations 20001228 MJD - data about mapping the program to the - string*/ - regexp_paren_ofs *swap; /* Swap copy of *startp / *endp */ regnode *regstclass; /* Optional startclass as identified or constructed by the optimiser */ struct reg_data *data; /* Additional miscellaneous data used by the program. @@ -181,15 +204,16 @@ struct regnode_charclass { U8 flags; U8 type; U16 next_off; - U32 arg1; + U32 arg1; /* used as ptr in S_regclass */ char bitmap[ANYOF_BITMAP_SIZE]; /* only compile-time */ }; -struct regnode_charclass_class { /* has [[:blah:]] classes */ - U8 flags; /* should have ANYOF_CLASS here */ +/* has runtime (locale) \d, \w, ..., [:posix:] classes */ +struct regnode_charclass_class { + U8 flags; /* ANYOF_CLASS bit must go here */ U8 type; U16 next_off; - U32 arg1; + U32 arg1; /* used as ptr in S_regclass */ char bitmap[ANYOF_BITMAP_SIZE]; /* both compile-time */ char classflags[ANYOF_CLASSBITMAP_SIZE]; /* and run-time */ }; @@ -248,6 +272,8 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #undef STRING #define OP(p) ((p)->type) +#define FLAGS(p) ((p)->flags) /* Caution: Doesn't apply to all \ + regnode types */ #define OPERAND(p) (((struct regnode_string *)p)->string) #define MASK(p) ((char*)OPERAND(p)) #define STR_LEN(p) (((struct regnode_string *)p)->str_len) @@ -283,25 +309,22 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define SIZE_ONLY (RExC_emit == &PL_regdummy) +/* Flags for node->flags of several of the node types */ +#define USE_UNI 0x01 + /* Flags for node->flags of ANYOF */ -#define ANYOF_CLASS 0x08 /* has [[:blah:]] classes */ +#define ANYOF_CLASS 0x08 /* has runtime \d, \w, [:posix:], ... */ +#define ANYOF_LARGE ANYOF_CLASS /* Same; name retained for back compat */ #define ANYOF_INVERT 0x04 #define ANYOF_FOLD 0x02 #define ANYOF_LOCALE 0x01 -/* Used for regstclass only */ -#define ANYOF_EOS 0x10 /* Can match an empty string too */ +/* EOS used for regstclass only */ +#define ANYOF_EOS 0x10 /* Can match an empty string too */ -/* There is a character or a range past 0xff */ -#define ANYOF_UNICODE 0x20 -#define ANYOF_UNICODE_ALL 0x40 /* Can match any char past 0xff */ - -/* size of node is large (includes class pointer) */ -#define ANYOF_LARGE 0x80 - -/* Are there any runtime flags on in this node? */ -#define ANYOF_RUNTIME(s) (ANYOF_FLAGS(s) & 0x0f) +#define ANYOF_UNICODE 0x20 /* Matches >= one thing past 0xff */ +#define ANYOF_UNICODE_ALL 0x40 /* Matches 0x100 - infinity */ #define ANYOF_FLAGS_ALL 0xff @@ -313,9 +336,9 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_NALNUM 1 #define ANYOF_SPACE 2 /* \s */ #define ANYOF_NSPACE 3 -#define ANYOF_DIGIT 4 +#define ANYOF_DIGIT 4 /* \d */ #define ANYOF_NDIGIT 5 -#define ANYOF_ALNUMC 6 /* isalnum(3), utf8::IsAlnum, ALNUMC */ +#define ANYOF_ALNUMC 6 /* [[:alnum:]] isalnum(3), utf8::IsAlnum, ALNUMC */ #define ANYOF_NALNUMC 7 #define ANYOF_ALPHA 8 #define ANYOF_NALPHA 9 @@ -342,6 +365,14 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define ANYOF_MAX 32 +/* pseudo classes, not stored in the class bitmap, but used as flags + during compilation of char classes */ + +#define ANYOF_VERTWS (ANYOF_MAX+1) +#define ANYOF_NVERTWS (ANYOF_MAX+2) +#define ANYOF_HORIZWS (ANYOF_MAX+3) +#define ANYOF_NHORIZWS (ANYOF_MAX+4) + /* Backward source code compatibility. */ #define ANYOF_ALNUML ANYOF_ALNUM @@ -406,6 +437,7 @@ struct regnode_charclass_class { /* has [[:blah:]] classes */ #define REG_TOP_LEVEL_BRANCHES 0x00000040 #define REG_SEEN_VERBARG 0x00000080 #define REG_SEEN_CUTGROUP 0x00000100 +#define REG_SEEN_RUN_ON_COMMENT 0x00000200 START_EXTERN_C @@ -415,45 +447,22 @@ START_EXTERN_C #include "regnodes.h" #endif -/* The following have no fixed length. U8 so we can do strchr() on it. */ -#ifndef DOINIT -EXTCONST U8 PL_varies[]; -#else -EXTCONST U8 PL_varies[] = { - BRANCH, BACK, STAR, PLUS, CURLY, CURLYX, REF, REFF, REFFL, - WHILEM, CURLYM, CURLYN, BRANCHJ, IFTHEN, SUSPEND, CLUMP, - NREF, NREFF, NREFFL, - 0 -}; -#endif - -/* The following always have a length of 1. U8 we can do strchr() on it. */ -/* (Note that length 1 means "one character" under UTF8, not "one octet".) */ -#ifndef DOINIT -EXTCONST U8 PL_simple[]; -#else -EXTCONST U8 PL_simple[] = { - REG_ANY, SANY, CANY, - ANYOF, - ALNUM, ALNUML, - NALNUM, NALNUML, - SPACE, SPACEL, - NSPACE, NSPACEL, - DIGIT, NDIGIT, - 0 -}; -#endif - #ifndef PLUGGABLE_RE_EXTENSION #ifndef DOINIT EXTCONST regexp_engine PL_core_reg_engine; #else /* DOINIT */ EXTCONST regexp_engine PL_core_reg_engine = { - Perl_re_compile, - Perl_regexec_flags, + Perl_re_compile, + Perl_regexec_flags, Perl_re_intuit_start, Perl_re_intuit_string, - Perl_regfree_internal, + Perl_regfree_internal, + Perl_reg_numbered_buff_fetch, + Perl_reg_numbered_buff_store, + Perl_reg_numbered_buff_length, + Perl_reg_named_buff, + Perl_reg_named_buff_iter, + Perl_reg_qr_package, #if defined(USE_ITHREADS) Perl_regdupe_internal #endif @@ -467,11 +476,12 @@ END_EXTERN_C /* .what is a character array with one character for each member of .data * The character describes the function of the corresponding .data item: + * a - AV for paren_name_list under DEBUGGING * f - start-class data for regstclass optimization * n - Root of op tree for (?{EVAL}) item * o - Start op for (?{EVAL}) item * p - Pad for (?{EVAL}) item - * s - swash for unicode-style character class, and the multicharacter + * s - swash for Unicode-style character class, and the multicharacter * strings resulting from casefolding the single-character entries * in the character class * t - trie struct @@ -507,7 +517,10 @@ struct reg_data { #define check_offset_max substrs->data[2].max_offset #define check_end_shift substrs->data[2].end_shift - +#define RX_ANCHORED_SUBSTR(rx) (((struct regexp *)SvANY(rx))->anchored_substr) +#define RX_ANCHORED_UTF8(rx) (((struct regexp *)SvANY(rx))->anchored_utf8) +#define RX_FLOAT_SUBSTR(rx) (((struct regexp *)SvANY(rx))->float_substr) +#define RX_FLOAT_UTF8(rx) (((struct regexp *)SvANY(rx))->float_utf8) /* trie related stuff */ @@ -543,6 +556,15 @@ struct _reg_trie_state { } trans; }; +/* info per word; indexed by wordnum */ +typedef struct { + U16 prev; /* previous word in acceptance chain; eg in + * zzz|abc|ab/ after matching the chars abc, the + * accepted word is #2, and the previous accepted + * word is #3 */ + U32 len; /* how many chars long is this word? */ + U32 accept; /* accept state for this word */ +} reg_trie_wordinfo; typedef struct _reg_trie_state reg_trie_state; @@ -555,20 +577,19 @@ typedef struct _reg_trie_trans reg_trie_trans; optimisation in Perl_regdupe. */ struct _reg_trie_data { U32 refcount; /* number of times this trie is referenced */ - U16 uniquecharcount; /* unique chars in trie (width of trans table) */ U32 lasttrans; /* last valid transition element */ U16 *charmap; /* byte to charid lookup array */ reg_trie_state *states; /* state data */ reg_trie_trans *trans; /* array of transition elements */ char *bitmap; /* stclass bitmap */ + U16 *jump; /* optional 1 indexed array of offsets before tail + for the node following a given word. */ + reg_trie_wordinfo *wordinfo; /* array of info per word */ + U16 uniquecharcount; /* unique chars in trie (width of trans table) */ U32 startstate; /* initial state - used for common prefix optimisation */ STRLEN minlen; /* minimum length of words in trie - build/opt only? */ STRLEN maxlen; /* maximum length of words in trie - build/opt only? */ - U32 *wordlen; /* array of lengths of words */ - U16 *jump; /* optional 1 indexed array of offsets before tail - for the node following a given word. */ - U16 *nextword; /* optional 1 indexed array to support linked list - of duplicate wordnums */ + U32 prefixlen; /* #chars in common prefix */ U32 statecount; /* Build only - number of states in the states array (including the unused zero state) */ U32 wordcount; /* Build only */ @@ -594,9 +615,9 @@ typedef struct _reg_trie_data reg_trie_data; optimisation in Perl_regdupe. */ struct _reg_ac_data { U32 refcount; + U32 trie; U32 *fail; reg_trie_state *states; - U32 trie; }; typedef struct _reg_ac_data reg_ac_data; @@ -670,6 +691,7 @@ re.pm, especially to the documentation. #define RE_DEBUG_COMPILE_OPTIMISE 0x000002 #define RE_DEBUG_COMPILE_TRIE 0x000004 #define RE_DEBUG_COMPILE_DUMP 0x000008 +#define RE_DEBUG_COMPILE_FLAGS 0x000010 /* Execute */ #define RE_DEBUG_EXECUTE_MASK 0x00FF00 @@ -684,6 +706,8 @@ re.pm, especially to the documentation. #define RE_DEBUG_EXTRA_OFFDEBUG 0x040000 #define RE_DEBUG_EXTRA_STATE 0x080000 #define RE_DEBUG_EXTRA_OPTIMISE 0x100000 +#define RE_DEBUG_EXTRA_BUFFERS 0x400000 +#define RE_DEBUG_EXTRA_GPOS 0x800000 /* combined */ #define RE_DEBUG_EXTRA_STACK 0x280000 @@ -701,7 +725,8 @@ re.pm, especially to the documentation. if (re_debug_flags & RE_DEBUG_COMPILE_DUMP) x ) #define DEBUG_TRIE_COMPILE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_COMPILE_TRIE) x ) - +#define DEBUG_FLAGS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_COMPILE_FLAGS) x ) /* Execute */ #define DEBUG_EXECUTE_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXECUTE_MASK) x ) @@ -721,6 +746,9 @@ re.pm, especially to the documentation. if (re_debug_flags & RE_DEBUG_EXTRA_STATE) x ) #define DEBUG_STACK_r(x) DEBUG_r( \ if (re_debug_flags & RE_DEBUG_EXTRA_STACK) x ) +#define DEBUG_BUFFERS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_BUFFERS) x ) + #define DEBUG_OPTIMISE_MORE_r(x) DEBUG_r( \ if ((RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE) == \ (re_debug_flags & (RE_DEBUG_EXTRA_OPTIMISE|RE_DEBUG_COMPILE_OPTIMISE)) ) x ) @@ -735,11 +763,15 @@ re.pm, especially to the documentation. #define DEBUG_TRIE_r(x) DEBUG_r( \ if (re_debug_flags & (RE_DEBUG_COMPILE_TRIE \ | RE_DEBUG_EXECUTE_TRIE )) x ) +#define DEBUG_GPOS_r(x) DEBUG_r( \ + if (re_debug_flags & RE_DEBUG_EXTRA_GPOS) x ) /* initialization */ -/* get_sv() can return NULL during global destruction. */ +/* get_sv() can return NULL during global destruction. re_debug_flags can get + * clobbered by a longjmp, so must be initialized */ #define GET_RE_DEBUG_FLAGS DEBUG_r({ \ SV * re_debug_flags_sv = NULL; \ + re_debug_flags = 0; \ re_debug_flags_sv = get_sv(RE_DEBUG_FLAGS, 1); \ if (re_debug_flags_sv) { \ if (!SvIOK(re_debug_flags_sv)) \ @@ -750,28 +782,28 @@ re.pm, especially to the documentation. #ifdef DEBUGGING -#define GET_RE_DEBUG_FLAGS_DECL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS; +#define GET_RE_DEBUG_FLAGS_DECL VOL IV re_debug_flags = 0; GET_RE_DEBUG_FLAGS; #define RE_PV_COLOR_DECL(rpv,rlen,isuni,dsv,pv,l,m,c1,c2) \ const char * const rpv = \ pv_pretty((dsv), (pv), (l), (m), \ PL_colors[(c1)],PL_colors[(c2)], \ - ((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \ + PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ); \ const int rlen = SvCUR(dsv) #define RE_SV_ESCAPE(rpv,isuni,dsv,sv,m) \ const char * const rpv = \ pv_pretty((dsv), (SvPV_nolen_const(sv)), (SvCUR(sv)), (m), \ PL_colors[(c1)],PL_colors[(c2)], \ - ((isuni) ? PERL_PV_ESCAPE_UNI : 0) ) + PERL_PV_ESCAPE_RE |((isuni) ? PERL_PV_ESCAPE_UNI : 0) ) #define RE_PV_QUOTED_DECL(rpv,isuni,dsv,pv,l,m) \ const char * const rpv = \ pv_pretty((dsv), (pv), (l), (m), \ PL_colors[0], PL_colors[1], \ - ( PERL_PV_PRETTY_QUOTE | PERL_PV_PRETTY_ELIPSES | \ + ( PERL_PV_PRETTY_QUOTE | PERL_PV_ESCAPE_RE | PERL_PV_PRETTY_ELLIPSES | \ ((isuni) ? PERL_PV_ESCAPE_UNI : 0)) \ - ) + ) #define RE_SV_DUMPLEN(ItEm) (SvCUR(ItEm) - (SvTAIL(ItEm)!=0)) #define RE_SV_TAIL(ItEm) (SvTAIL(ItEm) ? "$" : "") @@ -787,4 +819,12 @@ re.pm, especially to the documentation. #endif /* DEBUG RELATED DEFINES */ - +/* + * Local variables: + * c-indentation-style: bsd + * c-basic-offset: 4 + * indent-tabs-mode: t + * End: + * + * ex: set ts=8 sts=4 sw=4 noet: + */