X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/62fed28b592e017778cf07b732b66755ea7b0b61..deabda197e63bdf85e3277cea5e6a0782d7213c9:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 01fb087..7c7f526 100644 --- a/regcomp.c +++ b/regcomp.c @@ -27,7 +27,7 @@ */ /* The names of the functions have been changed from regcomp and - * regexec to pregcomp and pregexec in order to avoid conflicts + * regexec to pregcomp and pregexec in order to avoid conflicts * with the POSIX routines of the same names. */ @@ -85,6 +85,8 @@ # include "regcomp.h" #endif +#include "dquote_static.c" + #ifdef op #undef op #endif /* op */ @@ -221,7 +223,11 @@ typedef struct RExC_state_t { #define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren) #define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren)) -#define REQUIRE_UTF8 RExC_utf8 = 1 +/* If not already in utf8, do a longjmp back to the beginning */ +#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */ +#define REQUIRE_UTF8 STMT_START { \ + if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \ + } STMT_END /* About scan_data_t. @@ -229,7 +235,7 @@ typedef struct RExC_state_t { various inplace (keyhole style) optimisations. In addition study_chunk and scan_commit populate this data structure with information about what strings MUST appear in the pattern. We look for the longest - string that must appear for at a fixed location, and we look for the + string that must appear at a fixed location, and we look for the longest string that may appear at a floating location. So for instance in the pattern: @@ -250,14 +256,14 @@ typedef struct RExC_state_t { - offset or min_offset This is the position the string must appear at, or not before. It also implicitly (when combined with minlenp) tells us how many - character must match before the string we are searching. - Likewise when combined with minlenp and the length of the string + characters must match before the string we are searching for. + Likewise when combined with minlenp and the length of the string it tells us how many characters must appear after the string we have found. - max_offset Only used for floating strings. This is the rightmost point that - the string can appear at. Ifset to I32 max it indicates that the + the string can appear at. If set to I32 max it indicates that the string can occur infinitely far to the right. - minlenp @@ -362,9 +368,10 @@ static const scan_data_t zero_scan_data = #define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */ #define SCF_SEEN_ACCEPT 0x8000 -#define UTF (RExC_utf8 != 0) -#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0) -#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0) +#define UTF cBOOL(RExC_utf8) +#define LOC cBOOL(RExC_flags & RXf_PMf_LOCALE) +#define UNI_SEMANTICS cBOOL(RExC_flags & RXf_PMf_UNICODE) +#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) #define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 @@ -599,7 +606,7 @@ static const scan_data_t zero_scan_data = #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN -#endif /*RE_TRACK_PATTERN_OFFSETS*/ +#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ @@ -703,6 +710,7 @@ S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *c cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; + cl->flags |= ANYOF_FOLD; } /* Can match anything (initialization) */ @@ -772,6 +780,9 @@ S_cl_and(struct regnode_charclass_class *cl, if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; + if (!(and_with->flags & ANYOF_FOLD)) + cl->flags &= ~ANYOF_FOLD; + if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE && !(and_with->flags & ANYOF_INVERT)) { cl->flags &= ~ANYOF_UNICODE_ALL; @@ -837,6 +848,9 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; + if (or_with->flags & ANYOF_FOLD) + cl->flags |= ANYOF_FOLD; + if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && ARG(cl) != ARG(or_with)) { cl->flags |= ANYOF_UNICODE_ALL; @@ -874,7 +888,7 @@ S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, con Dumps the final compressed table form of the trie to Perl_debug_log. Used for debugging make_trie(). */ - + STATIC void S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, AV *revcharmap, U32 depth) @@ -1417,7 +1431,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs *TODO* If we keep track of how many times each character is used we can remap the columns so that the table compression later on is more - efficient in terms of memory by ensuring most common value is in the + efficient in terms of memory by ensuring the most common value is in the middle and the least common are on the outside. IMO this would be better than a most to least common mapping as theres a decent chance the most common letter will share a node with the least common, meaning the node @@ -1719,7 +1733,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs We then construct the trie using only the .next slots of the entry structs. - We use the .check field of the first entry of the node temporarily to + We use the .check field of the first entry of the node temporarily to make compression both faster and easier by keeping track of how many non zero fields are in the node. @@ -1824,7 +1838,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs - Each states[] entry contains a .base field which indicates the index in the state[] array wheres its transition data is stored. - - If .base is 0 there are no valid transitions from that node. + - If .base is 0 there are no valid transitions from that node. - If .base is nonzero then charid is added to it to find an entry in the trans array. @@ -1838,11 +1852,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs XXX - wrong maybe? The following process inplace converts the table to the compressed - table: We first do not compress the root node 1,and mark its all its + table: We first do not compress the root node 1,and mark all its .check pointers as 1 and set its .base pointer as 1 as well. This - allows to do a DFA construction from the compressed table later, and - ensures that any .base pointers we calculate later are greater than - 0. + allows us to do a DFA construction from the compressed table later, + and ensures that any .base pointers we calculate later are greater + than 0. - We set 'pos' to indicate the first entry of the second node. @@ -1938,7 +1952,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs PerlMemShared_realloc( trie->trans, trie->lasttrans * sizeof(reg_trie_trans) ); - { /* Modify the program and insert the new TRIE node*/ + { /* Modify the program and insert the new TRIE node */ U8 nodetype =(U8)(flags & 0xFF); char *str=NULL; @@ -1955,7 +1969,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs depending on whether the thing following (in 'last') is a branch or not and whther first is the startbranch (ie is it a sub part of the alternation or is it the whole thing.) - Assuming its a sub part we conver the EXACT otherwise we convert + Assuming its a sub part we convert the EXACT otherwise we convert the whole branch sequence, including the first. */ /* Find the node we are going to overwrite */ @@ -2122,9 +2136,15 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if (trie->jump) trie->jump[0] = (U16)(nextbranch - convert); - /* XXXX */ - if ( !trie->states[trie->startstate].wordnum && trie->bitmap && - ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) + /* If the start state is not accepting (meaning there is no empty string/NOTHING) + * and there is a bitmap + * and the first "jump target" node we found leaves enough room + * then convert the TRIE node into a TRIEC node, with the bitmap + * embedded inline in the opcode - this is hypothetically faster. + */ + if ( !trie->states[trie->startstate].wordnum + && trie->bitmap + && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) ) { OP( convert ) = TRIEC; Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char); @@ -2169,8 +2189,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs * so, point the first word's .prev field at the second word. If the * second already has a .prev field set, stop now. This will be the * case either if we've already processed that word's accept state, - * or that that state had multiple words, and the overspill words - * were already linked up earlier. + * or that state had multiple words, and the overspill words were + * already linked up earlier. */ { U16 word; @@ -2216,22 +2236,22 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STATIC void S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) { -/* The Trie is constructed and compressed now so we can build a fail array now if its needed +/* The Trie is constructed and compressed now so we can build a fail array if it's needed This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88 ISBN 0-201-10088-6 We find the fail state for each state in the trie, this state is the longest proper - suffix of the current states 'word' that is also a proper prefix of another word in our - trie. State 1 represents the word '' and is the thus the default fail state. This allows + suffix of the current state's 'word' that is also a proper prefix of another word in our + trie. State 1 represents the word '' and is thus the default fail state. This allows the DFA not to have to restart after its tried and failed a word at a given point, it simply continues as though it had been matching the other word in the first place. Consider 'abcdgu'=~/abcdefg|cdgu/ When we get to 'd' we are still matching the first word, we would encounter 'g' which would - fail, which would bring use to the state representing 'd' in the second word where we would - try 'g' and succeed, prodceding to match 'cdgu'. + fail, which would bring us to the state representing 'd' in the second word where we would + try 'g' and succeed, proceeding to match 'cdgu'. */ /* add a fail transition */ const U32 trie_offset = ARG(source); @@ -2755,13 +2775,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, We have two cases - 1. patterns where the whole set of branch can be converted. + 1. patterns where the whole set of branches can be converted. 2. patterns where only a subset can be converted. In case 1 we can replace the whole set with a single regop for the trie. In case 2 we need to keep the start and end - branchs so + branches so 'BRANCH EXACT; BRANCH EXACT; BRANCH X' becomes BRANCH TRIE; BRANCH X; @@ -3200,7 +3220,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* These are the cases when once a subexpression fails at a particular position, it cannot succeed even after backtracking at the enclosing scope. - + XXXX what if minimal match and we are at the initial run of {n,m}? */ if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) @@ -3341,7 +3361,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, #if 0 while ( nxt1 && (OP(nxt1) != WHILEM)) { regnode *nnxt = regnext(nxt1); - if (nnxt == nxt) { if (reg_off_by_arg[OP(nxt1)]) ARG_SET(nxt1, nxt2 - nxt1); @@ -3363,7 +3382,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else if ((OP(oscan) == CURLYX) && (flags & SCF_WHILEM_VISITED_POS) /* See the comment on a similar expression above. - However, this time it not a subexpression + However, this time it's not a subexpression we care about, but the expression itself. */ && (maxcount == REG_INFTY) && data && ++data->whilem_c < 16) { @@ -3408,7 +3427,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (UTF) old = utf8_hop((U8*)s, old) - (U8*)s; - l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); @@ -3496,13 +3514,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { for (value = 0; value < 256; value++) if (!is_VERTWS_cp(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); - } - else { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { for (value = 0; value < 256; value++) if (is_VERTWS_cp(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + ANYOF_BITMAP_SET(data->start_class, value); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); flags &= ~SCF_DO_STCLASS; @@ -3515,7 +3533,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->pos_delta += 1; data->longest = &(data->longest_float); } - } else if (OP(scan) == FOLDCHAR) { int d = ARG(scan)==0xDF ? 1 : 2; @@ -3573,19 +3590,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM); - for (value = 0; value < 256; value++) - if (!isALNUM(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM); - else { - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } } break; case ALNUML: @@ -3602,9 +3637,19 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM); - for (value = 0; value < 256; value++) - if (isALNUM(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isWORDCHAR_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isALNUM(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { @@ -3613,7 +3658,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (!isALNUM(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -3631,18 +3676,37 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE); - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { - if (data->start_class->flags & ANYOF_LOCALE) + if (data->start_class->flags & ANYOF_LOCALE) { ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); + } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } } } break; @@ -3660,19 +3724,38 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (flags & SCF_DO_STCLASS_AND) { if (!(data->start_class->flags & ANYOF_LOCALE)) { ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE); - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_CLEAR(data->start_class, value); + if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (isSPACE_L1(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } else { + for (value = 0; value < 256; value++) { + if (isSPACE(value)) { + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + } + } } } else { if (data->start_class->flags & ANYOF_LOCALE) ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE); - else { - for (value = 0; value < 256; value++) - if (!isSPACE(value)) - ANYOF_BITMAP_SET(data->start_class, value); - } + else if (FLAGS(scan) & USE_UNI) { + for (value = 0; value < 256; value++) { + if (!isSPACE_L1(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } + else { + for (value = 0; value < 256; value++) { + if (!isSPACE(value)) { + ANYOF_BITMAP_SET(data->start_class, value); + } + } + } } break; case NSPACEL: @@ -3702,7 +3785,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -3719,7 +3802,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, else { for (value = 0; value < 256; value++) if (!isDIGIT(value)) - ANYOF_BITMAP_SET(data->start_class, value); + ANYOF_BITMAP_SET(data->start_class, value); } } break; @@ -3823,7 +3906,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, int f = 0; /* We use SAVEFREEPV so that when the full compile is finished perl will clean up the allocated - minlens when its all done. This was we don't + minlens when it's all done. This way we don't have to worry about freeing them when we know they wont be used, which would be a pain. */ @@ -4266,18 +4349,25 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) struct regexp *r; register regexp_internal *ri; STRLEN plen; - char *exp = SvPV(pattern, plen); - char* xend = exp + plen; + char *exp; + char* xend; regnode *scan; I32 flags; I32 minlen = 0; + + /* these are all flags - maybe they should be turned + * into a single int with different bit masks */ + I32 sawlookahead = 0; I32 sawplus = 0; I32 sawopen = 0; + + U8 jump_ret = 0; + dJMPENV; scan_data_t data; RExC_state_t RExC_state; RExC_state_t * const pRExC_state = &RExC_state; #ifdef TRIE_STUDY_OPT - int restudied= 0; + int restudied; RExC_state_t copyRExC_state; #endif GET_RE_DEBUG_FLAGS_DECL; @@ -4288,15 +4378,56 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 pm_flags) RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); - DEBUG_COMPILE_r({ - SV *dsv= sv_newmortal(); - RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, exp, plen, 60); - PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", - PL_colors[4],PL_colors[5],s); - }); -redo_first_pass: + /* Longjmp back to here if have to switch in midstream to utf8 */ + if (! RExC_orig_utf8) { + JMPENV_PUSH(jump_ret); + } + + if (jump_ret == 0) { /* First time through */ + exp = SvPV(pattern, plen); + xend = exp + plen; + + DEBUG_COMPILE_r({ + SV *dsv= sv_newmortal(); + RE_PV_QUOTED_DECL(s, RExC_utf8, + dsv, exp, plen, 60); + PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + PL_colors[4],PL_colors[5],s); + }); + } + else { /* longjumped back */ + STRLEN len = plen; + + /* If the cause for the longjmp was other than changing to utf8, pop + * our own setjmp, and longjmp to the correct handler */ + if (jump_ret != UTF8_LONGJMP) { + JMPENV_POP; + JMPENV_JUMP(jump_ret); + } + + GET_RE_DEBUG_FLAGS; + + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + -- dmq */ + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)SvPV(pattern, plen), &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8 = 1; + SAVEFREEPV(exp); + } + +#ifdef TRIE_STUDY_OPT + restudied = 0; +#endif + RExC_precomp = exp; RExC_flags = pm_flags; RExC_sawback = 0; @@ -4335,24 +4466,14 @@ redo_first_pass: RExC_precomp = NULL; return(NULL); } - if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents Unicode - codepoints outside of the byte range, such as via \x{100}. If we - detect such a sequence we have to convert the entire pattern to utf8 - and then recompile, as our sizing calculation will have been based - on 1 byte == 1 character, but we will need to use utf8 to encode - at least some part of the pattern, and therefore must convert the whole - thing. - XXX: somehow figure out how to make this less expensive... - -- dmq */ - STRLEN len = plen; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, - "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); - xend = exp + len; - RExC_orig_utf8 = RExC_utf8; - SAVEFREEPV(exp); - goto redo_first_pass; + + /* Here, finished first pass. Get rid of our setjmp, which we added for + * efficiency only if the passed-in string wasn't in utf8, as shown by + * RExC_orig_utf8. But if the first pass was redone, that variable will be + * 1 here even though the original string wasn't utf8, but in this case + * there will have been a long jump */ + if (jump_ret == UTF8_LONGJMP || ! RExC_orig_utf8) { + JMPENV_POP; } DEBUG_PARSE_r({ PerlIO_printf(Perl_debug_log, @@ -4394,39 +4515,56 @@ redo_first_pass: r->extflags = pm_flags; { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_charset = cBOOL(r->extflags & (RXf_PMf_LOCALE|RXf_PMf_UNICODE)); + + /* The caret is output if there are any defaults: if not all the STD + * flags are set, or if no character set specifier is needed */ + bool has_default = + (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD) + || ! has_charset); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - const STRLEN wraplen = plen + has_minus + has_p + has_runon + /* Allocate for the worst case, which is all the std flags are turned + * on. If more precision is desired, we could do a population count of + * the flags set. This could be done with a small lookup table, or by + * shifting, masking and adding, or even, when available, assembly + * language for a machine-language population count. + * We never output a minus, as all those are defaults, so are + * covered by the caret */ + const STRLEN wraplen = plen + has_p + has_runon + + has_default /* If needs a caret */ + + has_charset /* If needs a character set specifier */ + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); - p = sv_grow(MUTABLE_SV(rx), wraplen + 1); - SvCUR_set(rx, wraplen); + p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ SvPOK_on(rx); SvFLAGS(rx) |= SvUTF8(pattern); *p++='('; *p++='?'; + + /* If a default, cover it using the caret */ + if (has_default) { + *p++= DEFAULT_PAT_MOD; + } + if (has_charset) { + if (r->extflags & RXf_PMf_LOCALE) { + *p++ = LOCALE_PAT_MOD; + } else { + *p++ = UNICODE_PAT_MOD; + } + } if (has_p) *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { - char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; - char *colon = r + 1; char ch; - while((ch = *fptr++)) { if(reganch & 1) *p++ = ch; - else - *r-- = ch; reganch >>= 1; } - if(has_minus) { - *r = '-'; - p = colon; - } } *p++ = ':'; @@ -4438,6 +4576,7 @@ redo_first_pass: *p++ = '\n'; *p++ = ')'; *p = 0; + SvCUR_set(rx, p - SvPVX_const(rx)); } r->intflags = 0; @@ -4489,7 +4628,7 @@ redo_first_pass: } reStudy: - r->minlen = minlen = sawplus = sawopen = 0; + r->minlen = minlen = sawlookahead = sawplus = sawopen = 0; Zero(r->substrs, 1, struct reg_substr_data); #ifdef TRIE_STUDY_OPT @@ -4537,7 +4676,6 @@ reStudy: I32 last_close = 0; /* pointed to by data */ regnode *first= scan; regnode *first_next= regnext(first); - /* * Skip introductions and multiplicators >= 1 * so that we can extract the 'meat' of the pattern that must @@ -4553,7 +4691,7 @@ reStudy: /* An OR of *one* alternative - should not happen now. */ (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ - (OP(first) == IFMATCH && !first->flags) || + (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ @@ -4640,7 +4778,7 @@ reStudy: first = NEXTOPER(first); goto again; } - if (sawplus && (!sawopen || !RExC_sawback) + if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -5638,6 +5776,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; + bool has_use_defaults = FALSE; RExC_parse++; paren = *RExC_parse++; @@ -5767,7 +5906,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse++; case '=': /* (?=...) */ RExC_seen_zerolen++; - break; + break; case '!': /* (?!...) */ RExC_seen_zerolen++; if (*RExC_parse == ')') { @@ -5849,7 +5988,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* Diagram of capture buffer numbering. Top line is the normal capture buffer numbers - Botton line is the negative indexing as from + Bottom line is the negative indexing as from the X (the (?-2)) + 1 2 3 4 5 X 6 7 @@ -6092,12 +6231,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; + case DEFAULT_PAT_MOD: /* Use default flags with the exceptions + that follow */ + has_use_defaults = TRUE; + STD_PMMOD_FLAGS_CLEAR(&RExC_flags); + RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE); + goto parse_flags; default: --RExC_parse; parse_flags: /* (?i) */ { U32 posflags = 0, negflags = 0; U32 *flagsp = &posflags; + bool has_charset_modifier = 0; while (*RExC_parse) { /* && strchr("iogcmsx", *RExC_parse) */ @@ -6105,6 +6251,32 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case LOCALE_PAT_MOD: + if (has_charset_modifier || flagsp == &negflags) { + goto fail_modifiers; + } + posflags |= RXf_PMf_LOCALE; + negflags |= RXf_PMf_UNICODE; + has_charset_modifier = 1; + break; + case UNICODE_PAT_MOD: + if (has_charset_modifier || flagsp == &negflags) { + goto fail_modifiers; + } + posflags |= RXf_PMf_UNICODE; + negflags |= RXf_PMf_LOCALE; + has_charset_modifier = 1; + break; + case DUAL_PAT_MOD: + if (has_use_defaults + || has_charset_modifier + || flagsp == &negflags) + { + goto fail_modifiers; + } + negflags |= (RXf_PMf_LOCALE|RXf_PMf_UNICODE); + has_charset_modifier = 1; + break; case ONCE_PAT_MOD: /* 'o' */ case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { @@ -6145,7 +6317,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; case '-': - if (flagsp == &negflags) { + /* A flag is a default iff it is following a minus, so + * if there is a minus, it means will be trying to + * re-specify a default which is an error */ + if (has_use_defaults || flagsp == &negflags) { + fail_modifiers: RExC_parse++; vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); /*NOTREACHED*/ @@ -7136,31 +7312,61 @@ tryagain: *flagp |= HASWIDTH; goto finish_meta_pat; case 'w': - ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(ALNUML)); + } else { + ret = reg_node(pRExC_state, (U8)(ALNUM)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'W': - ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NALNUML)); + } else { + ret = reg_node(pRExC_state, (U8)(NALNUM)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(BOUNDL)); + } else { + ret = reg_node(pRExC_state, (U8)(BOUND)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= SIMPLE; goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NBOUNDL)); + } else { + ret = reg_node(pRExC_state, (U8)(NBOUND)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= SIMPLE; goto finish_meta_pat; case 's': - ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(SPACEL)); + } else { + ret = reg_node(pRExC_state, (U8)(SPACE)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'S': - ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); + if (LOC) { + ret = reg_node(pRExC_state, (U8)(NSPACEL)); + } else { + ret = reg_node(pRExC_state, (U8)(NSPACE)); + FLAGS(ret) = (UNI_SEMANTICS) ? USE_UNI : 0; + } *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'd': @@ -7527,8 +7733,9 @@ tryagain: case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { - I32 flags = 0; + (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) + { + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; STRLEN numlen = 3; ender = grok_oct(p, &numlen, &flags, NULL); if (ender > 0xff) { @@ -7866,43 +8073,55 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } } - -#define _C_C_T_(NAME,TEST,WORD) \ +/* No locale test */ +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ ANYOF_##NAME: \ - if (LOC) \ - ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ - else { \ for (value = 0; value < 256; value++) \ if (TEST) \ ANYOF_BITMAP_SET(ret, value); \ - } \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ - if (LOC) \ - ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ - else { \ for (value = 0; value < 256; value++) \ if (!TEST) \ ANYOF_BITMAP_SET(ret, value); \ - } \ yesno = '!'; \ what = WORD; \ break -#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +/* Like the above, but there are differences if we are in uni-8-bit or not, so + * there are two tests passed in, to use depending on that. There aren't any + * cases where the label is different from the name, so no need for that + * parameter */ +#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \ ANYOF_##NAME: \ - for (value = 0; value < 256; value++) \ - if (TEST) \ - ANYOF_BITMAP_SET(ret, value); \ + if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \ + else if (UNI_SEMANTICS) { \ + for (value = 0; value < 256; value++) { \ + if (TEST_8) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ + else { \ + for (value = 0; value < 256; value++) { \ + if (TEST_7) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ yesno = '+'; \ what = WORD; \ break; \ case ANYOF_N##NAME: \ - for (value = 0; value < 256; value++) \ - if (!TEST) \ - ANYOF_BITMAP_SET(ret, value); \ + if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \ + else if (UNI_SEMANTICS) { \ + for (value = 0; value < 256; value++) { \ + if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ + else { \ + for (value = 0; value < 256; value++) { \ + if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \ + } \ + } \ yesno = '!'; \ what = WORD; \ break @@ -7950,7 +8169,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) #ifdef EBCDIC UV literal_endpoint = 0; #endif - UV stored = 0; /* number of chars stored in the class */ + UV stored = 0; /* 0, 1, or more than 1 chars stored in the class */ regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ @@ -8151,9 +8370,10 @@ parseit: value = grok_bslash_c(*RExC_parse++, SIZE_ONLY); break; case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': + case '5': case '6': case '7': { - I32 flags = 0; + /* Take 1-3 octal digits */ + I32 flags = PERL_SCAN_SILENT_ILLDIGIT; numlen = 3; value = grok_oct(--RExC_parse, &numlen, &flags, NULL); RExC_parse += numlen; @@ -8171,10 +8391,12 @@ parseit: break; } default: - if (!SIZE_ONLY && isALPHA(value)) + /* Allow \_ to not give an error */ + if (!SIZE_ONLY && isALNUM(value) && value != '_') { ckWARN2reg(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value); + } break; } } /* end of \blah */ @@ -8185,10 +8407,23 @@ parseit: if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ - if (!SIZE_ONLY && !need_class) - ANYOF_CLASS_ZERO(ret); - - need_class = 1; + /* What matches in a locale is not known until runtime, so need to + * (one time per class) allocate extra space to pass to regexec. + * The space will contain a bit for each named class that is to be + * matched against. This isn't needed for \p{} and pseudo-classes, + * as they are not affected by locale, and hence are dealt with + * separately */ + if (LOC && namedclass < ANYOF_MAX && ! need_class) { + need_class = 1; + if (SIZE_ONLY) { + RExC_size += ANYOF_CLASS_ADD_SKIP; + } + else { + RExC_emit += ANYOF_CLASS_ADD_SKIP; + ANYOF_CLASS_ZERO(ret); + } + ANYOF_FLAGS(ret) |= ANYOF_CLASS|ANYOF_LARGE; + } /* a bad range like a-\d, a-[:digit:] ? */ if (range) { @@ -8228,24 +8463,26 @@ parseit: * --jhi */ switch ((I32)namedclass) { - case _C_C_T_(ALNUMC, isALNUMC(value), POSIX_CC_UNI_NAME("Alnum")); - case _C_C_T_(ALPHA, isALPHA(value), POSIX_CC_UNI_NAME("Alpha")); - case _C_C_T_(BLANK, isBLANK(value), POSIX_CC_UNI_NAME("Blank")); - case _C_C_T_(CNTRL, isCNTRL(value), POSIX_CC_UNI_NAME("Cntrl")); - case _C_C_T_(GRAPH, isGRAPH(value), POSIX_CC_UNI_NAME("Graph")); - case _C_C_T_(LOWER, isLOWER(value), POSIX_CC_UNI_NAME("Lower")); - case _C_C_T_(PRINT, isPRINT(value), POSIX_CC_UNI_NAME("Print")); - case _C_C_T_(PSXSPC, isPSXSPC(value), POSIX_CC_UNI_NAME("Space")); - case _C_C_T_(PUNCT, isPUNCT(value), POSIX_CC_UNI_NAME("Punct")); - case _C_C_T_(UPPER, isUPPER(value), POSIX_CC_UNI_NAME("Upper")); + case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum"); + case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha"); + case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank"); + case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl"); + case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph"); + case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower"); + case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint"); + case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace"); + case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct"); + case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper"); #ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS - case _C_C_T_(ALNUM, isALNUM(value), "Word"); - case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); + /* \s, \w match all unicode if utf8. */ + case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl"); + case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word"); #else - case _C_C_T_(SPACE, isSPACE(value), "PerlSpace"); - case _C_C_T_(ALNUM, isALNUM(value), "PerlWord"); + /* \s, \w match ascii and locale only */ + case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace"); + case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord"); #endif - case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit"); case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: @@ -8317,8 +8554,7 @@ parseit: /* Strings such as "+utf8::isWord\n" */ Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what); } - if (LOC) - ANYOF_FLAGS(ret) |= ANYOF_CLASS; + stored+=2; /* can't optimize this class */ continue; } } /* end of namedclass \blah */ @@ -8477,13 +8713,6 @@ parseit: range = 0; /* this range (if it was one) is done now */ } - if (need_class) { - ANYOF_FLAGS(ret) |= ANYOF_LARGE; - if (SIZE_ONLY) - RExC_size += ANYOF_CLASS_ADD_SKIP; - else - RExC_emit += ANYOF_CLASS_ADD_SKIP; - } if (SIZE_ONLY) @@ -8587,7 +8816,7 @@ S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) /* nextchar() - Advance that parse position, and optionall absorbs + Advances the parse position, and optionally absorbs "whitespace" from the inputstream. Without /x "whitespace" means (?#...) style comments only, @@ -8969,31 +9198,6 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, #endif /* - - regcurly - a little FSA that accepts {\d+,?\d*} - */ -#ifndef PERL_IN_XSUB_RE -I32 -Perl_regcurly(register const char *s) -{ - PERL_ARGS_ASSERT_REGCURLY; - - if (*s++ != '{') - return FALSE; - if (!isDIGIT(*s)) - return FALSE; - while (isDIGIT(*s)) - s++; - if (*s == ',') - s++; - while (isDIGIT(*s)) - s++; - if (*s != '}') - return FALSE; - return TRUE; -} -#endif - -/* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ #ifdef DEBUGGING @@ -9441,7 +9645,7 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) handles refcounting and freeing the perl core regexp structure. When it is necessary to actually free the structure the first thing it - does is call the 'free' method of the regexp_engine associated to to + does is call the 'free' method of the regexp_engine associated to the regexp, allowing the handling of the void *pprivate; member first. (This routine is not overridable by extensions, which is why the extensions free is called first.) @@ -9554,7 +9758,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) Free the private data in a regexp. This is overloadable by extensions. Perl takes care of the regexp structure in pregfree(), - this covers the *pprivate pointer which technically perldoesnt + this covers the *pprivate pointer which technically perl doesn't know about, however of course we have to handle the regexp_internal structure when no extension is in use.