X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/2a53d3314d380af5ab5283758219417c6dfa36e9..89e1cabd813107cb11f001560294e7b2f902fe30:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 9f239ff..bba966b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -109,8 +109,10 @@ #define STATIC static #endif + typedef struct RExC_state_t { - U32 flags; /* are we folding, multilining? */ + U32 flags; /* RXf_* are we folding, multilining? */ + U32 pm_flags; /* PMf_* stuff from the calling PMOP */ char *precomp; /* uncompiled string. */ REGEXP *rx_sv; /* The SV that is the regexp. */ regexp *rx; /* perl core regexp structure */ @@ -131,7 +133,6 @@ typedef struct RExC_state_t { I32 nestroot; /* root parens we are in - used by accept */ I32 extralen; I32 seen_zerolen; - I32 seen_evals; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ @@ -149,10 +150,15 @@ typedef struct RExC_state_t { I32 in_lookbehind; I32 contains_locale; I32 override_recoding; + struct reg_code_block *code_blocks; /* positions of literal (?{}) + within pattern */ + int num_code_blocks; /* size of code_blocks[] */ + int code_index; /* next code_blocks[] slot */ #if ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ #ifdef DEBUGGING const char *lastparse; I32 lastnum; @@ -164,6 +170,7 @@ typedef struct RExC_state_t { } RExC_state_t; #define RExC_flags (pRExC_state->flags) +#define RExC_pm_flags (pRExC_state->pm_flags) #define RExC_precomp (pRExC_state->precomp) #define RExC_rx_sv (pRExC_state->rx_sv) #define RExC_rx (pRExC_state->rx) @@ -186,7 +193,6 @@ typedef struct RExC_state_t { #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_uni_semantics (pRExC_state->uni_semantics) #define RExC_orig_utf8 (pRExC_state->orig_utf8) @@ -214,7 +220,7 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single +/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single * character, and if utf8, must be invariant. Note that this is not the same * thing as REGNODE_SIMPLE */ #define SIMPLE 0x02 @@ -392,14 +398,18 @@ static const scan_data_t zero_scan_data = #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) -#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -1555,7 +1565,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ PerlIO_printf( Perl_debug_log, "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", (int)depth * 2 + 2, "", @@ -1597,9 +1607,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs */ for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); STRLEN foldlen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; STRLEN skiplen = 0; @@ -1609,9 +1619,18 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ if (OP(noper) == NOTHING) { - trie->minlen= 0; - continue; + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + trie->minlen= STR_LEN(noper); + } else { + trie->minlen= 0; + continue; + } } + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ @@ -1750,9 +1769,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ U8 *scan = (U8*)NULL; /* sanity init */ @@ -1761,6 +1780,15 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; STRLEN skiplen = 0; + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } + if (OP(noper) != NOTHING) { for ( ; uc < e ; uc += len ) { @@ -1948,9 +1976,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs for ( cur = first ; cur < last ; cur = regnext( cur ) ) { - regnode * const noper = NEXTOPER( cur ); + regnode *noper = NEXTOPER( cur ); const U8 *uc = (U8*)STRING( noper ); - const U8 * const e = uc + STR_LEN( noper ); + const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ @@ -1963,6 +1991,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs STRLEN skiplen = 0; U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; + if (OP(noper) == NOTHING) { + regnode *noper_next= regnext(noper); + if (noper_next != tail && OP(noper_next) == flags) { + noper = noper_next; + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } + } if ( OP(noper) != NOTHING ) { for ( ; uc < e ; uc += len ) { @@ -2598,9 +2634,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * problematic sequences. This delta is used by the caller to adjust the * min length of the match, and the delta between min and max, so that the * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences are not currently correctly handled by the trie code - * either, so it changes the joined node type to ops that are not handled - * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 2) These sequences require special handling by the trie code, so it + * changes the joined node type to ops for the trie's benefit, those new + * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -3237,7 +3273,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, tail ); PerlIO_printf( Perl_debug_log, "%*s%s%s\n", (int)depth * 2 + 2, "", @@ -3303,9 +3339,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; #endif - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) ); @@ -3319,8 +3357,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, PerlIO_printf( Perl_debug_log,"\t=> %s\t", SvPV_nolen_const(mysv)); } - PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n", - REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) ); + PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] + ); }); /* Is noper a trieable nodetype that can be merged with the @@ -3328,22 +3368,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_trietype && ( - /* XXX: Currently we cannot allow a NOTHING node to be the first element - * of a TRIEABLE sequence, Otherwise we will overwrite the regop following - * the NOTHING with the TRIE regop later on. This is because a NOTHING node - * is only one regnode wide, and a TRIE is two regnodes. An example of a - * problematic pattern is: "x" =~ /\A(?>(?:(?:)A|B|C?x))\z/ - * At a later point of time we can somewhat workaround this by handling - * NOTHING -> EXACT sequences as generated by /(?:)A|(?:)B/ type patterns, - * as we can effectively ignore the NOTHING regop in that case. - * This clause, which allows NOTHING to start a sequence is left commented - * out as a reference. - * - Yves - - ( noper_trietype == NOTHING) - || ( trietype == NOTHING ) - */ - ( noper_trietype == NOTHING && trietype ) + ( noper_trietype == NOTHING) + || ( trietype == NOTHING ) || ( trietype == noper_trietype ) ) #ifdef NOJUMPTRIE @@ -3355,15 +3381,32 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * Either we are the first node in a new trieable sequence, * in which case we do some bookkeeping, otherwise we update * the end pointer. */ - count++; if ( !first ) { first = cur; - trietype = noper_trietype; + if ( noper_trietype == NOTHING ) { +#if !defined(DEBUGGING) && !defined(NOJUMPTRIE) + regnode * const noper_next = regnext( noper ); + U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; +#endif + + if ( noper_next_trietype ) { + trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. We need at least two + * for a trie so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; + } } else { if ( trietype == NOTHING ) trietype = noper_trietype; last = cur; } + if (first) + count++; } /* end handle mergable triable node */ else { /* handle unmergable node - @@ -3399,31 +3442,53 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } /* end handle unmergable node */ } /* loop over branches */ - DEBUG_OPTIMISE_r({ + DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, mysv, cur); PerlIO_printf( Perl_debug_log, "%*s- %s (%d) \n", (int)depth * 2 + 2, "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); }); - if ( last && trietype != NOTHING ) { - /* the last branch of the sequence was part of a trie, - * so we have to construct it here outside of the loop - */ - made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); + if ( last ) { + if ( trietype != NOTHING ) { + /* the last branch of the sequence was part of a trie, + * so we have to construct it here outside of the loop + */ + made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 ); #ifdef TRIE_STUDY_OPT - if ( ((made == MADE_EXACT_TRIE && - startbranch == first) - || ( first_non_open == first )) && - depth==0 ) { - flags |= SCF_TRIE_RESTUDY; - if ( startbranch == first - && scan == tail ) - { - RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + if ( ((made == MADE_EXACT_TRIE && + startbranch == first) + || ( first_non_open == first )) && + depth==0 ) { + flags |= SCF_TRIE_RESTUDY; + if ( startbranch == first + && scan == tail ) + { + RExC_seen &=~REG_TOP_LEVEL_BRANCHES; + } } - } #endif + } else { + /* at this point we know whatever we have is a NOTHING sequence/branch + * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING + */ + if ( startbranch == first ) { + regnode *opt; + /* the entire thing is a NOTHING sequence, something like this: + * (?:|) So we can turn it into a plain NOTHING op. */ + DEBUG_TRIE_COMPILE_r({ + regprop(RExC_rx, mysv, cur); + PerlIO_printf( Perl_debug_log, + "%*s- %s (%d) \n", (int)depth * 2 + 2, + "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur)); + + }); + OP(startbranch)= NOTHING; + NEXT_OFF(startbranch)= tail - startbranch; + for ( opt= startbranch + 1; opt < tail ; opt++ ) + OP(opt)= OPTIMIZED; + } + } } /* end if ( last) */ } /* TRIE_MAXBUF is non zero */ @@ -4343,11 +4408,39 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); + SCAN_COMMIT(pRExC_state, data, minlenp); + } else if ( PL_regkind[OP(scan)] == BRANCHJ /* Lookbehind, or need to calculate parens/evals/stclass: */ && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { + if ( OP(scan) == UNLESSM && + scan->flags == 0 && + OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && + OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED + ) { + regnode *opt; + regnode *upto= regnext(scan); + DEBUG_PARSE_r({ + SV * const mysv_val=sv_newmortal(); + DEBUG_STUDYDATA("OPFAIL",data,depth); + + /*DEBUG_PARSE_MSG("opfail");*/ + regprop(RExC_rx, mysv_val, upto); + PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val), + (IV)REG_NODE_NUM(upto), + (IV)(upto - scan) + ); + }); + OP(scan) = OPFAIL; + NEXT_OFF(scan) = upto - scan; + for (opt= scan + 1; opt < upto ; opt++) + OP(opt) = OPTIMIZED; + scan= upto; + continue; + } if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { @@ -4512,8 +4605,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } } - - } #endif } @@ -4814,71 +4905,359 @@ Perl_reginitcolors(pTHX) #endif /* - - pregcomp - compile a regular expression into internal code - * - * We can't allocate space until we know how big the compiled form will be, - * but we can't compile it (and thus know how big it is) until we've got a - * place to put the code. So we cheat: we compile it twice, once with code - * generation turned off and size counting turned on, and once "for real". - * This also means that we don't allocate space until we are sure that the - * thing really will compile successfully, and we never have to move the - * code and thus invalidate pointers into it. (Note that it has to be in - * one piece because free() must be able to free it all.) [NB: not true in perl] + * pregcomp - compile a regular expression into internal code * - * Beware that the optimization-preparation code in here knows about some - * of the structure of the compiled regexp. [I'll say.] + * Decides which engine's compiler to call based on the hint currently in + * scope */ +#ifndef PERL_IN_XSUB_RE +/* return the currently in-scope regex engine (or the default if none) */ + +regexp_engine const * +Perl_current_re_engine(pTHX) +{ + dVAR; + + if (IN_PERL_COMPILETIME) { + HV * const table = GvHV(PL_hintgv); + SV **ptr; + + if (!table) + return &PL_core_reg_engine; + ptr = hv_fetchs(table, "regcomp", FALSE); + if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(*ptr)); + } + else { + SV *ptr; + if (!PL_curcop->cop_hints_hash) + return &PL_core_reg_engine; + ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0); + if ( !(ptr && SvIOK(ptr) && SvIV(ptr))) + return &PL_core_reg_engine; + return INT2PTR(regexp_engine*,SvIV(ptr)); + } +} -#ifndef PERL_IN_XSUB_RE -#define RE_ENGINE_PTR &PL_core_reg_engine -#else -extern const struct regexp_engine my_reg_engine; -#define RE_ENGINE_PTR &my_reg_engine -#endif -#ifndef PERL_IN_XSUB_RE REGEXP * Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) { dVAR; - HV * const table = GvHV(PL_hintgv); + regexp_engine const *eng = current_re_engine(); + GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_PREGCOMP; - /* Dispatch a request to compile a regexp to correct - regexp engine. */ - if (table) { - SV **ptr= hv_fetchs(table, "regcomp", FALSE); - GET_RE_DEBUG_FLAGS_DECL; - if (ptr && SvIOK(*ptr) && SvIV(*ptr)) { - const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr)); - DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", - SvIV(*ptr)); - }); - return CALLREGCOMP_ENG(eng, pattern, flags); - } - } - return Perl_re_compile(aTHX_ pattern, flags); + /* Dispatch a request to compile a regexp to correct regexp engine. */ + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + PTR2UV(eng)); + }); + return CALLREGCOMP_ENG(eng, pattern, flags); } #endif +/* public(ish) wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs */ + +REGEXP * +Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) +{ + SV *pat = pattern; /* defeat constness! */ + PERL_ARGS_ASSERT_RE_COMPILE; + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), + NULL, NULL, rx_flags, 0); +} + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, + U32 pm_flags, char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + /* avoid infinitely recursing when we recompile the pattern parcelled up + * as qr'...'. A single constant qr// string can't have have any + * run-time component in it, and thus, no runtime code. (A non-qr + * string, however, can, e.g. $x =~ '(?{})') */ + if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST) + return 0; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && pat[s+1] == '?' && + (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = ' '; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* this causes the toker to collapse \\ into \ when parsing + * qr''; normally only q'' does this. It also alters hints + * handling */ + PL_reg_state.re_reparsing = TRUE; + eval_sv(sv, G_SCALAR); + SvREFCNT_dec(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + if (SvTRUE(ERRSV)) + Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + return 1; + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec(qr); + return 1; +} + + +/* + * Perl_re_op_compile - the perl internal RE engine's function to compile a + * regular expression into internal code. + * The pattern may be passed either as: + * a list of SVs (patternp plus pat_count) + * a list of OPs (expr) + * If both are passed, the SV list is used, but the OP list indicates + * which SVs are actually pre-compiled code blocks + * + * The SVs in the list have magic and qr overloading applied to them (and + * the list may be modified in-place with replacement SVs in the latter + * case). + * + * If the pattern hasn't changed from old_re, then old_re will be + * returned. + * + * eng is the current engine. If that engine has an op_comp method, then + * handle directly (i.e. we assume that op_comp was us); otherwise, just + * do the initial concatenation of arguments and pass on to the external + * engine. + * + * If is_bare_re is not null, set it to a boolean indicating whether the + * arg list reduced (after overloading) to a single bare regex which has + * been returned (i.e. /$qr/). + * + * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. + * + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. + * + * We can't allocate space until we know how big the compiled form will be, + * but we can't compile it (and thus know how big it is) until we've got a + * place to put the code. So we cheat: we compile it twice, once with code + * generation turned off and size counting turned on, and once "for real". + * This also means that we don't allocate space until we are sure that the + * thing really will compile successfully, and we never have to move the + * code and thus invalidate pointers into it. (Note that it has to be in + * one piece because free() must be able to free it all.) [NB: not true in perl] + * + * Beware that the optimization-preparation code in here knows about some + * of the structure of the compiled regexp. [I'll say.] + */ + REGEXP * -Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) +Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, + OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) { dVAR; REGEXP *rx; struct regexp *r; register regexp_internal *ri; STRLEN plen; - char* VOL exp; + char * VOL exp; char* xend; regnode *scan; I32 flags; I32 minlen = 0; - U32 pm_flags; + U32 rx_flags; + SV * VOL pat; /* these are all flags - maybe they should be turned * into a single int with different bit masks */ @@ -4886,8 +5265,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) I32 sawplus = 0; I32 sawopen = 0; bool used_setjump = FALSE; - regex_charset initial_charset = get_regex_charset(orig_pm_flags); - + regex_charset initial_charset = get_regex_charset(orig_rx_flags); + bool code_is_utf8 = 0; + bool VOL recompile = 0; + bool runtime_code = 0; U8 jump_ret = 0; dJMPENV; scan_data_t data; @@ -4899,7 +5280,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) #endif GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_RE_COMPILE; + PERL_ARGS_ASSERT_RE_OP_COMPILE; DEBUG_r(if (!PL_colorset) reginitcolors()); @@ -4930,9 +5311,6 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); - PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); - PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); @@ -4961,16 +5339,255 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) } #endif - exp = SvPV(pattern, plen); + pRExC_state->code_blocks = NULL; + pRExC_state->num_code_blocks = 0; + + if (is_bare_re) + *is_bare_re = FALSE; + + if (expr && (expr->op_type == OP_LIST || + (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { + + /* is the source UTF8, and how many code blocks are there? */ + OP *o; + int ncode = 0; + + for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv)) + code_is_utf8 = 1; + else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) + /* count of DO blocks */ + ncode++; + } + if (ncode) { + pRExC_state->num_code_blocks = ncode; + Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); + } + } + + if (pat_count) { + /* handle a list of SVs */ + + SV **svp; - if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */ - RExC_utf8 = RExC_orig_utf8 = 0; + /* apply magic and RE overloading to each arg */ + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *rx = *svp; + SvGETMAGIC(rx); + if (SvROK(rx) && SvAMAGIC(rx)) { + SV *sv = AMG_CALLunary(rx, regexp_amg); + if (sv) { + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) != SVt_REGEXP) + Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP"); + *svp = sv; + } + } + } + + if (pat_count > 1) { + /* concat multiple args and find any code block indexes */ + + OP *o = NULL; + int n = 0; + bool utf8 = 0; + STRLEN orig_patlen = 0; + + if (pRExC_state->num_code_blocks) { + o = cLISTOPx(expr)->op_first; + assert(o->op_type == OP_PUSHMARK); + o = o->op_sibling; + } + + pat = newSVpvn("", 0); + SAVEFREESV(pat); + + /* determine if the pattern is going to be utf8 (needed + * in advance to align code block indices correctly). + * XXX This could fail to be detected for an arg with + * overloading but not concat overloading; but the main effect + * in this obscure case is to need a 'use re eval' for a + * literal code block */ + for (svp = patternp; svp < patternp + pat_count; svp++) { + if (SvUTF8(*svp)) + utf8 = 1; + } + if (utf8) + SvUTF8_on(pat); + + for (svp = patternp; svp < patternp + pat_count; svp++) { + SV *sv, *msv = *svp; + SV *rx; + bool code = 0; + if (o) { + if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(n < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[n].start = SvCUR(pat); + pRExC_state->code_blocks[n].block = o; + pRExC_state->code_blocks[n].src_regex = NULL; + n++; + code = 1; + o = o->op_sibling; /* skip CONST */ + assert(o); + } + o = o->op_sibling;; + } + + if ((SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + rx = NULL; + + } + else { + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv) + { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + orig_patlen = SvCUR(pat); + sv_catsv_nomg(pat, msv); + rx = msv; + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + + /* extract any code blocks within any embedded qr//'s */ + if (rx && SvTYPE(rx) == SVt_REGEXP + && RX_ENGINE((REGEXP*)rx)->op_comp) + { + + RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); + if (ri->num_code_blocks) { + int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + recompile = 1; + Renew(pRExC_state->code_blocks, + pRExC_state->num_code_blocks + ri->num_code_blocks, + struct reg_code_block); + pRExC_state->num_code_blocks += ri->num_code_blocks; + for (i=0; i < ri->num_code_blocks; i++) { + struct reg_code_block *src, *dst; + STRLEN offset = orig_patlen + + ((struct regexp *)SvANY(rx))->pre_prefix; + assert(n < pRExC_state->num_code_blocks); + src = &ri->code_blocks[i]; + dst = &pRExC_state->code_blocks[n]; + dst->start = src->start + offset; + dst->end = src->end + offset; + dst->block = src->block; + dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*) + src->src_regex + ? src->src_regex + : (REGEXP*)rx); + n++; + } + } + } + } + SvSETMAGIC(pat); + } + else { + SV *sv; + pat = *patternp; + while (SvAMAGIC(pat) + && (sv = AMG_CALLunary(pat, string_amg)) + && sv != pat) + { + pat = sv; + SvGETMAGIC(pat); + } + } + + /* handle bare regex: foo =~ $re */ + { + SV *re = pat; + if (SvROK(re)) + re = SvRV(re); + if (SvTYPE(re) == SVt_REGEXP) { + if (is_bare_re) + *is_bare_re = TRUE; + SvREFCNT_inc(re); + Safefree(pRExC_state->code_blocks); + return (REGEXP*)re; + } + } } else { - RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern); + /* not a list of SVs, so must be a list of OPs */ + assert(expr); + if (expr->op_type == OP_LIST) { + int i = -1; + bool is_code = 0; + OP *o; + + pat = newSVpvn("", 0); + SAVEFREESV(pat); + if (code_is_utf8) + SvUTF8_on(pat); + + /* given a list of CONSTs and DO blocks in expr, append all + * the CONSTs to pat, and record the start and end of each + * code block in code_blocks[] (each DO{} op is followed by an + * OP_CONST containing the corresponding literal '(?{...}) + * text) + */ + for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) { + if (o->op_type == OP_CONST) { + sv_catsv(pat, cSVOPo_sv); + if (is_code) { + pRExC_state->code_blocks[i].end = SvCUR(pat)-1; + is_code = 0; + } + } + else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) { + assert(i+1 < pRExC_state->num_code_blocks); + pRExC_state->code_blocks[++i].start = SvCUR(pat); + pRExC_state->code_blocks[i].block = o; + pRExC_state->code_blocks[i].src_regex = NULL; + is_code = 1; + } + } + } + else { + assert(expr->op_type == OP_CONST); + pat = cSVOPx_sv(expr); + } + } + + exp = SvPV_nomg(pat, plen); + + if (!eng->op_comp) { + if ((SvUTF8(pat) && IN_BYTES) + || SvGMAGICAL(pat) || SvAMAGIC(pat)) + { + /* make a temporary copy; either to convert to bytes, + * or to avoid repeating get-magic / overloaded stringify */ + pat = newSVpvn_flags(exp, plen, SVs_TEMP | + (IN_BYTES ? 0 : SvUTF8(pat))); + } + Safefree(pRExC_state->code_blocks); + return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); } + + /* ignore the utf8ness if the pattern is 0 length */ + RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + pRExC_state->runtime_code_qr = NULL; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -4991,7 +5608,10 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) }); } else { /* longjumped back */ - STRLEN len = plen; + U8 *src, *dst; + int n=0; + STRLEN s = 0, d = 0; + bool do_end = 0; /* If the cause for the longjmp was other than changing to utf8, pop * our own setjmp, and longjmp to the correct handler */ @@ -5012,19 +5632,81 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) -- dmq */ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); - exp = (char*)Perl_bytes_to_utf8(aTHX_ - (U8*)SvPV_nomg(pattern, plen), - &len); - xend = exp + len; - RExC_orig_utf8 = RExC_utf8 = 1; - SAVEFREEPV(exp); + + /* upgrade pattern to UTF8, and if there are code blocks, + * recalculate the indices. + * This is essentially an unrolled Perl_bytes_to_utf8() */ + + src = (U8*)SvPV_nomg(pat, plen); + Newx(dst, plen * 2 + 1, U8); + + while (s < plen) { + const UV uv = NATIVE_TO_ASCII(src[s]); + if (UNI_IS_INVARIANT(uv)) + dst[d] = (U8)UTF_TO_NATIVE(uv); + else { + dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv); + dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv); + } + if (n < pRExC_state->num_code_blocks) { + if (!do_end && pRExC_state->code_blocks[n].start == s) { + pRExC_state->code_blocks[n].start = d; + assert(dst[d] == '('); + do_end = 1; + } + else if (do_end && pRExC_state->code_blocks[n].end == s) { + pRExC_state->code_blocks[n].end = d; + assert(dst[d] == ')'); + do_end = 0; + n++; + } + } + s++; + d++; + } + dst[d] = '\0'; + plen = d; + exp = (char*) dst; + xend = exp + plen; + SAVEFREEPV(exp); + RExC_orig_utf8 = RExC_utf8 = 1; + } + + /* return old regex if pattern hasn't changed */ + + if ( old_re + && !recompile + && !!RX_UTF8(old_re) == !!RExC_utf8 + && RX_PRECOMP(old_re) + && RX_PRELEN(old_re) == plen + && memEQ(RX_PRECOMP(old_re), exp, plen)) + { + /* with runtime code, always recompile */ + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + if (!runtime_code) { + ReREFCNT_inc(old_re); + if (used_setjump) { + JMPENV_POP; + } + Safefree(pRExC_state->code_blocks); + return old_re; + } } + else if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME + && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); #ifdef TRIE_STUDY_OPT restudied = 0; #endif - pm_flags = orig_pm_flags; + rx_flags = orig_rx_flags; if (initial_charset == REGEX_LOCALE_CHARSET) { RExC_contains_locale = 1; @@ -5033,17 +5715,30 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* Set to use unicode semantics if the pattern is in utf8 and has the * 'depends' charset specified, as it means unicode when utf8 */ - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } RExC_precomp = exp; - RExC_flags = pm_flags; + RExC_flags = rx_flags; + RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (PL_tainting && PL_tainted) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + JMPENV_JUMP(UTF8_LONGJMP); + } + } + assert(!pRExC_state->runtime_code_qr); + RExC_sawback = 0; RExC_seen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_seen_evals = 0; RExC_extralen = 0; RExC_override_recoding = 0; @@ -5066,6 +5761,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) #endif RExC_recurse = NULL; RExC_recurse_count = 0; + pRExC_state->code_index = 0; #if 0 /* REGC() is (currently) a NOP at the first pass. * Clever compilers notice this and complain. --jhi */ @@ -5078,6 +5774,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) ); if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; + Safefree(pRExC_state->code_blocks); return(NULL); } @@ -5097,9 +5794,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* The first pass could have found things that force Unicode semantics */ if ((RExC_utf8 || RExC_uni_semantics) - && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) + && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET) { - set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET); + set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET); } /* Small enough for pointer-storage convention? @@ -5130,8 +5827,15 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) /* non-zero initialization begins here */ RXi_SET( r, ri ); - r->engine= RE_ENGINE_PTR; - r->extflags = pm_flags; + r->engine= eng; + r->extflags = rx_flags; + if (pm_flags & PMf_IS_QR) { + ri->code_blocks = pRExC_state->code_blocks; + ri->num_code_blocks = pRExC_state->num_code_blocks; + } + else + SAVEFREEPV(pRExC_state->code_blocks); + { bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET); @@ -5163,7 +5867,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */ SvPOK_on(rx); - SvFLAGS(rx) |= SvUTF8(pattern); + if (RExC_utf8) + SvFLAGS(rx) |= SVf_UTF8; *p++='('; *p++='?'; /* If a default, cover it using the caret */ @@ -5223,7 +5928,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm_flags; /* don't let top level (?i) bleed */ + RExC_flags = rx_flags; /* don't let top level (?i) bleed */ + RExC_pm_flags = pm_flags; RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -5231,9 +5937,8 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags) RExC_emit_start = ri->program; RExC_emit = ri->program; RExC_emit_bound = ri->program + RExC_size + 1; + pRExC_state->code_index = 0; - /* Store the count of eval-groups for security checks: */ - RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); @@ -5386,7 +6091,7 @@ reStudy: else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) + !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -5399,7 +6104,7 @@ reStudy: goto again; } if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) - && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -5672,7 +6377,7 @@ reStudy: r->extflags |= RXf_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) r->extflags |= RXf_LOOKBEHIND_SEEN; - if (RExC_seen & REG_SEEN_EVAL) + if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; @@ -5680,6 +6385,8 @@ reStudy: r->intflags |= PREGf_VERBARG_SEEN; if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; if (RExC_paren_names) RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else @@ -5750,8 +6457,6 @@ reStudy: return rx; } -#undef RE_ENGINE_PTR - SV* Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, @@ -6169,8 +6874,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) do { RExC_parse++; } while (isALNUM(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + vFAIL("Group name must start with a non-digit word character"); } - if ( flags ) { SV* sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), @@ -6194,7 +6901,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", (unsigned long) flags); } - /* NOT REACHED */ + assert(0); /* NOT REACHED */ } return NULL; } @@ -6727,7 +7434,6 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } - void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { @@ -7389,6 +8095,36 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) return TRUE; } +PERL_STATIC_INLINE UV +S_invlist_highest(pTHX_ SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + #ifndef PERL_IN_XSUB_RE SV * Perl__invlist_contents(pTHX_ SV* const invlist) @@ -7651,7 +8387,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -7808,7 +8544,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } goto gen_recurse_regop; - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '+': if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse++; @@ -7878,7 +8614,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '?': /* (??...) */ is_logical = 1; @@ -7892,67 +8628,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH */ case '{': /* (?{...}) */ { - I32 count = 1; U32 n = 0; - char c; - char *s = RExC_parse; + struct reg_code_block *cb; RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_EVAL; - while (count && (c = *RExC_parse)) { - if (c == '\\') { - if (RExC_parse[1]) - RExC_parse++; - } - else if (c == '{') - count++; - else if (c == '}') - count--; - RExC_parse++; - } - if (*RExC_parse != ')') { - RExC_parse = s; - vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); + + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + - RExC_start) + ) { + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); } + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; if (!SIZE_ONLY) { - PAD *pad; - OP_4tree *sop, *rop; - SV * const sv = newSVpvn(s, RExC_parse - 1 - s); - - ENTER; - Perl_save_re_context(aTHX); - rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); - sop->op_private |= OPpREFCOUNTED; - /* re_dup will OpREFCNT_inc */ - OpREFCNT_set(sop, 1); - LEAVE; - - n = add_data(pRExC_state, 3, "nop"); - RExC_rxi->data->data[n] = (void*)rop; - RExC_rxi->data->data[n+1] = (void*)sop; - RExC_rxi->data->data[n+2] = (void*)pad; - SvREFCNT_dec(sv); - } - else { /* First pass */ - if (PL_reginterp_cnt < ++RExC_seen_evals - && IN_PERL_RUNTIME) - /* No compiled RE interpolated, has runtime - components ===> unsafe. */ - FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainting && PL_tainted) - FAIL("Eval-group in insecure regular expression"); -#if PERL_VERSION > 8 - if (IN_PERL_COMPILETIME) - PL_cv_has_eval = 1; -#endif + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, 2, "rl"); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; + } + else { + n = add_data(pRExC_state, 1, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + RExC_rxi->data->data[n] = (void*)o; + } } - + pRExC_state->code_index++; nextchar(pRExC_state); + if (is_logical) { + regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - if (!SIZE_ONLY) + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { ret->flags = 2; - REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } @@ -8382,9 +9102,24 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; } + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("lsbr"); + regprop(RExC_rx, mysv_val1, lastbr); + regprop(RExC_rx, mysv_val2, ender); + PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(lastbr), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - lastbr) + ); + }); REGTAIL(pRExC_state, lastbr, ender); if (have_branch && !SIZE_ONLY) { + char is_nothing= 1; if (depth==1) RExC_seen |= REG_TOP_LEVEL_BRANCHES; @@ -8393,11 +9128,44 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender); + if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender) + is_nothing= 0; } else if (op == BRANCHJ) { REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender); + /* for now we always disable this optimisation * / + if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender) + */ + is_nothing= 0; } } + if (is_nothing) { + br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret; + DEBUG_PARSE_r(if (!SIZE_ONLY) { + SV * const mysv_val1=sv_newmortal(); + SV * const mysv_val2=sv_newmortal(); + DEBUG_PARSE_MSG("NADA"); + regprop(RExC_rx, mysv_val1, ret); + regprop(RExC_rx, mysv_val2, ender); + PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + SvPV_nolen_const(mysv_val1), + (IV)REG_NODE_NUM(ret), + SvPV_nolen_const(mysv_val2), + (IV)REG_NODE_NUM(ender), + (IV)(ender - ret) + ); + }); + OP(br)= NOTHING; + if (OP(ender) == TAIL) { + NEXT_OFF(br)= 0; + RExC_emit= br + 1; + } else { + regnode *opt; + for ( opt= br + 1; opt < ender ; opt++ ) + OP(opt)= OPTIMIZED; + NEXT_OFF(br)= ender - br; + } + } } } @@ -8434,7 +9202,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else FAIL("Junk on end of regexp"); /* "Can't happen". */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } if (RExC_in_lookbehind) { @@ -8967,6 +9735,81 @@ S_reg_recode(pTHX_ const char value, SV **encp) return uv; } +PERL_STATIC_INLINE U8 +S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point) +{ + /* This knows the details about sizing an EXACTish node, and potentially + * populating it with a single character. If is non-zero, it assumes + * that the node has already been populated, and just does the sizing, + * ignoring . Otherwise it looks at and + * calculates what should be. In pass 1, it sizes the node + * appropriately. In pass 2, it additionally will populate the node's + * STRING with , if is 0. + * + * It knows that under FOLD, UTF characters and the Latin Sharp S must be + * folded (the latter only when the rules indicate it can match 'ss') */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + if (! len_passed_in) { + if (UTF) { + if (FOLD) { + to_uni_fold(NATIVE_TO_UNI(code_point), character, &len); + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } + else if (! FOLD + || code_point != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED + || ! AT_LEAST_UNI_SEMANTICS) + { + *character = (U8) code_point; + len = 1; + } + else { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } +} /* - regatom - the lowest level @@ -8983,8 +9826,46 @@ S_reg_recode(pTHX_ const char value, SV **encp) sequence, we return. Note: we have to be careful with escapes, as they can be both literal - and special, and in the case of \10 and friends can either, depending - on context. Specifically there are two separate switches for handling + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling escape sequences, with the one for handling literal escapes requiring a dummy entry for all of the special escapes that are actually handled by the other. @@ -9138,43 +10019,17 @@ tryagain: *flagp |= HASWIDTH; goto finish_meta_pat; case 'w': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = ALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = ALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = ALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = ALNUM; - break; - default: - goto bad_charset; + op = ALNUM + get_regex_charset(RExC_flags); + if (op > ALNUMA) { /* /aa is same as /a */ + op = ALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'W': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = NALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = NALNUM; - break; - default: - goto bad_charset; + op = NALNUM + get_regex_charset(RExC_flags); + if (op > NALNUMA) { /* /aa is same as /a */ + op = NALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -9182,22 +10037,9 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = BOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = BOUNDU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = BOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = BOUND; - break; - default: - goto bad_charset; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); @@ -9206,103 +10048,45 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NBOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = NBOUNDU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NBOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = NBOUND; - break; - default: - goto bad_charset; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; case 's': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = SPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = SPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = SPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = SPACE; - break; - default: - goto bad_charset; + op = SPACE + get_regex_charset(RExC_flags); + if (op > SPACEA) { /* /aa is same as /a */ + op = SPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'S': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NSPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = NSPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NSPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = NSPACE; - break; - default: - goto bad_charset; - } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; - case 'd': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = DIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = DIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = DIGIT; - break; - default: - goto bad_charset; + op = NSPACE + get_regex_charset(RExC_flags); + if (op > NSPACEA) { /* /aa is same as /a */ + op = NSPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'D': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NDIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NDIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = NDIGIT; - break; - default: - goto bad_charset; + op = NDIGIT; + goto join_D_and_d; + case 'd': + op = DIGIT; + join_D_and_d: + { + U8 offset = get_regex_charset(RExC_flags); + if (offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; + } + else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + op += offset; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -9403,7 +10187,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -9453,6 +10237,7 @@ tryagain: vFAIL("Reference to nonexistent or unclosed group"); } if (!isg && num > 9 && num >= RExC_npar) + /* Probably a character specified in octal, e.g. \35 */ goto defchar; else { char * const parse_start = RExC_parse - 1; /* MJD */ @@ -9473,7 +10258,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? REFFA : (AT_LEAST_UNI_SEMANTICS) ? REFFU @@ -9530,14 +10315,7 @@ tryagain: bool is_exactfu_sharp_s; ender = 0; - node_type = ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF); + node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); s = STRING(ret); @@ -9667,38 +10445,39 @@ tryagain: break; } case 'x': - if (*++p == '{') { - char* const e = strchr(p, '}'); + { + STRLEN brace_len = len; + UV result; + const char* error_msg; - if (!e) { - RExC_parse = p + 1; - vFAIL("Missing right brace on \\x{}"); + bool valid = grok_bslash_x(p, + &result, + &brace_len, + &error_msg, + 1); + p += brace_len; + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); } else { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = e - p - 1; - ender = grok_hex(p + 1, &numlen, &flags, NULL); - if (ender > 0xff) - REQUIRE_UTF8; - p = e + 1; + ender = result; } + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = 2; - ender = grok_hex(p, &numlen, &flags, NULL); - p += numlen; - } - if (PL_encoding && ender < 0x100) - goto recode_encoding; - break; case 'c': p++; ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); break; case '0': case '1': case '2': case '3':case '4': - case '5': case '6': case '7': case '8':case '9': + case '5': case '6': case '7': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) { @@ -9731,7 +10510,7 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p)) { + if (!SIZE_ONLY&& isALNUMC(*p)) { ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p); } goto normal_default; @@ -9787,7 +10566,7 @@ tryagain: *tmpbuf = (U8) ender; foldlen = 1; } - else if (! MORE_ASCII_RESTRICTED && ! LOC) { + else if (! ASCII_FOLD_RESTRICTED && ! LOC) { /* Locale and /aa require more selectivity about the * fold, so are handled below. Otherwise, here, just @@ -9919,22 +10698,12 @@ tryagain: if (len == 1 && UNI_IS_INVARIANT(ender)) *flagp |= SIMPLE; - if (SIZE_ONLY) - RExC_size += STR_SZ(len); - else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } + alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0); } break; } return(ret); - -/* Jumped to when an unrecognized character set is encountered */ -bad_charset: - Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags)); - return(NULL); } STATIC char * @@ -10009,7 +10778,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; + namedclass = ANYOF_ALNUM; break; case 5: /* Names all of length 5. */ @@ -10019,57 +10788,63 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (posixcc[4]) { case 'a': if (memEQ(posixcc, "alph", 4)) /* alpha */ - namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; + namedclass = ANYOF_ALPHA; break; case 'e': if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + namedclass = ANYOF_PSXSPC; break; case 'h': if (memEQ(posixcc, "grap", 4)) /* graph */ - namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; + namedclass = ANYOF_GRAPH; break; case 'i': if (memEQ(posixcc, "asci", 4)) /* ascii */ - namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; + namedclass = ANYOF_ASCII; break; case 'k': if (memEQ(posixcc, "blan", 4)) /* blank */ - namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; + namedclass = ANYOF_BLANK; break; case 'l': if (memEQ(posixcc, "cntr", 4)) /* cntrl */ - namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; + namedclass = ANYOF_CNTRL; break; case 'm': if (memEQ(posixcc, "alnu", 4)) /* alnum */ - namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; + namedclass = ANYOF_ALNUMC; break; case 'r': if (memEQ(posixcc, "lowe", 4)) /* lower */ - namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; + namedclass = ANYOF_LOWER; else if (memEQ(posixcc, "uppe", 4)) /* upper */ - namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; + namedclass = ANYOF_UPPER; break; case 't': if (memEQ(posixcc, "digi", 4)) /* digit */ - namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; + namedclass = ANYOF_DIGIT; else if (memEQ(posixcc, "prin", 4)) /* print */ - namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; + namedclass = ANYOF_PRINT; else if (memEQ(posixcc, "punc", 4)) /* punct */ - namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; + namedclass = ANYOF_PUNCT; break; } break; case 6: if (memEQ(posixcc, "xdigit", 6)) - namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; + namedclass = ANYOF_XDIGIT; break; } if (namedclass == OOB_NAMEDCLASS) Simple_vFAIL3("POSIX class [:%.*s:] unknown", t - s - 1, s + 1); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } assert (posixcc[skip] == ':'); assert (posixcc[skip+1] == ']'); } else if (!SIZE_ONLY) { @@ -10200,7 +10975,7 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) * determined at run-time * run_time_list is a SV* that contains text names of properties that are to * be computed at run time. This concatenates - * to it, apppropriately + * to it, appropriately * This is essentially DO_POSIX, but we know only the Latin1 values at compile * time */ #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ @@ -10226,16 +11001,18 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of - * this and DO_N_POSIX */ + * this and DO_N_POSIX. Sets only if it can; unchanged + * otherwise */ #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ - l1_sourcelist, Xpropertyname, run_time_list) \ + l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \ if (AT_LEAST_ASCII_RESTRICTED) { \ _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ } \ else { \ Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + matches_above_unicode = TRUE; \ if (LOC) { \ - ANYOF_CLASS_SET(node, namedclass); \ + ANYOF_CLASS_SET(node, namedclass); \ } \ else { \ SV* scratch_list = NULL; \ @@ -10253,171 +11030,6 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } \ } -STATIC U8 -S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) -{ - - /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. - * Locale folding is done at run-time, so this function should not be - * called for nodes that are for locales. - * - * This function sets the bit corresponding to the fold of the input - * 'value', if not already set. The fold of 'f' is 'F', and the fold of - * 'F' is 'f'. - * - * It also knows about the characters that are in the bitmap that have - * folds that are matchable only outside it, and sets the appropriate lists - * and flags. - * - * It returns the number of bits that actually changed from 0 to 1 */ - - U8 stored = 0; - U8 fold; - - PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD; - - fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value] - : PL_fold[value]; - - /* It assumes the bit for 'value' has already been set */ - if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) { - ANYOF_BITMAP_SET(node, fold); - stored++; - } - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) { - /* Certain Latin1 characters have matches outside the bitmap. To get - * here, 'value' is one of those characters. None of these matches is - * valid for ASCII characters under /aa, which have been excluded by - * the 'if' above. The matches fall into three categories: - * 1) They are singly folded-to or -from an above 255 character, as - * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y - * WITH DIAERESIS; - * 2) They are part of a multi-char fold with another character in the - * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill; - * 3) They are part of a multi-char fold with a character not in the - * bitmap, such as various ligatures. - * We aren't dealing fully with multi-char folds, except we do deal - * with the pattern containing a character that has a multi-char fold - * (not so much the inverse). - * For types 1) and 3), the matches only happen when the target string - * is utf8; that's not true for 2), and we set a flag for it. - * - * The code below adds to the passed in inversion list the single fold - * closures for 'value'. The values are hard-coded here so that an - * innocent-looking character class, like /[ks]/i won't have to go out - * to disk to find the possible matches. XXX It would be better to - * generate these via regen, in case a new version of the Unicode - * standard adds new mappings, though that is not really likely. */ - switch (value) { - case 'k': - case 'K': - /* KELVIN SIGN */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A); - break; - case 's': - case 'S': - /* LATIN SMALL LETTER LONG S */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F); - break; - case MICRO_SIGN: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - GREEK_SMALL_LETTER_MU); - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - GREEK_CAPITAL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - /* ANGSTROM SIGN */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B); - if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - PL_fold_latin1[value]); - } - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - LATIN_CAPITAL_LETTER_SHARP_S); - - /* Under /a, /d, and /u, this can match the two chars "ss" */ - if (! MORE_ASCII_RESTRICTED) { - add_alternate(alternate_ptr, (U8 *) "ss", 2); - - /* And under /u or /a, it can match even if the target is - * not utf8 */ - if (AT_LEAST_UNI_SEMANTICS) { - ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; - } - } - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character folds from code - * points that require UTF8 to express, so they can't match - * unless the target string is in UTF-8, so no action here is - * necessary, as regexec.c properly handles the general case - * for UTF-8 matching */ - break; - default: - /* Use deprecated warning to increase the chances of this - * being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value); - break; - } - } - else if (DEPENDS_SEMANTICS - && ! isASCII(value) - && PL_fold_latin1[value] != value) - { - /* Under DEPENDS rules, non-ASCII Latin1 characters match their - * folds only when the target string is in UTF-8. We add the fold - * here to the list of things to match outside the bitmap, which - * won't be looked at unless it is UTF8 (or else if something else - * says to look even if not utf8, but those things better not happen - * under DEPENDS semantics. */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]); - } - - return stored; -} - - -PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) -{ - /* This inline function sets a bit in the bitmap if not already set, and if - * appropriate, its fold, returning the number of bits that actually - * changed from 0 to 1 */ - - U8 stored; - - PERL_ARGS_ASSERT_SET_REGCLASS_BIT; - - if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */ - return 0; - } - - ANYOF_BITMAP_SET(node, value); - stored = 1; - - if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ - stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr); - } - - return stored; -} - STATIC void S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) { @@ -10436,6 +11048,11 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) return; } +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) + /* parse a class specification and produce either an ANYOF node that matches the pattern or perhaps will be optimized into an EXACTish node @@ -10448,12 +11065,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; register UV nextvalue; - register IV prevvalue = OOB_UNICODE; + register UV prevvalue = OOB_UNICODE; register IV range = 0; UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; - IV namedclass; + IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ @@ -10461,6 +11078,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like, [:word:], + extended beyond the Latin1 range */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ UV n; @@ -10475,37 +11094,35 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) /* Set if a component of this character class is user-defined; just passed * on to the engine */ - UV has_user_defined_property = 0; - - /* code points this node matches that can't be stored in the bitmap */ - SV* nonbitmap = NULL; - - /* The items that are to match that aren't stored in the bitmap, but are a - * result of things that are stored there. This is the fold closure of - * such a character, either because it has DEPENDS semantics and shouldn't - * be matched unless the target string is utf8, or is a code point that is - * too large for the bit map, as for example, the fold of the MICRO SIGN is - * above 255. This all is solely for performance reasons. By having this - * code know the outside-the-bitmap folds that the bitmapped characters are - * involved with, we don't have to go out to disk to find the list of - * matches, unless the character class includes code points that aren't - * storable in the bit map. That means that a character class with an 's' - * in it, for example, doesn't need to go out to disk to find everything - * that matches. A 2nd list is used so that the 'nonbitmap' list is kept - * empty unless there is something whose fold we don't know about, and will - * have to go out to the disk to find. */ - SV* l1_fold_invlist = NULL; + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* inversion list of code points this node matches. For much of the + * function, it includes only those that match regardless of the utf8ness + * of the target string */ + SV* cp_list = NULL; /* List of multi-character folds that are matched by this node */ AV* unicode_alternate = NULL; #ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; #endif UV stored = 0; /* how many chars stored in the bitmap */ + bool invert = FALSE; /* Is this class to be complemented */ + + /* Is there any thing like \W or [:^digit:] that matches above the legal + * Unicode range? */ + bool runtime_posix_matches_above_Unicode = FALSE; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; + const I32 orig_size = RExC_size; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -10526,8 +11143,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ RExC_naughty++; RExC_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; + invert = TRUE; /* We have decided to not allow multi-char folds in inverted character * classes, due to the confusion that can happen, especially with @@ -10549,7 +11165,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; } - ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); initial_listsv_len = SvCUR(listsv); } @@ -10652,6 +11267,7 @@ parseit: SV** invlistsvp; SV* invlist; char* name; + if (UCHARAT(RExC_parse) == '^') { RExC_parse++; n--; @@ -10706,11 +11322,15 @@ parseit: Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", (value == 'p' ? '+' : '!'), name); - has_user_defined_property = 1; + has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the * property could match something in the Latin1 range, - * hence something that isn't utf8 */ + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -10718,17 +11338,14 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - SV** user_defined_svp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "USER_DEFINED", FALSE); - if (user_defined_svp) { - has_user_defined_property - |= SvUV(*user_defined_svp); - } + has_user_defined_property = + _is_swash_user_defined(swash); /* Invert if asking for the complement */ if (value == 'P') { - _invlist_union_complement_2nd(properties, invlist, &properties); + _invlist_union_complement_2nd(properties, + invlist, + &properties); /* The swash can't be used as-is, because we've * inverted things; delay removing it to here after @@ -10775,22 +11392,18 @@ parseit: } break; case 'x': - if (*RExC_parse == '{') { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - char * const e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - - numlen = e - RExC_parse; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); - RExC_parse = e + 1; - } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - numlen = 2; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(RExC_parse, + &value, + &numlen, + &error_msg, + 1); RExC_parse += numlen; + if (! valid) { + vFAIL(error_msg); + } } if (PL_encoding && value < 0x100) goto recode_encoding; @@ -10834,15 +11447,20 @@ parseit: literal_endpoint++; #endif - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ - - /* What matches in a locale is not known until runtime, so need to - * (one time per class) allocate extra space to pass to regexec. - * The space will contain a bit for each named class that is to be - * matched against. This isn't needed for \p{} and pseudo-classes, - * as they are not affected by locale, and hence are dealt with - * separately */ - if (LOC && namedclass < ANYOF_MAX && ! need_class) { + /* What matches in a locale is not known until runtime. This + * includes what the Posix classes (like \w, [:space:]) match. + * Room must be reserved (one time per class) to store such + * classes, either if Perl is compiled so that locale nodes always + * should have this space, or if there is such class info to be + * stored. The space will contain a bit for each named class that + * is to be matched against. This isn't needed for \p{} and + * pseudo-classes, as they are not affected by locale, and hence + * are dealt with separately */ + if (LOC + && ! need_class + && (ANYOF_LOCALE == ANYOF_CLASS + || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) + { need_class = 1; if (SIZE_ONLY) { RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; @@ -10854,6 +11472,8 @@ parseit: ANYOF_FLAGS(ret) |= ANYOF_CLASS; } + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. * the 'a' in the examples */ @@ -10865,51 +11485,41 @@ parseit: ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); - - stored += - set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); - if (prevvalue < 256) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate); - } - else { - nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue); - } + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_list = add_cp_to_invlist(cp_list, prevvalue); } range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ } - if (!SIZE_ONLY) { - - /* Possible truncation here but in some 64-bit environments - * the compiler gets heartburn about switch on 64-bit values. - * A similar issue a little earlier when switching on value. - * --jhi */ + if (! SIZE_ONLY) { switch ((I32)namedclass) { case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); break; case ANYOF_NALNUMC: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_ALPHA: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); break; case ANYOF_NALPHA: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { - _invlist_union(properties, PL_ASCII, &properties); + _invlist_union(posixes, PL_ASCII, &posixes); } break; case ANYOF_NASCII: @@ -10917,59 +11527,61 @@ parseit: ANYOF_CLASS_SET(ret, namedclass); } else { - _invlist_union_complement_2nd(properties, - PL_ASCII, &properties); + _invlist_union_complement_2nd(posixes, + PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } } break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); break; case ANYOF_CNTRL: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixCntrl, PL_XPosixCntrl); break; case ANYOF_NCNTRL: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixCntrl, PL_XPosixCntrl); break; case ANYOF_DIGIT: /* There are no digits in the Latin1 range outside of * ASCII, so call the macro that doesn't have to resolve * them */ - DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes, PL_PosixDigit, "XPosixDigit", listsv); break; case ANYOF_NDIGIT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_GRAPH: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); break; case ANYOF_NGRAPH: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_HORIZWS: - /* For these, we use the nonbitmap, as /d doesn't make a + /* For these, we use the cp_list, as /d doesn't make a * difference in what these match. There would be problems * if these characters had folds other than themselves, as - * nonbitmap is subject to folding. It turns out that \h + * cp_list is subject to folding. It turns out that \h * is just a synonym for XPosixBlank */ - _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + _invlist_union(cp_list, PL_XPosixBlank, &cp_list); break; case ANYOF_NHORIZWS: - _invlist_union_complement_2nd(nonbitmap, - PL_XPosixBlank, &nonbitmap); + _invlist_union_complement_2nd(cp_list, + PL_XPosixBlank, &cp_list); break; case ANYOF_LOWER: case ANYOF_NLOWER: @@ -10992,45 +11604,48 @@ parseit: Xname = "XPosixLower"; } if (namedclass == ANYOF_LOWER) { - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, ascii_source, l1_source, Xname, listsv); } else { DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, - properties, ascii_source, l1_source, Xname, listsv); + posixes, ascii_source, l1_source, Xname, listsv, + runtime_posix_matches_above_Unicode); } break; } case ANYOF_PRINT: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); break; case ANYOF_NPRINT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_PUNCT: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); break; case ANYOF_NPUNCT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_PSXSPC: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixSpace, PL_XPosixSpace); break; case ANYOF_NPSXSPC: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixSpace, PL_XPosixSpace); break; case ANYOF_SPACE: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); break; case ANYOF_NSPACE: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); break; case ANYOF_UPPER: /* Same as LOWER, above */ @@ -11051,40 +11666,42 @@ parseit: Xname = "XPosixUpper"; } if (namedclass == ANYOF_UPPER) { - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, ascii_source, l1_source, Xname, listsv); } else { DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, - properties, ascii_source, l1_source, Xname, listsv); + posixes, ascii_source, l1_source, Xname, listsv, + runtime_posix_matches_above_Unicode); } break; } case ANYOF_ALNUM: /* Really is 'Word' */ - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); break; case ANYOF_NALNUM: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_VERTWS: - /* For these, we use the nonbitmap, as /d doesn't make a + /* For these, we use the cp_list, as /d doesn't make a * difference in what these match. There would be problems * if these characters had folds other than themselves, as - * nonbitmap is subject to folding */ - _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + * cp_list is subject to folding */ + _invlist_union(cp_list, PL_VertSpace, &cp_list); break; case ANYOF_NVERTWS: - _invlist_union_complement_2nd(nonbitmap, - PL_VertSpace, &nonbitmap); + _invlist_union_complement_2nd(cp_list, + PL_VertSpace, &cp_list); break; case ANYOF_XDIGIT: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; case ANYOF_NXDIGIT: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; case ANYOF_MAX: @@ -11095,19 +11712,19 @@ parseit: break; } - continue; + continue; /* Go get next character */ } } /* end of namedclass \blah */ if (range) { - if (prevvalue > (IV)value) /* b-a */ { + if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); range = 0; /* not a valid range */ } } else { - prevvalue = value; /* save the beginning of the range */ + prevvalue = value; /* save the beginning of the potential range */ if (RExC_parse+1 < RExC_end && *RExC_parse == '-' && RExC_parse[1] != ']') @@ -11124,135 +11741,416 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); } - if (!SIZE_ONLY) - stored += - set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); + if (!SIZE_ONLY) + cp_list = add_cp_to_invlist(cp_list, '-'); } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ } } + /* Here, is the beginning of the range, if any; or + * if not */ + /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { RExC_uni_semantics = 1; } - /* now is the next time */ + /* Ready to process either the single value, or the completed range */ if (!SIZE_ONLY) { - if (prevvalue < 256) { - const IV ceilvalue = value < 256 ? value : 255; - IV i; -#ifdef EBCDIC - /* In EBCDIC [\x89-\x91] should include - * the \x8e but [i-j] should not. */ - if (literal_endpoint == 2 && - ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || - (isUPPER(prevvalue) && isUPPER(ceilvalue)))) - { - if (isLOWER(prevvalue)) { - for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } else { - for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } - } - else -#endif - for (i = prevvalue; i <= ceilvalue; i++) { - stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } - if (value > 255) { - const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); - const UV natvalue = NATIVE_TO_UNI(value); - nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); - } -#ifdef EBCDIC - literal_endpoint = 0; +#ifndef EBCDIC + cp_list = _add_range_to_invlist(cp_list, prevvalue, value); +#else + UV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && (prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z')) + { + _invlist_intersection(this_range, PL_ASCII, &this_range, ); + _invlist_intersection(this_range, PL_Alpha, &this_range, ); + } + _invlist_union(cp_list, this_range, &cp_list); + literal_endpoint = 0; #endif } range = 0; /* this range (if it was one) is done now */ - } + } /* End of loop through all the text within the brackets */ + + /* If the character class contains only a single element, it may be + * optimizable into another node type which is smaller and runs faster. + * Check if this is the case for this class */ + if (element_count == 1) { + U8 op = END; + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or + [:digit:] or \p{foo} */ + + /* Certain named classes have equivalents that can appear outside a + * character class, e.g. \w, \H. We use these instead of a + * character class. */ + switch ((I32)namedclass) { + U8 offset; + + /* The first group is for node types that depend on the charset + * modifier to the regex. We first calculate the base node + * type, and if it should be inverted */ + + case ANYOF_NALNUM: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_ALNUM: + op = ALNUM; + goto join_charset_classes; + + case ANYOF_NSPACE: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_SPACE: + op = SPACE; + goto join_charset_classes; + + case ANYOF_NDIGIT: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_DIGIT: + op = DIGIT; + + join_charset_classes: + + /* Now that we have the base node type, we take advantage + * of the enum ordering of the charset modifiers to get the + * exact node type, For example the base SPACE also has + * SPACEL, SPACEU, and SPACEA */ + + offset = get_regex_charset(RExC_flags); + + /* /aa is the same as /a for these */ + if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */ + } + op += offset; + /* The number of varieties of each of these is the same, + * hence, so is the delta between the normal and + * complemented nodes */ + if (invert) { + op += NALNUM - ALNUM; + } + break; + + /* The second group doesn't depend of the charset modifiers. + * We just have normal and complemented */ + case ANYOF_NHORIZWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_HORIZWS: + op = (invert) ? NHORIZWS : HORIZWS; + break; + + case ANYOF_NVERTWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_VERTWS: + op = (invert) ? NVERTWS : VERTWS; + break; + + + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + op = (invert) ? NDIGITA : DIGITA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_CLASS_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(listsv); + return ret; + } + } if (SIZE_ONLY) return ret; /****** !SIZE_ONLY AFTER HERE *********/ - /* If folding and there are code points above 255, we calculate all - * characters that could fold to or from the ones already on the list */ - if (FOLD && nonbitmap) { + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (FOLD && cp_list) { UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; - /* This is a list of all the characters that participate in folds - * (except marks, etc in multi-char folds */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); - SvREFCNT_dec(swash); - } + /* In the Latin1 range, the characters that can be folded-to or -from + * are precisely the alphabetic characters. If the highest code point + * is within Latin1, we can use the compiled-in list, and not have to + * go out to disk. */ + if (invlist_highest(cp_list) < 256) { + _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection); + } + else { - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { + /* Here, there are non-Latin1 code points, so we will have to go + * fetch the list of all the characters that participate in folds + */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _swash_to_invlist(swash); + SvREFCNT_dec(swash); + } - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode rules - * hard-coded into Perl. (This case happens legitimately during - * compilation of Perl itself before the Unicode tables are - * generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - - /* This particular string is above \xff in both UTF-8 and - * UTFEBCDIC */ - to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); - } - } + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If we were unable to find any folds, then we likely won't be + * able to find the closures. So just create an empty list. + * Folding will effectively be restricted to the non-Unicode + * rules hard-coded into Perl. (This case happens legitimately + * during compilation of Perl itself before the Unicode tables + * are generated) */ + if (invlist_len(PL_utf8_foldable) == 0) { + PL_utf8_foldclosures = newHV(); + } + else { + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + + /* This particular string is above \xff in both UTF-8 + * and UTFEBCDIC */ + to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } + } - /* Only the characters in this class that participate in folds need be - * checked. Get the intersection of this class and all the possible - * characters that are foldable. This can quickly narrow down a large - * class */ - _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection); + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_list, + &fold_intersection); + } /* Now look at the foldable characters in this class individually */ invlist_iterinit(fold_intersection); while (invlist_iternext(fold_intersection, &start, &end)) { UV j; + /* Locale folding for Latin1 characters is deferred until runtime */ + if (LOC && start < 256) { + start = 256; + } + /* Look at every character in the range */ for (j = start; j <= end; j++) { - /* Get its fold */ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - const UV f = - _to_uni_fold_flags(j, foldbuf, &foldlen, - (allow_full_fold) ? FOLD_FLAGS_FULL : 0); + UV f; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so that + * an innocent-looking character class, like /[ks]/i won't + * have to go out to disk to find the possible matches. + * XXX It would be better to generate these via regen, in + * case a new version of the Unicode standard adds new + * mappings, though that is not really likely, and may be + * caught by the default: case of the switch below. */ + + if (PL_fold_latin1[j] != j) { + + /* ASCII is always matched; non-ASCII is matched only + * under Unicode rules */ + if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { + cp_list = + add_cp_to_invlist(cp_list, PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1, or are multi-character. To get here, 'j' is + * one of those characters. None of these matches is + * valid for ASCII characters under /aa, which is why + * the 'if' just above excludes those. The matches + * fall into three categories: + * 1) They are singly folded-to or -from an above 255 + * character, e.g., LATIN SMALL LETTER Y WITH + * DIAERESIS and LATIN CAPITAL LETTER Y WITH + * DIAERESIS; + * 2) They are part of a multi-char fold with another + * latin1 character; only LATIN SMALL LETTER + * SHARP S => "ss" fits this; + * 3) They are part of a multi-char fold with a + * character outside of Latin1, such as various + * ligatures. + * We aren't dealing fully with multi-char folds, except + * we do deal with the pattern containing a character + * that has a multi-char fold (not so much the inverse). + * For types 1) and 3), the matches only happen when the + * target string is utf8; that's not true for 2), and we + * set a flag for it. + * + * The code below adds the single fold closures for 'j' + * to the inversion list. */ + switch (j) { + case 'k': + case 'K': + cp_list = + add_cp_to_invlist(cp_list, KELVIN_SIGN); + break; + case 's': + case 'S': + cp_list = add_cp_to_invlist(cp_list, + LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + cp_list = add_cp_to_invlist(cp_list, + GREEK_SMALL_LETTER_MU); + cp_list = add_cp_to_invlist(cp_list, + GREEK_CAPITAL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + cp_list = + add_cp_to_invlist(cp_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + cp_list = add_cp_to_invlist(cp_list, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + cp_list = add_cp_to_invlist(cp_list, + LATIN_CAPITAL_LETTER_SHARP_S); + + /* Under /a, /d, and /u, this can match the two + * chars "ss" */ + if (! ASCII_FOLD_RESTRICTED) { + add_alternate(&unicode_alternate, + (U8 *) "ss", 2); + + /* And under /u or /a, it can match even if + * the target is not utf8 */ + if (AT_LEAST_UNI_SEMANTICS) { + ANYOF_FLAGS(ret) |= + ANYOF_NONBITMAP_NON_UTF8; + } + } + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 to + * express, so they can't match unless the + * target string is in UTF-8, so no action here + * is necessary, as regexec.c properly handles + * the general case for UTF-8 matching */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } + } + continue; + } + + /* Here is an above Latin1 character. We don't have the rules + * hard-coded for it. First, get its fold */ + f = _to_uni_fold_flags(j, foldbuf, &foldlen, + ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) + | ((LOC) + ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); if (foldlen > (STRLEN)UNISKIP(f)) { @@ -11268,54 +12166,25 @@ parseit: /* If any of the folded characters of this are in the * Latin1 range, tell the regex engine that this can - * match a non-utf8 target string. The only multi-byte - * fold whose source is in the Latin1 range (U+00DF) - * applies only when the target string is utf8, or - * under unicode rules */ - if (j > 255 || AT_LEAST_UNI_SEMANTICS) { - while (loc < e) { - - /* Can't mix ascii with non- under /aa */ - if (MORE_ASCII_RESTRICTED - && (isASCII(*loc) != isASCII(j))) - { - goto end_multi_fold; - } - if (UTF8_IS_INVARIANT(*loc) - || UTF8_IS_DOWNGRADEABLE_START(*loc)) - { - /* Can't mix above and below 256 under LOC - */ - if (LOC) { - goto end_multi_fold; - } - ANYOF_FLAGS(ret) - |= ANYOF_NONBITMAP_NON_UTF8; - break; - } - loc += UTF8SKIP(loc); - } - } + * match a non-utf8 target string. */ + while (loc < e) { + if (UTF8_IS_INVARIANT(*loc) + || UTF8_IS_DOWNGRADEABLE_START(*loc)) + { + ANYOF_FLAGS(ret) + |= ANYOF_NONBITMAP_NON_UTF8; + break; + } + loc += UTF8SKIP(loc); + } add_alternate(&unicode_alternate, foldbuf, foldlen); - end_multi_fold: ; - } - - /* This is special-cased, as it is the only letter which - * has both a multi-fold and single-fold in Latin1. All - * the other chars that have single and multi-folds are - * always in utf8, and the utf8 folding algorithm catches - * them */ - if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) { - stored += set_regclass_bit(pRExC_state, - ret, - LATIN_SMALL_LETTER_SHARP_S, - &l1_fold_invlist, &unicode_alternate); } } - else { - /* Single character fold. Add everything in its fold - * closure to the list that this node should match */ + else { + /* Single character fold of above Latin1. Add everything + * in its fold closure to the list that this node should + * match */ SV** listp; /* The fold closures data structure is a hash with the keys @@ -11338,90 +12207,177 @@ parseit: /* /aa doesn't allow folds between ASCII and non-; * /l doesn't allow them between above and below * 256 */ - if ((MORE_ASCII_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && ((c < 256) != (j < 256)))) + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) { continue; } - if (c < 256 && AT_LEAST_UNI_SEMANTICS) { - stored += set_regclass_bit(pRExC_state, - ret, - (U8) c, - &l1_fold_invlist, &unicode_alternate); - } - /* It may be that the code point is already in - * this range or already in the bitmap, in - * which case we need do nothing */ - else if ((c < start || c > end) - && (c > 255 - || ! ANYOF_BITMAP_TEST(ret, c))) - { - nonbitmap = add_cp_to_invlist(nonbitmap, c); + /* Folds involving non-ascii Latin1 characters + * under /d are added to a separate list */ + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + depends_list = add_cp_to_invlist(depends_list, c); } } } } - } + } } SvREFCNT_dec(fold_intersection); } - /* Combine the two lists into one. */ - if (l1_fold_invlist) { - if (nonbitmap) { - _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap); - SvREFCNT_dec(l1_fold_invlist); - } - else { - nonbitmap = l1_fold_invlist; - } + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes) { + if (AT_LEAST_UNI_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec(posixes); + } + else { + cp_list = posixes; + } + } + else { + + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_Latin1, + &nonascii_but_latin1_properties); + _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } } /* And combine the result (if any) with any inversion list from properties. - * The lists are kept separate up to now because we don't want to fold the - * properties */ + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ if (properties) { - if (nonbitmap) { - _invlist_union(nonbitmap, properties, &nonbitmap); - SvREFCNT_dec(properties); - } - else { - nonbitmap = properties; - } + bool warn_super = ! has_user_defined_property; + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + bool non_prop_matches_above_Unicode = + runtime_posix_matches_above_Unicode + | (invlist_highest(cp_list) > PERL_UNICODE_MAX); + if (invert) { + non_prop_matches_above_Unicode = + ! non_prop_matches_above_Unicode; + } + warn_super = ! non_prop_matches_above_Unicode; + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } } - /* Here, contains all the code points we can determine at - * compile time that we haven't put into the bitmap. Go through it, and - * for things that belong in the bitmap, put them there, and delete from - * */ - if (nonbitmap) { + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. */ - /* Above-ASCII code points in /d have to stay in , as they - * possibly only should match when the target string is UTF-8 */ - UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255; + /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't + * set the FOLD flag yet, so this does optimize those. It doesn't + * optimize locale. Doing so perhaps could be done as long as there is + * nothing like \w in it; some thought also would have to be given to the + * interaction with above 0x100 chars */ + if (invert + && ! LOC + && ! depends_list + && ! unicode_alternate + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); + + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * */ + ANYOF_BITMAP_ZERO(ret); + if (cp_list) { /* This gets set if we actually need to modify things */ bool change_invlist = FALSE; UV start, end; - /* Start looking through */ - invlist_iterinit(nonbitmap); - while (invlist_iternext(nonbitmap, &start, &end)) { + /* Start looking through */ + invlist_iterinit(cp_list); + while (invlist_iternext(cp_list, &start, &end)) { UV high; int i; /* Quit if are above what we should change */ - if (start > max_cp_to_set) { + if (start > 255) { break; } change_invlist = TRUE; /* Set all the bits in the range, up to the max that we are doing */ - high = (end < max_cp_to_set) ? end : max_cp_to_set; + high = (end < 255) ? end : 255; for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); @@ -11433,140 +12389,31 @@ parseit: } /* Done with loop; remove any code points that are in the bitmap from - * */ + * */ if (change_invlist) { - _invlist_subtract(nonbitmap, - (DEPENDS_SEMANTICS) - ? PL_ASCII - : PL_Latin1, - &nonbitmap); + _invlist_subtract(cp_list, PL_Latin1, &cp_list); } /* If have completely emptied it, remove it completely */ - if (invlist_len(nonbitmap) == 0) { - SvREFCNT_dec(nonbitmap); - nonbitmap = NULL; + if (invlist_len(cp_list) == 0) { + SvREFCNT_dec(cp_list); + cp_list = NULL; } } - /* Here, we have calculated what code points should be in the character - * class. does not overlap the bitmap except possibly in the - * case of DEPENDS rules. - * - * Now we can see about various optimizations. Fold calculation (which we - * did above) needs to take place before inversion. Otherwise /[^k]/i - * would invert to include K, which under /i would match k, which it - * shouldn't. */ + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } - /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this does optimize those. It doesn't - * optimize locale. Doing so perhaps could be done as long as there is - * nothing like \w in it; some thought also would have to be given to the - * interaction with above 0x100 chars */ - if ((ANYOF_FLAGS(ret) & ANYOF_INVERT) - && ! LOC - && ! unicode_alternate - /* In case of /d, there are some things that should match only when in - * not in the bitmap, i.e., they require UTF8 to match. These are - * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this - * case, they don't require UTF8, so can invert here */ - && (! nonbitmap - || ! DEPENDS_SEMANTICS - || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - && SvCUR(listsv) == initial_listsv_len) - { - int i; - if (! nonbitmap) { - for (i = 0; i < 256; ++i) { - if (ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_CLEAR(ret, i); - } - else { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - /* The inversion means that everything above 255 is matched */ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + /* Combine the two lists into one. */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec(depends_list); } else { - /* Here, also has things outside the bitmap that may overlap with - * the bitmap. We have to sync them up, so that they get inverted - * in both places. Earlier, we removed all overlaps except in the - * case of /d rules, so no syncing is needed except for this case - */ - SV *remove_list = NULL; - - if (DEPENDS_SEMANTICS) { - UV start, end; - - /* Set the bits that correspond to the ones that aren't in the - * bitmap. Otherwise, when we invert, we'll miss these. - * Earlier, we removed from the nonbitmap all code points - * < 128, so there is no extra work here */ - invlist_iterinit(nonbitmap); - while (invlist_iternext(nonbitmap, &start, &end)) { - if (start > 255) { /* The bit map goes to 255 */ - break; - } - if (end > 255) { - end = 255; - } - for (i = start; i <= (int) end; ++i) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - - /* Now invert both the bitmap and the nonbitmap. Anything in the - * bitmap has to also be removed from the non-bitmap, but again, - * there should not be overlap unless is /d rules. */ - _invlist_invert(nonbitmap); - - /* Any swash can't be used as-is, because we've inverted things */ - if (swash) { - SvREFCNT_dec(swash); - swash = NULL; - } - - for (i = 0; i < 256; ++i) { - if (ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_CLEAR(ret, i); - if (DEPENDS_SEMANTICS) { - if (! remove_list) { - remove_list = _new_invlist(2); - } - remove_list = add_cp_to_invlist(remove_list, i); - } - } - else { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - - /* And do the removal */ - if (DEPENDS_SEMANTICS) { - if (remove_list) { - _invlist_subtract(nonbitmap, remove_list, &nonbitmap); - SvREFCNT_dec(remove_list); - } - } - else { - /* There is no overlap for non-/d, so just delete anything - * below 256 */ - _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); - } + cp_list = depends_list; } - - stored = 256 - stored; - - /* Clear the invert flag since have just done it here */ - ANYOF_FLAGS(ret) &= ~ANYOF_INVERT; } /* Folding in the bitmap is taken care of above, but not for locale (for @@ -11576,7 +12423,7 @@ parseit: * run-time fold flag for these */ if (FOLD && (LOC || (DEPENDS_SEMANTICS - && nonbitmap + && cp_list && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) || unicode_alternate)) { @@ -11597,9 +12444,9 @@ parseit: * characters which only have the two folds; so things like 'fF' and 'Ii' * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE * FI'. */ - if (! nonbitmap + if (! cp_list && ! unicode_alternate - && SvCUR(listsv) == initial_listsv_len + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) @@ -11607,7 +12454,7 @@ parseit: && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) /* If the latest code point has a fold whose * bit is set, it must be the only other one */ - && ((prevvalue = PL_fold_latin1[value]) != (IV)value) + && ((prevvalue = PL_fold_latin1[value]) != value) && ANYOF_BITMAP_TEST(ret, prevvalue))))) { /* Note that the information needed to decide to do this optimization @@ -11650,7 +12497,7 @@ parseit: * then EXACTFU if the regex calls for it, or is required because * the character is non-ASCII. (If is ASCII, its fold is * also ASCII for the cases where we get here.) */ - if (MORE_ASCII_RESTRICTED && isASCII(value)) { + if (ASCII_FOLD_RESTRICTED && isASCII(value)) { op = EXACTFA; } else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { @@ -11684,8 +12531,8 @@ parseit: SvREFCNT_dec(swash); swash = NULL; } - if (! nonbitmap - && SvCUR(listsv) == initial_listsv_len + if (! cp_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION && ! unicode_alternate) { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); @@ -11701,24 +12548,24 @@ parseit: * swash is stored there now. * av[2] stores the multicharacter foldings, used later in * regexec.c:S_reginclass(). - * av[3] stores the nonbitmap inversion list for use in addition or - * instead of av[0]; not used if av[1] isn't NULL + * av[3] stores the cp_list inversion list for use in addition or + * instead of av[0]; used only if av[1] is NULL * av[4] is set if any component of the class is from a user-defined - * property; not used if av[1] isn't NULL */ + * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) - ? &PL_sv_undef - : listsv); + av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv + : &PL_sv_undef); if (swash) { av_store(av, 1, swash); - SvREFCNT_dec(nonbitmap); + SvREFCNT_dec(cp_list); } else { av_store(av, 1, NULL); - if (nonbitmap) { - av_store(av, 3, nonbitmap); + if (cp_list) { + av_store(av, 3, cp_list); av_store(av, 4, newSVuv(has_user_defined_property)); } } @@ -11741,6 +12588,7 @@ parseit: } return ret; } +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() @@ -12334,6 +13182,40 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) #ifdef DEBUGGING dVAR; register int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -12454,39 +13336,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", - "[:alpha:]", - "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", - "[:lower:]", - "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", - "[:upper:]", - "[:^upper:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:blank:]", - "[:^blank:]" - }; if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); @@ -12709,6 +13558,7 @@ Perl_pregfree2(pTHX_ REGEXP *rx) SvREFCNT_dec(r->saved_copy); #endif Safefree(r->offs); + SvREFCNT_dec(r->qr_anoncv); } /* reg_temp_copy() @@ -12733,7 +13583,6 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { struct regexp *ret; struct regexp *const r = (struct regexp *)SvANY(rx); - register const I32 npar = r->nparens+1; PERL_ARGS_ASSERT_REG_TEMP_COPY; @@ -12753,8 +13602,11 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) SvLEN_set(ret_x, 0); SvSTASH_set(ret_x, NULL); SvMAGIC_set(ret_x, NULL); - Newx(ret->offs, npar, regexp_paren_pair); - Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } if (r->substrs) { Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -12772,6 +13624,7 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) ret->saved_copy = NULL; #endif ret->mother_re = rx; + SvREFCNT_inc_void(ret->qr_anoncv); return ret_x; } @@ -12814,16 +13667,21 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->u.offsets) Safefree(ri->u.offsets); /* 20010421 MJD */ #endif + if (ri->code_blocks) { + int n; + for (n = 0; n < ri->num_code_blocks; n++) + SvREFCNT_dec(ri->code_blocks[n].src_regex); + Safefree(ri->code_blocks); + } + if (ri->data) { int n = ri->data->count; - PAD* new_comppad = NULL; - PAD* old_comppad; - PADOFFSET refcnt; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ switch (ri->data->what[n]) { case 'a': + case 'r': case 's': case 'S': case 'u': @@ -12832,27 +13690,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'f': Safefree(ri->data->data[n]); break; - case 'p': - new_comppad = MUTABLE_AV(ri->data->data[n]); - break; - case 'o': - if (new_comppad == NULL) - Perl_croak(aTHX_ "panic: pregfree comppad"); - PAD_SAVE_LOCAL(old_comppad, - /* Watch out for global destruction's random ordering. */ - (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL - ); - OP_REFCNT_LOCK; - refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); - OP_REFCNT_UNLOCK; - if (!refcnt) - op_free((OP_4tree*)ri->data->data[n]); - - PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec(MUTABLE_SV(new_comppad)); - new_comppad = NULL; - break; - case 'n': + case 'l': + case 'L': break; case 'T': { /* Aho Corasick add-on structure for a trie node. @@ -12980,6 +13819,7 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) } RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); + ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); if (ret->pprivate) RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); @@ -13040,7 +13880,20 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); - + + reti->num_code_blocks = ri->num_code_blocks; + if (ri->code_blocks) { + int n; + Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, + struct reg_code_block); + Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, + struct reg_code_block); + for (n = 0; n < ri->num_code_blocks; n++) + reti->code_blocks[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + } + else + reti->code_blocks = NULL; reti->regstclass = NULL; @@ -13057,12 +13910,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTua - see also regcomp.h and pregfree() */ + /* see also regcomp.h and regfree_internal() */ case 'a': /* actually an AV, but the dup function is identical. */ + case 'r': case 's': case 'S': - case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; @@ -13073,13 +13925,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) struct regnode_charclass_class); reti->regstclass = (regnode*)d->data[i]; break; - case 'o': - /* Compiled op trees are readonly and in shared memory, - and can thus be shared without duplication. */ - OP_REFCNT_LOCK; - d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); - OP_REFCNT_UNLOCK; - break; case 'T': /* Trie stclasses are readonly and can thus be shared * without duplication. We free the stclass in pregfree @@ -13092,7 +13937,8 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) ((reg_trie_data*)ri->data->data[i])->refcount++; OP_REFCNT_UNLOCK; /* Fall through */ - case 'n': + case 'l': + case 'L': d->data[i] = ri->data->data[i]; break; default: @@ -13202,8 +14048,6 @@ Perl_save_re_context(pTHX) Copy(&PL_reg_state, state, 1, struct re_save_state); - PL_reg_start_tmp = 0; - PL_reg_start_tmpl = 0; PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; PL_reg_maxiter = 0; @@ -13445,8 +14289,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */