X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/3c13cae629d936c43bca9d992cc445d93287af8e..13c3945fa95c36eaef31b4e8262adcfb27142c4b:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 021e14e..d09d89f 100644 --- a/regcomp.c +++ b/regcomp.c @@ -81,6 +81,7 @@ #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" +extern const struct regexp_engine my_reg_engine; #else # include "regcomp.h" #endif @@ -133,7 +134,6 @@ typedef struct RExC_state_t { I32 nestroot; /* root parens we are in - used by accept */ I32 extralen; I32 seen_zerolen; - I32 seen_evals; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ @@ -159,6 +159,7 @@ typedef struct RExC_state_t { char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) #endif + SV *runtime_code_qr; /* qr with the runtime code blocks */ #ifdef DEBUGGING const char *lastparse; I32 lastnum; @@ -193,7 +194,6 @@ typedef struct RExC_state_t { #define RExC_nestroot (pRExC_state->nestroot) #define RExC_extralen (pRExC_state->extralen) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) -#define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) #define RExC_uni_semantics (pRExC_state->uni_semantics) #define RExC_orig_utf8 (pRExC_state->orig_utf8) @@ -221,7 +221,7 @@ typedef struct RExC_state_t { #define WORST 0 /* Worst case. */ #define HASWIDTH 0x01 /* Known to match non-null strings. */ -/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single +/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single * character, and if utf8, must be invariant. Note that this is not the same * thing as REGNODE_SIMPLE */ #define SIMPLE 0x02 @@ -399,14 +399,18 @@ static const scan_data_t zero_scan_data = #define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET) #define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) #define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) -#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET) +#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET) #define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD) -#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 +/* There is no code point that is out-of-bounds, so this is problematic. But + * its only current use is to initialize a variable that is always set before + * looked at. */ +#define OOB_UNICODE 0xDEADBEEF + #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) @@ -2631,9 +2635,9 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode * problematic sequences. This delta is used by the caller to adjust the * min length of the match, and the delta between min and max, so that the * optimizer doesn't reject these possibilities based on size constraints. - * 2) These sequences are not currently correctly handled by the trie code - * either, so it changes the joined node type to ops that are not handled - * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. + * 2) These sequences require special handling by the trie code, so it + * changes the joined node type to ops for the trie's benefit, those new + * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD. * 3) This is sufficient for the two Greek sequences (described below), but * the one involving the Sharp s (\xDF) needs more. The node type * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss" @@ -3380,7 +3384,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * the end pointer. */ if ( !first ) { first = cur; - trietype = noper_trietype; if ( noper_trietype == NOTHING ) { #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); @@ -3388,8 +3391,15 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; #endif - if ( noper_next_trietype ) + if ( noper_next_trietype ) { trietype = noper_next_trietype; + } else if (noper_next_type) { + /* a NOTHING regop is 1 regop wide. We need at least two + * for a trie so we can't merge this in */ + first = NULL; + } + } else { + trietype = noper_trietype; } } else { if ( trietype == NOTHING ) @@ -4399,6 +4409,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->flags |= (OP(scan) == MEOL ? SF_BEFORE_MEOL : SF_BEFORE_SEOL); + SCAN_COMMIT(pRExC_state, data, minlenp); + } else if ( PL_regkind[OP(scan)] == BRANCHJ /* Lookbehind, or need to calculate parens/evals/stclass: */ @@ -4950,16 +4962,246 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) } #endif -/* public(ish) wrapper for Perl_re_op_compile that only takes an SV - * pattern rather than a list of OPs */ +/* public(ish) entry point for the perl core's own regex compiling code. + * It's actually a wrapper for Perl_re_op_compile that only takes an SV + * pattern rather than a list of OPs, and uses the internal engine rather + * than the current one */ REGEXP * Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) { SV *pat = pattern; /* defeat constness! */ PERL_ARGS_ASSERT_RE_COMPILE; - return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(), - NULL, NULL, rx_flags, 0); + return Perl_re_op_compile(aTHX_ &pat, 1, NULL, +#ifdef PERL_IN_XSUB_RE + &my_reg_engine, +#else + &PL_core_reg_engine, +#endif + NULL, NULL, rx_flags, 0); +} + +/* see if there are any run-time code blocks in the pattern. + * False positives are allowed */ + +static bool +S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr, + U32 pm_flags, char *pat, STRLEN plen) +{ + int n = 0; + STRLEN s; + + /* avoid infinitely recursing when we recompile the pattern parcelled up + * as qr'...'. A single constant qr// string can't have have any + * run-time component in it, and thus, no runtime code. (A non-qr + * string, however, can, e.g. $x =~ '(?{})') */ + if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST) + return 0; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + s = pRExC_state->code_blocks[n].end; + n++; + continue; + } + /* TODO ideally should handle [..], (#..), /#.../x to reduce false + * positives here */ + if (pat[s] == '(' && pat[s+1] == '?' && + (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{')) + ) + return 1; + } + return 0; +} + +/* Handle run-time code blocks. We will already have compiled any direct + * or indirect literal code blocks. Now, take the pattern 'pat' and make a + * copy of it, but with any literal code blocks blanked out and + * appropriate chars escaped; then feed it into + * + * eval "qr'modified_pattern'" + * + * For example, + * + * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno + * + * becomes + * + * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno' + * + * After eval_sv()-ing that, grab any new code blocks from the returned qr + * and merge them with any code blocks of the original regexp. + * + * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge; + * instead, just save the qr and return FALSE; this tells our caller that + * the original pattern needs upgrading to utf8. + */ + +static bool +S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, + char *pat, STRLEN plen) +{ + SV *qr; + + GET_RE_DEBUG_FLAGS_DECL; + + if (pRExC_state->runtime_code_qr) { + /* this is the second time we've been called; this should + * only happen if the main pattern got upgraded to utf8 + * during compilation; re-use the qr we compiled first time + * round (which should be utf8 too) + */ + qr = pRExC_state->runtime_code_qr; + pRExC_state->runtime_code_qr = NULL; + assert(RExC_utf8 && SvUTF8(qr)); + } + else { + int n = 0; + STRLEN s; + char *p, *newpat; + int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + SV *sv, *qr_ref; + dSP; + + /* determine how many extra chars we need for ' and \ escaping */ + for (s = 0; s < plen; s++) { + if (pat[s] == '\'' || pat[s] == '\\') + newlen++; + } + + Newx(newpat, newlen, char); + p = newpat; + *p++ = 'q'; *p++ = 'r'; *p++ = '\''; + + for (s = 0; s < plen; s++) { + if (n < pRExC_state->num_code_blocks + && s == pRExC_state->code_blocks[n].start) + { + /* blank out literal code block */ + assert(pat[s] == '('); + while (s <= pRExC_state->code_blocks[n].end) { + *p++ = ' '; + s++; + } + s--; + n++; + continue; + } + if (pat[s] == '\'' || pat[s] == '\\') + *p++ = '\\'; + *p++ = pat[s]; + } + *p++ = '\''; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + *p++ = 'x'; + *p++ = '\0'; + DEBUG_COMPILE_r({ + PerlIO_printf(Perl_debug_log, + "%sre-parsing pattern for runtime code:%s %s\n", + PL_colors[4],PL_colors[5],newpat); + }); + + sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0); + Safefree(newpat); + + ENTER; + SAVETMPS; + save_re_context(); + PUSHSTACKi(PERLSI_REQUIRE); + /* this causes the toker to collapse \\ into \ when parsing + * qr''; normally only q'' does this. It also alters hints + * handling */ + PL_reg_state.re_reparsing = TRUE; + eval_sv(sv, G_SCALAR); + SvREFCNT_dec(sv); + SPAGAIN; + qr_ref = POPs; + PUTBACK; + if (SvTRUE(ERRSV)) + Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV)); + assert(SvROK(qr_ref)); + qr = SvRV(qr_ref); + assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp); + /* the leaving below frees the tmp qr_ref. + * Give qr a life of its own */ + SvREFCNT_inc(qr); + POPSTACK; + FREETMPS; + LEAVE; + + } + + if (!RExC_utf8 && SvUTF8(qr)) { + /* first time through; the pattern got upgraded; save the + * qr for the next time through */ + assert(!pRExC_state->runtime_code_qr); + pRExC_state->runtime_code_qr = qr; + return 0; + } + + + /* extract any code blocks within the returned qr// */ + + + /* merge the main (r1) and run-time (r2) code blocks into one */ + { + RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2); + struct reg_code_block *new_block, *dst; + RExC_state_t * const r1 = pRExC_state; /* convenient alias */ + int i1 = 0, i2 = 0; + + if (!r2->num_code_blocks) /* we guessed wrong */ + return 1; + + Newx(new_block, + r1->num_code_blocks + r2->num_code_blocks, + struct reg_code_block); + dst = new_block; + + while ( i1 < r1->num_code_blocks + || i2 < r2->num_code_blocks) + { + struct reg_code_block *src; + bool is_qr = 0; + + if (i1 == r1->num_code_blocks) { + src = &r2->code_blocks[i2++]; + is_qr = 1; + } + else if (i2 == r2->num_code_blocks) + src = &r1->code_blocks[i1++]; + else if ( r1->code_blocks[i1].start + < r2->code_blocks[i2].start) + { + src = &r1->code_blocks[i1++]; + assert(src->end < r2->code_blocks[i2].start); + } + else { + assert( r1->code_blocks[i1].start + > r2->code_blocks[i2].start); + src = &r2->code_blocks[i2++]; + is_qr = 1; + assert(src->end < r1->code_blocks[i1].start); + } + + assert(pat[src->start] == '('); + assert(pat[src->end] == ')'); + dst->start = src->start; + dst->end = src->end; + dst->block = src->block; + dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr) + : src->src_regex; + dst++; + } + r1->num_code_blocks += r2->num_code_blocks; + Safefree(r1->code_blocks); + r1->code_blocks = new_block; + } + + SvREFCNT_dec(qr); + return 1; } @@ -4990,8 +5232,9 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) * * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details. * - * pm_flags contains the PMf_* flags from the calling PMOP. Currently - * we're only interested in PMf_HAS_CV and PMf_IS_QR. + * pm_flags contains the PMf_* flags, typically based on those from the + * pm_flags field of the related PMOP. Currently we're only interested in + * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL. * * We can't allocate space until we know how big the compiled form will be, * but we can't compile it (and thus know how big it is) until we've got a @@ -5009,7 +5252,7 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) REGEXP * Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *VOL old_re, - int *is_bare_re, U32 orig_rx_flags, U32 pm_flags) + bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags) { dVAR; REGEXP *rx; @@ -5032,7 +5275,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, bool used_setjump = FALSE; regex_charset initial_charset = get_regex_charset(orig_rx_flags); bool code_is_utf8 = 0; - + bool VOL recompile = 0; + bool runtime_code = 0; U8 jump_ret = 0; dJMPENV; scan_data_t data; @@ -5075,9 +5319,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist); PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist); - PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist); - PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist); - PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist); PL_PosixLower = _new_invlist_C_array(PosixLower_invlist); @@ -5110,7 +5351,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->num_code_blocks = 0; if (is_bare_re) - *is_bare_re = 0; + *is_bare_re = FALSE; if (expr && (expr->op_type == OP_LIST || (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) { @@ -5159,6 +5400,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *o = NULL; int n = 0; bool utf8 = 0; + STRLEN orig_patlen = 0; if (pRExC_state->num_code_blocks) { o = cLISTOPx(expr)->op_first; @@ -5200,24 +5442,54 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, o = o->op_sibling;; } + if ((SvAMAGIC(pat) || SvAMAGIC(msv)) && + (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) + { + sv_setsv(pat, sv); + /* overloading involved: all bets are off over literal + * code. Pretend we haven't seen it */ + pRExC_state->num_code_blocks -= n; + n = 0; + rx = NULL; + + } + else { + while (SvAMAGIC(msv) + && (sv = AMG_CALLunary(msv, string_amg)) + && sv != msv) + { + msv = sv; + SvGETMAGIC(msv); + } + if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP) + msv = SvRV(msv); + orig_patlen = SvCUR(pat); + sv_catsv_nomg(pat, msv); + rx = msv; + if (code) + pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + } + /* extract any code blocks within any embedded qr//'s */ - rx = msv; - if (SvROK(rx)) - rx = SvRV(rx); - if (SvTYPE(rx) == SVt_REGEXP + if (rx && SvTYPE(rx) == SVt_REGEXP && RX_ENGINE((REGEXP*)rx)->op_comp) { RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri); if (ri->num_code_blocks) { int i; + /* the presence of an embedded qr// with code means + * we should always recompile: the text of the + * qr// may not have changed, but it may be a + * different closure than last time */ + recompile = 1; Renew(pRExC_state->code_blocks, pRExC_state->num_code_blocks + ri->num_code_blocks, struct reg_code_block); pRExC_state->num_code_blocks += ri->num_code_blocks; for (i=0; i < ri->num_code_blocks; i++) { struct reg_code_block *src, *dst; - STRLEN offset = SvCUR(pat) + STRLEN offset = orig_patlen + ((struct regexp *)SvANY(rx))->pre_prefix; assert(n < pRExC_state->num_code_blocks); src = &ri->code_blocks[i]; @@ -5233,27 +5505,20 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } } } - - if ((SvAMAGIC(pat) || SvAMAGIC(msv)) && - (sv = amagic_call(pat, msv, concat_amg, AMGf_assign))) - { - sv_setsv(pat, sv); - /* overloading involved: all bets are off over literal - * code. Pretend we haven't seen it */ - pRExC_state->num_code_blocks -= n; - n = 0; - - } - else { - sv_catsv_nomg(pat, msv); - if (code) - pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; - } } SvSETMAGIC(pat); } - else + else { + SV *sv; pat = *patternp; + while (SvAMAGIC(pat) + && (sv = AMG_CALLunary(pat, string_amg)) + && sv != pat) + { + pat = sv; + SvGETMAGIC(pat); + } + } /* handle bare regex: foo =~ $re */ { @@ -5262,7 +5527,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, re = SvRV(re); if (SvTYPE(re) == SVt_REGEXP) { if (is_bare_re) - *is_bare_re = 1; + *is_bare_re = TRUE; SvREFCNT_inc(re); Safefree(pRExC_state->code_blocks); return (REGEXP*)re; @@ -5330,6 +5595,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat); RExC_uni_semantics = 0; RExC_contains_locale = 0; + pRExC_state->runtime_code_qr = NULL; /****************** LONG JUMP TARGET HERE***********************/ /* Longjmp back to here if have to switch in midstream to utf8 */ @@ -5417,18 +5683,31 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* return old regex if pattern hasn't changed */ if ( old_re + && !recompile && !!RX_UTF8(old_re) == !!RExC_utf8 && RX_PRECOMP(old_re) && RX_PRELEN(old_re) == plen && memEQ(RX_PRECOMP(old_re), exp, plen)) { - ReREFCNT_inc(old_re); - if (used_setjump) { - JMPENV_POP; + /* with runtime code, always recompile */ + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); + if (!runtime_code) { + if (used_setjump) { + JMPENV_POP; + } + Safefree(pRExC_state->code_blocks); + return old_re; } - Safefree(pRExC_state->code_blocks); - return old_re; } + else if ((pm_flags & PMf_USE_RE_EVAL) + /* this second condition covers the non-regex literal case, + * i.e. $foo =~ '(?{})'. */ + || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME + && (PL_hints & HINT_RE_EVAL)) + ) + runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags, + exp, plen); #ifdef TRIE_STUDY_OPT restudied = 0; @@ -5449,12 +5728,24 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_precomp = exp; RExC_flags = rx_flags; RExC_pm_flags = pm_flags; + + if (runtime_code) { + if (PL_tainting && PL_tainted) + Perl_croak(aTHX_ "Eval-group in insecure regular expression"); + + if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) { + /* whoops, we have a non-utf8 pattern, whilst run-time code + * got compiled as utf8. Try again with a utf8 pattern */ + JMPENV_JUMP(UTF8_LONGJMP); + } + } + assert(!pRExC_state->runtime_code_qr); + RExC_sawback = 0; RExC_seen = 0; RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; - RExC_seen_evals = 0; RExC_extralen = 0; RExC_override_recoding = 0; @@ -5483,7 +5774,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, * Clever compilers notice this and complain. --jhi */ REGC((U8)REG_MAGIC, (char*)RExC_emit); #endif - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n")); + DEBUG_PARSE_r( + PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + RExC_lastnum=0; + RExC_lastparse=NULL; + ); if (reg(pRExC_state, 0, &flags,1) == NULL) { RExC_precomp = NULL; Safefree(pRExC_state->code_blocks); @@ -5651,8 +5946,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_emit_bound = ri->program + RExC_size + 1; pRExC_state->code_index = 0; - /* Store the count of eval-groups for security checks: */ - RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); @@ -5805,7 +6098,7 @@ reStudy: else if ((!sawopen || !RExC_sawback) && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL)) + !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -5818,7 +6111,7 @@ reStudy: goto again; } if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback) - && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ + && !pRExC_state->num_code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -6091,7 +6384,7 @@ reStudy: r->extflags |= RXf_GPOS_SEEN; if (RExC_seen & REG_SEEN_LOOKBEHIND) r->extflags |= RXf_LOOKBEHIND_SEEN; - if (RExC_seen & REG_SEEN_EVAL) + if (pRExC_state->num_code_blocks) r->extflags |= RXf_EVAL_SEEN; if (RExC_seen & REG_SEEN_CANY) r->extflags |= RXf_CANY_SEEN; @@ -6099,6 +6392,8 @@ reStudy: r->intflags |= PREGf_VERBARG_SEEN; if (RExC_seen & REG_SEEN_CUTGROUP) r->intflags |= PREGf_CUTGROUP_SEEN; + if (pm_flags & PMf_USE_RE_EVAL) + r->intflags |= PREGf_USE_RE_EVAL; if (RExC_paren_names) RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names)); else @@ -6586,8 +6881,10 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) do { RExC_parse++; } while (isALNUM(*RExC_parse)); + } else { + RExC_parse++; /* so the <- from the vFAIL is after the offending character */ + vFAIL("Group name must start with a non-digit word character"); } - if ( flags ) { SV* sv_name = newSVpvn_flags(name_start, (int)(RExC_parse - name_start), @@ -6611,7 +6908,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name", (unsigned long) flags); } - /* NOT REACHED */ + assert(0); /* NOT REACHED */ } return NULL; } @@ -7017,8 +7314,6 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end } } -#ifndef PERL_IN_XSUB_RE - STATIC IV S_invlist_search(pTHX_ SV* const invlist, const UV cp) { @@ -7061,6 +7356,8 @@ S_invlist_search(pTHX_ SV* const invlist, const UV cp) return high - 1; } +#ifndef PERL_IN_XSUB_RE + void Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch) { @@ -7144,7 +7441,6 @@ Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV return; } - void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output) { @@ -7649,6 +7945,18 @@ Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) #endif +STATIC bool +S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp) +{ + /* Does contain code point as part of the set? */ + + IV index = invlist_search(invlist, cp); + + PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP; + + return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index); +} + PERL_STATIC_INLINE SV* S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) { return _add_range_to_invlist(invlist, cp, cp); @@ -7806,6 +8114,36 @@ S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end) return TRUE; } +PERL_STATIC_INLINE UV +S_invlist_highest(pTHX_ SV* const invlist) +{ + /* Returns the highest code point that matches an inversion list. This API + * has an ambiguity, as it returns 0 under either the highest is actually + * 0, or if the list is empty. If this distinction matters to you, check + * for emptiness before calling this function */ + + UV len = invlist_len(invlist); + UV *array; + + PERL_ARGS_ASSERT_INVLIST_HIGHEST; + + if (len == 0) { + return 0; + } + + array = invlist_array(invlist); + + /* The last element in the array in the inversion list always starts a + * range that goes to infinity. That range may be for code points that are + * matched in the inversion list, or it may be for ones that aren't + * matched. In the latter case, the highest code point in the set is one + * less than the beginning of this range; otherwise it is the final element + * of this range: infinity */ + return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1)) + ? UV_MAX + : array[len - 1] - 1; +} + #ifndef PERL_IN_XSUB_RE SV * Perl__invlist_contents(pTHX_ SV* const invlist) @@ -7861,6 +8199,77 @@ S_invlist_dump(pTHX_ SV* const invlist, const char * const header) } #endif +#if 0 +bool +S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b) +{ + /* Return a boolean as to if the two passed in inversion lists are + * identical. The final argument, if TRUE, says to take the complement of + * the second inversion list before doing the comparison */ + + UV* array_a = invlist_array(a); + UV* array_b = invlist_array(b); + UV len_a = invlist_len(a); + UV len_b = invlist_len(b); + + UV i = 0; /* current index into the arrays */ + bool retval = TRUE; /* Assume are identical until proven otherwise */ + + PERL_ARGS_ASSERT__INVLISTEQ; + + /* If are to compare 'a' with the complement of b, set it + * up so are looking at b's complement. */ + if (complement_b) { + + /* The complement of nothing is everything, so would have to have + * just one element, starting at zero (ending at infinity) */ + if (len_b == 0) { + return (len_a == 1 && array_a[0] == 0); + } + else if (array_b[0] == 0) { + + /* Otherwise, to complement, we invert. Here, the first element is + * 0, just remove it. To do this, we just pretend the array starts + * one later, and clear the flag as we don't have to do anything + * else later */ + + array_b++; + len_b--; + complement_b = FALSE; + } + else { + + /* But if the first element is not zero, we unshift a 0 before the + * array. The data structure reserves a space for that 0 (which + * should be a '1' right now), so physical shifting is unneeded, + * but temporarily change that element to 0. Before exiting the + * routine, we must restore the element to '1' */ + array_b--; + len_b++; + array_b[0] = 0; + } + } + + /* Make sure that the lengths are the same, as well as the final element + * before looping through the remainder. (Thus we test the length, final, + * and first elements right off the bat) */ + if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { + retval = FALSE; + } + else for (i = 0; i < len_a - 1; i++) { + if (array_a[i] != array_b[i]) { + retval = FALSE; + break; + } + } + + if (complement_b) { + array_b[0] = 1; + } + return retval; +} +#endif + #undef HEADER_LENGTH #undef INVLIST_INITIAL_LENGTH #undef TO_INTERNAL_SIZE @@ -8068,7 +8477,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -8225,7 +8634,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; } goto gen_recurse_regop; - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '+': if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) { RExC_parse++; @@ -8295,7 +8704,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ - /* NOT REACHED */ + assert(0); /* NOT REACHED */ case '?': /* (??...) */ is_logical = 1; @@ -8309,98 +8718,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* FALL THROUGH */ case '{': /* (?{...}) */ { - I32 count = 1; U32 n = 0; - char c; - char *s = RExC_parse; + struct reg_code_block *cb; RExC_seen_zerolen++; - RExC_seen |= REG_SEEN_EVAL; - if ( pRExC_state->num_code_blocks - && pRExC_state->code_index < pRExC_state->num_code_blocks - && pRExC_state->code_blocks[pRExC_state->code_index].start - == (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) + if ( !pRExC_state->num_code_blocks + || pRExC_state->code_index >= pRExC_state->num_code_blocks + || pRExC_state->code_blocks[pRExC_state->code_index].start + != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) - RExC_start) ) { - /* this is a pre-compiled literal (?{}) */ - struct reg_code_block *cb = - &pRExC_state->code_blocks[pRExC_state->code_index]; - RExC_parse = RExC_start + cb->end; - if (SIZE_ONLY) - RExC_seen_evals++; - else { - OP *o = cb->block; - if (cb->src_regex) { - n = add_data(pRExC_state, 2, "rl"); - RExC_rxi->data->data[n] = - (void*)SvREFCNT_inc((SV*)cb->src_regex); - RExC_rxi->data->data[n+1] = (void*)o->op_next; - } - else { - n = add_data(pRExC_state, 1, - (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); - RExC_rxi->data->data[n] = (void*)o->op_next; - } - } - pRExC_state->code_index++; + if (RExC_pm_flags & PMf_USE_RE_EVAL) + FAIL("panic: Sequence (?{...}): no code block found\n"); + FAIL("Eval-group not allowed at runtime, use re 'eval'"); } - else { - while (count && (c = *RExC_parse)) { - if (c == '\\') { - if (RExC_parse[1]) - RExC_parse++; - } - else if (c == '{') - count++; - else if (c == '}') - count--; - RExC_parse++; - } - if (*RExC_parse != ')') { - RExC_parse = s; - vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); - } - if (!SIZE_ONLY) { - PAD *pad; - OP_4tree *sop, *rop; - SV * const sv = newSVpvn(s, RExC_parse - 1 - s); - - ENTER; - Perl_save_re_context(aTHX); - rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad); - sop->op_private |= OPpREFCOUNTED; - /* re_dup will OpREFCNT_inc */ - OpREFCNT_set(sop, 1); - LEAVE; - - n = add_data(pRExC_state, 3, "nop"); - RExC_rxi->data->data[n] = (void*)rop; - RExC_rxi->data->data[n+1] = (void*)sop; - RExC_rxi->data->data[n+2] = (void*)pad; - SvREFCNT_dec(sv); + /* this is a pre-compiled code block (?{...}) */ + cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + RExC_parse = RExC_start + cb->end; + if (!SIZE_ONLY) { + OP *o = cb->block; + if (cb->src_regex) { + n = add_data(pRExC_state, 2, "rl"); + RExC_rxi->data->data[n] = + (void*)SvREFCNT_inc((SV*)cb->src_regex); + RExC_rxi->data->data[n+1] = (void*)o; } - else { /* First pass */ - if (PL_reginterp_cnt < ++RExC_seen_evals - && IN_PERL_RUNTIME) - /* No compiled RE interpolated, has runtime - components ===> unsafe. */ - FAIL("Eval-group not allowed at runtime, use re 'eval'"); - if (PL_tainting && PL_tainted) - FAIL("Eval-group in insecure regular expression"); - #if PERL_VERSION > 8 - if (IN_PERL_COMPILETIME) - PL_cv_has_eval = 1; - #endif + else { + n = add_data(pRExC_state, 1, + (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l"); + RExC_rxi->data->data[n] = (void*)o; } } + pRExC_state->code_index++; nextchar(pRExC_state); if (is_logical) { + regnode *eval; ret = reg_node(pRExC_state, LOGICAL); - if (!SIZE_ONLY) + eval = reganode(pRExC_state, EVAL, n); + if (!SIZE_ONLY) { ret->flags = 2; - REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); + /* for later propagation into (??{}) return value */ + eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME); + } + REGTAIL(pRExC_state, ret, eval); /* deal with the length of this later - MJD */ return ret; } @@ -8930,7 +9292,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else FAIL("Junk on end of regexp"); /* "Can't happen". */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } if (RExC_in_lookbehind) { @@ -9463,6 +9825,81 @@ S_reg_recode(pTHX_ const char value, SV **encp) return uv; } +PERL_STATIC_INLINE U8 +S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state) +{ + U8 op; + + PERL_ARGS_ASSERT_COMPUTE_EXACTISH; + + if (! FOLD) { + return EXACT; + } + + op = get_regex_charset(RExC_flags); + if (op >= REGEX_ASCII_RESTRICTED_CHARSET) { + op--; /* /a is same as /u, and map /aa's offset to what /a's would have + been, so there is no hole */ + } + + return op + EXACTF; +} + +PERL_STATIC_INLINE void +S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point) +{ + /* This knows the details about sizing an EXACTish node, and potentially + * populating it with a single character. If is non-zero, it assumes + * that the node has already been populated, and just does the sizing, + * ignoring . Otherwise it looks at and + * calculates what should be. In pass 1, it sizes the node + * appropriately. In pass 2, it additionally will populate the node's + * STRING with , if is 0. + * + * It knows that under FOLD, UTF characters and the Latin Sharp S must be + * folded (the latter only when the rules indicate it can match 'ss') */ + + bool len_passed_in = cBOOL(len != 0); + U8 character[UTF8_MAXBYTES_CASE+1]; + + PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; + + if (! len_passed_in) { + if (UTF) { + if (FOLD) { + to_uni_fold(NATIVE_TO_UNI(code_point), character, &len); + } + else { + uvchr_to_utf8( character, code_point); + len = UTF8SKIP(character); + } + } + else if (! FOLD + || code_point != LATIN_SMALL_LETTER_SHARP_S + || ASCII_FOLD_RESTRICTED + || ! AT_LEAST_UNI_SEMANTICS) + { + *character = (U8) code_point; + len = 1; + } + else { + *character = 's'; + *(character + 1) = 's'; + len = 2; + } + } + + if (SIZE_ONLY) { + RExC_size += STR_SZ(len); + } + else { + RExC_emit += STR_SZ(len); + STR_LEN(node) = len; + if (! len_passed_in) { + Copy((char *) character, STRING(node), len, char); + } + } +} /* - regatom - the lowest level @@ -9479,8 +9916,46 @@ S_reg_recode(pTHX_ const char value, SV **encp) sequence, we return. Note: we have to be careful with escapes, as they can be both literal - and special, and in the case of \10 and friends can either, depending - on context. Specifically there are two separate switches for handling + and special, and in the case of \10 and friends, context determines which. + + A summary of the code structure is: + + switch (first_byte) { + cases for each special: + handle this special; + break; + case '\\': + switch (2nd byte) { + cases for each unambiguous special: + handle this special; + break; + cases for each ambigous special/literal: + disambiguate; + if (special) handle here + else goto defchar; + default: // unambiguously literal: + goto defchar; + } + default: // is a literal char + // FALL THROUGH + defchar: + create EXACTish node for literal; + while (more input and node isn't full) { + switch (input_byte) { + cases for each special; + make sure parse pointer is set so that the next call to + regatom will see this special first + goto loopdone; // EXACTish node terminated by prev. char + default: + append char to EXACTISH node; + } + get next input byte; + } + loopdone: + } + return the generated node; + + Specifically there are two separate switches for handling escape sequences, with the one for handling literal escapes requiring a dummy entry for all of the special escapes that are actually handled by the other. @@ -9634,43 +10109,17 @@ tryagain: *flagp |= HASWIDTH; goto finish_meta_pat; case 'w': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = ALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = ALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = ALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = ALNUM; - break; - default: - goto bad_charset; + op = ALNUM + get_regex_charset(RExC_flags); + if (op > ALNUMA) { /* /aa is same as /a */ + op = ALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'W': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NALNUML; - break; - case REGEX_UNICODE_CHARSET: - op = NALNUMU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NALNUMA; - break; - case REGEX_DEPENDS_CHARSET: - op = NALNUM; - break; - default: - goto bad_charset; + op = NALNUM + get_regex_charset(RExC_flags); + if (op > NALNUMA) { /* /aa is same as /a */ + op = NALNUMA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -9678,22 +10127,9 @@ tryagain: case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = BOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = BOUNDU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = BOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = BOUND; - break; - default: - goto bad_charset; + op = BOUND + get_regex_charset(RExC_flags); + if (op > BOUNDA) { /* /aa is same as /a */ + op = BOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); @@ -9702,103 +10138,45 @@ tryagain: case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NBOUNDL; - break; - case REGEX_UNICODE_CHARSET: - op = NBOUNDU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NBOUNDA; - break; - case REGEX_DEPENDS_CHARSET: - op = NBOUND; - break; - default: - goto bad_charset; + op = NBOUND + get_regex_charset(RExC_flags); + if (op > NBOUNDA) { /* /aa is same as /a */ + op = NBOUNDA; } ret = reg_node(pRExC_state, op); FLAGS(ret) = get_regex_charset(RExC_flags); *flagp |= SIMPLE; goto finish_meta_pat; case 's': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = SPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = SPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = SPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = SPACE; - break; - default: - goto bad_charset; + op = SPACE + get_regex_charset(RExC_flags); + if (op > SPACEA) { /* /aa is same as /a */ + op = SPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'S': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NSPACEL; - break; - case REGEX_UNICODE_CHARSET: - op = NSPACEU; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NSPACEA; - break; - case REGEX_DEPENDS_CHARSET: - op = NSPACE; - break; - default: - goto bad_charset; - } - ret = reg_node(pRExC_state, op); - *flagp |= HASWIDTH|SIMPLE; - goto finish_meta_pat; - case 'd': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = DIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = DIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = DIGIT; - break; - default: - goto bad_charset; + op = NSPACE + get_regex_charset(RExC_flags); + if (op > NSPACEA) { /* /aa is same as /a */ + op = NSPACEA; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'D': - switch (get_regex_charset(RExC_flags)) { - case REGEX_LOCALE_CHARSET: - op = NDIGITL; - break; - case REGEX_ASCII_RESTRICTED_CHARSET: - case REGEX_ASCII_MORE_RESTRICTED_CHARSET: - op = NDIGITA; - break; - case REGEX_DEPENDS_CHARSET: /* No difference between these */ - case REGEX_UNICODE_CHARSET: - op = NDIGIT; - break; - default: - goto bad_charset; + op = NDIGIT; + goto join_D_and_d; + case 'd': + op = DIGIT; + join_D_and_d: + { + U8 offset = get_regex_charset(RExC_flags); + if (offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; + } + else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + op += offset; } ret = reg_node(pRExC_state, op); *flagp |= HASWIDTH|SIMPLE; @@ -9899,7 +10277,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? NREF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? NREFFA : (AT_LEAST_UNI_SEMANTICS) ? NREFFU @@ -9949,6 +10327,7 @@ tryagain: vFAIL("Reference to nonexistent or unclosed group"); } if (!isg && num > 9 && num >= RExC_npar) + /* Probably a character specified in octal, e.g. \35 */ goto defchar; else { char * const parse_start = RExC_parse - 1; /* MJD */ @@ -9969,7 +10348,7 @@ tryagain: ret = reganode(pRExC_state, ((! FOLD) ? REF - : (MORE_ASCII_RESTRICTED) + : (ASCII_FOLD_RESTRICTED) ? REFFA : (AT_LEAST_UNI_SEMANTICS) ? REFFU @@ -10020,20 +10399,14 @@ tryagain: STRLEN foldlen; U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf; U8 node_type; + bool next_is_quantifier; /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so, * it is folded to 'ss' even if not utf8 */ bool is_exactfu_sharp_s; ender = 0; - node_type = ((! FOLD) ? EXACT - : (LOC) - ? EXACTFL - : (MORE_ASCII_RESTRICTED) - ? EXACTFA - : (AT_LEAST_UNI_SEMANTICS) - ? EXACTFU - : EXACTF); + node_type = compute_EXACTish(pRExC_state); ret = reg_node(pRExC_state, node_type); s = STRING(ret); @@ -10163,38 +10536,39 @@ tryagain: break; } case 'x': - if (*++p == '{') { - char* const e = strchr(p, '}'); + { + STRLEN brace_len = len; + UV result; + const char* error_msg; - if (!e) { - RExC_parse = p + 1; - vFAIL("Missing right brace on \\x{}"); + bool valid = grok_bslash_x(p, + &result, + &brace_len, + &error_msg, + 1); + p += brace_len; + if (! valid) { + RExC_parse = p; /* going to die anyway; point + to exact spot of failure */ + vFAIL(error_msg); } else { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = e - p - 1; - ender = grok_hex(p + 1, &numlen, &flags, NULL); - if (ender > 0xff) - REQUIRE_UTF8; - p = e + 1; + ender = result; } + if (PL_encoding && ender < 0x100) { + goto recode_encoding; + } + if (ender > 0xff) { + REQUIRE_UTF8; + } + break; } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - STRLEN numlen = 2; - ender = grok_hex(p, &numlen, &flags, NULL); - p += numlen; - } - if (PL_encoding && ender < 0x100) - goto recode_encoding; - break; case 'c': p++; ender = grok_bslash_c(*p++, UTF, SIZE_ONLY); break; case '0': case '1': case '2': case '3':case '4': - case '5': case '6': case '7': case '8':case '9': + case '5': case '6': case '7': if (*p == '0' || (isDIGIT(p[1]) && atoi(p) >= RExC_npar)) { @@ -10227,7 +10601,7 @@ tryagain: FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY&& isALPHA(*p)) { + if (!SIZE_ONLY&& isALNUMC(*p)) { ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p); } goto normal_default; @@ -10261,6 +10635,20 @@ tryagain: && ender == LATIN_SMALL_LETTER_SHARP_S); if ( RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); + + /* If the next thing is a quantifier, it applies to this + * character only, which means that this character has to be in + * its own node and can't just be appended to the string in an + * existing node, so if there are already other characters in + * the node, close the node with just them, and set up to do + * this character again next time through, when it will be the + * only thing in its new node */ + if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len) + { + p = oldp; + goto loopdone; + } + if ((UTF && FOLD) || is_exactfu_sharp_s) { /* Prime the casefolded buffer. Locale rules, which apply * only to code points < 256, aren't known until execution, @@ -10277,114 +10665,24 @@ tryagain: foldlen = 2; } } - else if (isASCII(ender)) { /* Note: Here can't also be LOC - */ - ender = toLOWER(ender); - *tmpbuf = (U8) ender; - foldlen = 1; - } - else if (! MORE_ASCII_RESTRICTED && ! LOC) { - - /* Locale and /aa require more selectivity about the - * fold, so are handled below. Otherwise, here, just - * use the fold */ - ender = toFOLD_uni(ender, tmpbuf, &foldlen); - } - else { - /* Under locale rules or /aa we are not to mix, - * respectively, ords < 256 or ASCII with non-. So - * reject folds that mix them, using only the - * non-folded code point. So do the fold to a - * temporary, and inspect each character in it. */ - U8 trialbuf[UTF8_MAXBYTES_CASE+1]; - U8* s = trialbuf; - UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen); - U8* e = s + foldlen; - bool fold_ok = TRUE; - - while (s < e) { - if (isASCII(*s) - || (LOC && (UTF8_IS_INVARIANT(*s) - || UTF8_IS_DOWNGRADEABLE_START(*s)))) - { - fold_ok = FALSE; - break; - } - s += UTF8SKIP(s); - } - if (fold_ok) { - Copy(trialbuf, tmpbuf, foldlen, U8); - ender = tmpender; - } - else { - uvuni_to_utf8(tmpbuf, ender); - foldlen = UNISKIP(ender); - } - } - } - if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ - if (len) - p = oldp; - else if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - - /* tmpbuf has been constructed by us, so we - * know it is valid utf8 */ - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - s += unilen; - len += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; /* "Can't happen." */ - } - } - else { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } - } - } else { - len++; - REGC((char)ender, s++); + ender = _to_uni_fold_flags(ender, tmpbuf, &foldlen, + FOLD_FLAGS_FULL + | ((LOC) ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0) + ); } - break; } if (UTF || is_exactfu_sharp_s) { - if (FOLD) { - /* Emit all the Unicode characters. */ - STRLEN numlen; - for (foldbuf = tmpbuf; - foldlen; - foldlen -= numlen) { - ender = valid_utf8_to_uvchr(foldbuf, &numlen); - if (numlen > 0) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - len += unilen; - s += unilen; - /* In EBCDIC the numlen - * and unilen can differ. */ - foldbuf += numlen; - if (numlen >= foldlen) - break; - } - else - break; - } + if (FOLD) { + if (! SIZE_ONLY) { + /* Emit all the Unicode characters. */ + Copy(tmpbuf, s, foldlen, char); + } + len += foldlen; + s += foldlen; } else { const STRLEN unilen = reguni(pRExC_state, ender, s); @@ -10393,10 +10691,26 @@ tryagain: len += unilen; } } + + /* The loop increments each time, as all but this + * path through it add a single byte to the EXACTish node. + * But this one has changed len to be the correct final + * value, so subtract one to cancel out the increment that + * follows */ len--; } else { REGC((char)ender, s++); + } + + if (next_is_quantifier) { + + /* Here, the next input is a quantifier, and to get here, + * the current character is the only one in the node. + * Also, here doesn't include the final byte for this + * character */ + len++; + goto loopdone; } } loopdone: /* Jumped to when encounters something that shouldn't be in @@ -10415,22 +10729,12 @@ tryagain: if (len == 1 && UNI_IS_INVARIANT(ender)) *flagp |= SIMPLE; - if (SIZE_ONLY) - RExC_size += STR_SZ(len); - else { - STR_LEN(ret) = len; - RExC_emit += STR_SZ(len); - } + alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0); } break; } return(ret); - -/* Jumped to when an unrecognized character set is encountered */ -bad_charset: - Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags)); - return(NULL); } STATIC char * @@ -10505,7 +10809,7 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (skip) { case 4: if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */ - namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM; + namedclass = ANYOF_ALNUM; break; case 5: /* Names all of length 5. */ @@ -10515,57 +10819,63 @@ S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) switch (posixcc[4]) { case 'a': if (memEQ(posixcc, "alph", 4)) /* alpha */ - namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA; + namedclass = ANYOF_ALPHA; break; case 'e': if (memEQ(posixcc, "spac", 4)) /* space */ - namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + namedclass = ANYOF_PSXSPC; break; case 'h': if (memEQ(posixcc, "grap", 4)) /* graph */ - namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH; + namedclass = ANYOF_GRAPH; break; case 'i': if (memEQ(posixcc, "asci", 4)) /* ascii */ - namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; + namedclass = ANYOF_ASCII; break; case 'k': if (memEQ(posixcc, "blan", 4)) /* blank */ - namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK; + namedclass = ANYOF_BLANK; break; case 'l': if (memEQ(posixcc, "cntr", 4)) /* cntrl */ - namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL; + namedclass = ANYOF_CNTRL; break; case 'm': if (memEQ(posixcc, "alnu", 4)) /* alnum */ - namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC; + namedclass = ANYOF_ALNUMC; break; case 'r': if (memEQ(posixcc, "lowe", 4)) /* lower */ - namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER; + namedclass = ANYOF_LOWER; else if (memEQ(posixcc, "uppe", 4)) /* upper */ - namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER; + namedclass = ANYOF_UPPER; break; case 't': if (memEQ(posixcc, "digi", 4)) /* digit */ - namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT; + namedclass = ANYOF_DIGIT; else if (memEQ(posixcc, "prin", 4)) /* print */ - namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT; + namedclass = ANYOF_PRINT; else if (memEQ(posixcc, "punc", 4)) /* punct */ - namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT; + namedclass = ANYOF_PUNCT; break; } break; case 6: if (memEQ(posixcc, "xdigit", 6)) - namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT; + namedclass = ANYOF_XDIGIT; break; } if (namedclass == OOB_NAMEDCLASS) Simple_vFAIL3("POSIX class [:%.*s:] unknown", t - s - 1, s + 1); + + /* The #defines are structured so each complement is +1 to + * the normal one */ + if (complement) { + namedclass++; + } assert (posixcc[skip] == ':'); assert (posixcc[skip+1] == ']'); } else if (!SIZE_ONLY) { @@ -10696,7 +11006,7 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) * determined at run-time * run_time_list is a SV* that contains text names of properties that are to * be computed at run time. This concatenates - * to it, apppropriately + * to it, appropriately * This is essentially DO_POSIX, but we know only the Latin1 values at compile * time */ #define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ @@ -10722,16 +11032,18 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } /* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of - * this and DO_N_POSIX */ + * this and DO_N_POSIX. Sets only if it can; unchanged + * otherwise */ #define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \ - l1_sourcelist, Xpropertyname, run_time_list) \ + l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \ if (AT_LEAST_ASCII_RESTRICTED) { \ _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \ } \ else { \ Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \ + matches_above_unicode = TRUE; \ if (LOC) { \ - ANYOF_CLASS_SET(node, namedclass); \ + ANYOF_CLASS_SET(node, namedclass); \ } \ else { \ SV* scratch_list = NULL; \ @@ -10749,171 +11061,6 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) } \ } -STATIC U8 -S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) -{ - - /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes. - * Locale folding is done at run-time, so this function should not be - * called for nodes that are for locales. - * - * This function sets the bit corresponding to the fold of the input - * 'value', if not already set. The fold of 'f' is 'F', and the fold of - * 'F' is 'f'. - * - * It also knows about the characters that are in the bitmap that have - * folds that are matchable only outside it, and sets the appropriate lists - * and flags. - * - * It returns the number of bits that actually changed from 0 to 1 */ - - U8 stored = 0; - U8 fold; - - PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD; - - fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value] - : PL_fold[value]; - - /* It assumes the bit for 'value' has already been set */ - if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) { - ANYOF_BITMAP_SET(node, fold); - stored++; - } - if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) { - /* Certain Latin1 characters have matches outside the bitmap. To get - * here, 'value' is one of those characters. None of these matches is - * valid for ASCII characters under /aa, which have been excluded by - * the 'if' above. The matches fall into three categories: - * 1) They are singly folded-to or -from an above 255 character, as - * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y - * WITH DIAERESIS; - * 2) They are part of a multi-char fold with another character in the - * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill; - * 3) They are part of a multi-char fold with a character not in the - * bitmap, such as various ligatures. - * We aren't dealing fully with multi-char folds, except we do deal - * with the pattern containing a character that has a multi-char fold - * (not so much the inverse). - * For types 1) and 3), the matches only happen when the target string - * is utf8; that's not true for 2), and we set a flag for it. - * - * The code below adds to the passed in inversion list the single fold - * closures for 'value'. The values are hard-coded here so that an - * innocent-looking character class, like /[ks]/i won't have to go out - * to disk to find the possible matches. XXX It would be better to - * generate these via regen, in case a new version of the Unicode - * standard adds new mappings, though that is not really likely. */ - switch (value) { - case 'k': - case 'K': - /* KELVIN SIGN */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A); - break; - case 's': - case 'S': - /* LATIN SMALL LETTER LONG S */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F); - break; - case MICRO_SIGN: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - GREEK_SMALL_LETTER_MU); - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - GREEK_CAPITAL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - /* ANGSTROM SIGN */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B); - if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - PL_fold_latin1[value]); - } - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, - LATIN_CAPITAL_LETTER_SHARP_S); - - /* Under /a, /d, and /u, this can match the two chars "ss" */ - if (! MORE_ASCII_RESTRICTED) { - add_alternate(alternate_ptr, (U8 *) "ss", 2); - - /* And under /u or /a, it can match even if the target is - * not utf8 */ - if (AT_LEAST_UNI_SEMANTICS) { - ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8; - } - } - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character folds from code - * points that require UTF8 to express, so they can't match - * unless the target string is in UTF-8, so no action here is - * necessary, as regexec.c properly handles the general case - * for UTF-8 matching */ - break; - default: - /* Use deprecated warning to increase the chances of this - * being output */ - ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value); - break; - } - } - else if (DEPENDS_SEMANTICS - && ! isASCII(value) - && PL_fold_latin1[value] != value) - { - /* Under DEPENDS rules, non-ASCII Latin1 characters match their - * folds only when the target string is in UTF-8. We add the fold - * here to the list of things to match outside the bitmap, which - * won't be looked at unless it is UTF8 (or else if something else - * says to look even if not utf8, but those things better not happen - * under DEPENDS semantics. */ - *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]); - } - - return stored; -} - - -PERL_STATIC_INLINE U8 -S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr) -{ - /* This inline function sets a bit in the bitmap if not already set, and if - * appropriate, its fold, returning the number of bits that actually - * changed from 0 to 1 */ - - U8 stored; - - PERL_ARGS_ASSERT_SET_REGCLASS_BIT; - - if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */ - return 0; - } - - ANYOF_BITMAP_SET(node, value); - stored = 1; - - if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */ - stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr); - } - - return stored; -} - STATIC void S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) { @@ -10932,6 +11079,15 @@ S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len) return; } +/* The names of properties whose definitions are not known at compile time are + * stored in this SV, after a constant heading. So if the length has been + * changed since initialization, then there is a run-time definition. */ +#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len) + +/* This converts the named class defined in regcomp.h to its equivalent class + * number defined in handy.h. */ +#define namedclass_to_classnum(class) ((class) / 2) + /* parse a class specification and produce either an ANYOF node that matches the pattern or perhaps will be optimized into an EXACTish node @@ -10944,12 +11100,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; register UV nextvalue; - register IV prevvalue = OOB_UNICODE; + register UV prevvalue = OOB_UNICODE; register IV range = 0; UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; - IV namedclass; + IV namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; bool need_class = 0; bool allow_full_fold = TRUE; /* Assume wants multi-char folding */ @@ -10957,6 +11113,8 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ + SV* posixes = NULL; /* Code points that match classes like, [:word:], + extended beyond the Latin1 range */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ UV n; @@ -10971,37 +11129,34 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) /* Set if a component of this character class is user-defined; just passed * on to the engine */ - UV has_user_defined_property = 0; - - /* code points this node matches that can't be stored in the bitmap */ - SV* nonbitmap = NULL; - - /* The items that are to match that aren't stored in the bitmap, but are a - * result of things that are stored there. This is the fold closure of - * such a character, either because it has DEPENDS semantics and shouldn't - * be matched unless the target string is utf8, or is a code point that is - * too large for the bit map, as for example, the fold of the MICRO SIGN is - * above 255. This all is solely for performance reasons. By having this - * code know the outside-the-bitmap folds that the bitmapped characters are - * involved with, we don't have to go out to disk to find the list of - * matches, unless the character class includes code points that aren't - * storable in the bit map. That means that a character class with an 's' - * in it, for example, doesn't need to go out to disk to find everything - * that matches. A 2nd list is used so that the 'nonbitmap' list is kept - * empty unless there is something whose fold we don't know about, and will - * have to go out to the disk to find. */ - SV* l1_fold_invlist = NULL; + bool has_user_defined_property = FALSE; + + /* inversion list of code points this node matches only when the target + * string is in UTF-8. (Because is under /d) */ + SV* depends_list = NULL; + + /* inversion list of code points this node matches. For much of the + * function, it includes only those that match regardless of the utf8ness + * of the target string */ + SV* cp_list = NULL; /* List of multi-character folds that are matched by this node */ AV* unicode_alternate = NULL; #ifdef EBCDIC + /* In a range, counts how many 0-2 of the ends of it came from literals, + * not escapes. Thus we can tell if 'A' was input vs \x{C1} */ UV literal_endpoint = 0; #endif - UV stored = 0; /* how many chars stored in the bitmap */ + bool invert = FALSE; /* Is this class to be complemented */ + + /* Is there any thing like \W or [:^digit:] that matches above the legal + * Unicode range? */ + bool runtime_posix_matches_above_Unicode = FALSE; regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in case we need to change the emitted regop to an EXACT. */ const char * orig_parse = RExC_parse; + const I32 orig_size = RExC_size; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCLASS; @@ -11022,8 +11177,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ RExC_naughty++; RExC_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; + invert = TRUE; /* We have decided to not allow multi-char folds in inverted character * classes, due to the confusion that can happen, especially with @@ -11045,7 +11199,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) if (LOC) { ANYOF_FLAGS(ret) |= ANYOF_LOCALE; } - ANYOF_BITMAP_ZERO(ret); listsv = newSVpvs("# comment\n"); initial_listsv_len = SvCUR(listsv); } @@ -11145,9 +11298,9 @@ parseit: n = 1; } if (!SIZE_ONLY) { - SV** invlistsvp; SV* invlist; char* name; + if (UCHARAT(RExC_parse) == '^') { RExC_parse++; n--; @@ -11186,10 +11339,7 @@ parseit: if ( ! swash || ! SvROK(swash) || ! SvTYPE(SvRV(swash)) == SVt_PVHV - || ! (invlistsvp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "INVLIST", FALSE)) - || ! (invlist = *invlistsvp)) + || ! (invlist = _get_swash_invlist(swash))) { if (swash) { SvREFCNT_dec(swash); @@ -11202,11 +11352,15 @@ parseit: Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", (value == 'p' ? '+' : '!'), name); - has_user_defined_property = 1; + has_user_defined_property = TRUE; /* We don't know yet, so have to assume that the * property could match something in the Latin1 range, - * hence something that isn't utf8 */ + * hence something that isn't utf8. Note that this + * would cause things in to match + * inappropriately, except that any \p{}, including + * this one forces Unicode semantics, which means there + * is */ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8; } else { @@ -11214,17 +11368,14 @@ parseit: /* Here, did get the swash and its inversion list. If * the swash is from a user-defined property, then this * whole character class should be regarded as such */ - SV** user_defined_svp = - hv_fetchs(MUTABLE_HV(SvRV(swash)), - "USER_DEFINED", FALSE); - if (user_defined_svp) { - has_user_defined_property - |= SvUV(*user_defined_svp); - } + has_user_defined_property = + _is_swash_user_defined(swash); /* Invert if asking for the complement */ if (value == 'P') { - _invlist_union_complement_2nd(properties, invlist, &properties); + _invlist_union_complement_2nd(properties, + invlist, + &properties); /* The swash can't be used as-is, because we've * inverted things; delay removing it to here after @@ -11271,22 +11422,18 @@ parseit: } break; case 'x': - if (*RExC_parse == '{') { - I32 flags = PERL_SCAN_ALLOW_UNDERSCORES - | PERL_SCAN_DISALLOW_PREFIX; - char * const e = strchr(RExC_parse++, '}'); - if (!e) - vFAIL("Missing right brace on \\x{}"); - - numlen = e - RExC_parse; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); - RExC_parse = e + 1; - } - else { - I32 flags = PERL_SCAN_DISALLOW_PREFIX; - numlen = 2; - value = grok_hex(RExC_parse, &numlen, &flags, NULL); + RExC_parse--; /* function expects to be pointed at the 'x' */ + { + const char* error_msg; + bool valid = grok_bslash_x(RExC_parse, + &value, + &numlen, + &error_msg, + 1); RExC_parse += numlen; + if (! valid) { + vFAIL(error_msg); + } } if (PL_encoding && value < 0x100) goto recode_encoding; @@ -11330,15 +11477,20 @@ parseit: literal_endpoint++; #endif - if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ - - /* What matches in a locale is not known until runtime, so need to - * (one time per class) allocate extra space to pass to regexec. - * The space will contain a bit for each named class that is to be - * matched against. This isn't needed for \p{} and pseudo-classes, - * as they are not affected by locale, and hence are dealt with - * separately */ - if (LOC && namedclass < ANYOF_MAX && ! need_class) { + /* What matches in a locale is not known until runtime. This + * includes what the Posix classes (like \w, [:space:]) match. + * Room must be reserved (one time per class) to store such + * classes, either if Perl is compiled so that locale nodes always + * should have this space, or if there is such class info to be + * stored. The space will contain a bit for each named class that + * is to be matched against. This isn't needed for \p{} and + * pseudo-classes, as they are not affected by locale, and hence + * are dealt with separately */ + if (LOC + && ! need_class + && (ANYOF_LOCALE == ANYOF_CLASS + || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX))) + { need_class = 1; if (SIZE_ONLY) { RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP; @@ -11350,6 +11502,8 @@ parseit: ANYOF_FLAGS(ret) |= ANYOF_CLASS; } + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a * literal, as is the character that began the false range, i.e. * the 'a' in the examples */ @@ -11361,51 +11515,41 @@ parseit: ckWARN4reg(RExC_parse, "False [] range \"%*.*s\"", w, w, rangebegin); - - stored += - set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); - if (prevvalue < 256) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate); - } - else { - nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue); - } + cp_list = add_cp_to_invlist(cp_list, '-'); + cp_list = add_cp_to_invlist(cp_list, prevvalue); } range = 0; /* this was not a true range */ + element_count += 2; /* So counts for three values */ } - if (!SIZE_ONLY) { - - /* Possible truncation here but in some 64-bit environments - * the compiler gets heartburn about switch on 64-bit values. - * A similar issue a little earlier when switching on value. - * --jhi */ + if (! SIZE_ONLY) { switch ((I32)namedclass) { case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */ - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); break; case ANYOF_NALNUMC: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_ALPHA: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); break; case ANYOF_NALPHA: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_ASCII: if (LOC) { ANYOF_CLASS_SET(ret, namedclass); } else { - _invlist_union(properties, PL_ASCII, &properties); + _invlist_union(posixes, PL_ASCII, &posixes); } break; case ANYOF_NASCII: @@ -11413,59 +11557,61 @@ parseit: ANYOF_CLASS_SET(ret, namedclass); } else { - _invlist_union_complement_2nd(properties, - PL_ASCII, &properties); + _invlist_union_complement_2nd(posixes, + PL_ASCII, &posixes); if (DEPENDS_SEMANTICS) { ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; } } break; case ANYOF_BLANK: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); break; case ANYOF_NBLANK: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixBlank, PL_XPosixBlank); break; case ANYOF_CNTRL: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixCntrl, PL_XPosixCntrl); break; case ANYOF_NCNTRL: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixCntrl, PL_XPosixCntrl); break; case ANYOF_DIGIT: /* There are no digits in the Latin1 range outside of * ASCII, so call the macro that doesn't have to resolve * them */ - DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes, PL_PosixDigit, "XPosixDigit", listsv); break; case ANYOF_NDIGIT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_GRAPH: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); break; case ANYOF_NGRAPH: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_HORIZWS: - /* For these, we use the nonbitmap, as /d doesn't make a + /* For these, we use the cp_list, as /d doesn't make a * difference in what these match. There would be problems * if these characters had folds other than themselves, as - * nonbitmap is subject to folding. It turns out that \h + * cp_list is subject to folding. It turns out that \h * is just a synonym for XPosixBlank */ - _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap); + _invlist_union(cp_list, PL_XPosixBlank, &cp_list); break; case ANYOF_NHORIZWS: - _invlist_union_complement_2nd(nonbitmap, - PL_XPosixBlank, &nonbitmap); + _invlist_union_complement_2nd(cp_list, + PL_XPosixBlank, &cp_list); break; case ANYOF_LOWER: case ANYOF_NLOWER: @@ -11488,45 +11634,48 @@ parseit: Xname = "XPosixLower"; } if (namedclass == ANYOF_LOWER) { - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, ascii_source, l1_source, Xname, listsv); } else { DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, - properties, ascii_source, l1_source, Xname, listsv); + posixes, ascii_source, l1_source, Xname, listsv, + runtime_posix_matches_above_Unicode); } break; } case ANYOF_PRINT: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); break; case ANYOF_NPRINT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_PUNCT: - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); break; case ANYOF_NPUNCT: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_PSXSPC: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixSpace, PL_XPosixSpace); break; case ANYOF_NPSXSPC: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixSpace, PL_XPosixSpace); break; case ANYOF_SPACE: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); break; case ANYOF_NSPACE: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PerlSpace, PL_XPerlSpace); break; case ANYOF_UPPER: /* Same as LOWER, above */ @@ -11547,40 +11696,42 @@ parseit: Xname = "XPosixUpper"; } if (namedclass == ANYOF_UPPER) { - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, ascii_source, l1_source, Xname, listsv); } else { DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, - properties, ascii_source, l1_source, Xname, listsv); + posixes, ascii_source, l1_source, Xname, listsv, + runtime_posix_matches_above_Unicode); } break; } case ANYOF_ALNUM: /* Really is 'Word' */ - DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, + DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); break; case ANYOF_NALNUM: - DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties, - PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv); + DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes, + PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv, + runtime_posix_matches_above_Unicode); break; case ANYOF_VERTWS: - /* For these, we use the nonbitmap, as /d doesn't make a + /* For these, we use the cp_list, as /d doesn't make a * difference in what these match. There would be problems * if these characters had folds other than themselves, as - * nonbitmap is subject to folding */ - _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap); + * cp_list is subject to folding */ + _invlist_union(cp_list, PL_VertSpace, &cp_list); break; case ANYOF_NVERTWS: - _invlist_union_complement_2nd(nonbitmap, - PL_VertSpace, &nonbitmap); + _invlist_union_complement_2nd(cp_list, + PL_VertSpace, &cp_list); break; case ANYOF_XDIGIT: - DO_POSIX(ret, namedclass, properties, + DO_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; case ANYOF_NXDIGIT: - DO_N_POSIX(ret, namedclass, properties, + DO_N_POSIX(ret, namedclass, posixes, PL_PosixXDigit, PL_XPosixXDigit); break; case ANYOF_MAX: @@ -11591,19 +11742,19 @@ parseit: break; } - continue; + continue; /* Go get next character */ } } /* end of namedclass \blah */ if (range) { - if (prevvalue > (IV)value) /* b-a */ { + if (prevvalue > value) /* b-a */ { const int w = RExC_parse - rangebegin; Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin); range = 0; /* not a valid range */ } } else { - prevvalue = value; /* save the beginning of the range */ + prevvalue = value; /* save the beginning of the potential range */ if (RExC_parse+1 < RExC_end && *RExC_parse == '-' && RExC_parse[1] != ']') @@ -11620,135 +11771,452 @@ parseit: "False [] range \"%*.*s\"", w, w, rangebegin); } - if (!SIZE_ONLY) - stored += - set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate); + if (!SIZE_ONLY) { + cp_list = add_cp_to_invlist(cp_list, '-'); + element_count++; + } } else range = 1; /* yeah, it's a range! */ continue; /* but do it the next time */ } } + /* Here, is the beginning of the range, if any; or + * if not */ + /* non-Latin1 code point implies unicode semantics. Must be set in * pass1 so is there for the whole of pass 2 */ if (value > 255) { RExC_uni_semantics = 1; } - /* now is the next time */ + /* Ready to process either the single value, or the completed range */ if (!SIZE_ONLY) { - if (prevvalue < 256) { - const IV ceilvalue = value < 256 ? value : 255; - IV i; -#ifdef EBCDIC - /* In EBCDIC [\x89-\x91] should include - * the \x8e but [i-j] should not. */ - if (literal_endpoint == 2 && - ((isLOWER(prevvalue) && isLOWER(ceilvalue)) || - (isUPPER(prevvalue) && isUPPER(ceilvalue)))) - { - if (isLOWER(prevvalue)) { - for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } else { - for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { - stored += - set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } - } - else -#endif - for (i = prevvalue; i <= ceilvalue; i++) { - stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate); - } - } - if (value > 255) { - const UV prevnatvalue = NATIVE_TO_UNI(prevvalue); - const UV natvalue = NATIVE_TO_UNI(value); - nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue); - } -#ifdef EBCDIC - literal_endpoint = 0; +#ifndef EBCDIC + cp_list = _add_range_to_invlist(cp_list, prevvalue, value); +#else + UV* this_range = _new_invlist(1); + _append_range_to_invlist(this_range, prevvalue, value); + + /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous. + * If this range was specified using something like 'i-j', we want + * to include only the 'i' and the 'j', and not anything in + * between, so exclude non-ASCII, non-alphabetics from it. + * However, if the range was specified with something like + * [\x89-\x91] or [\x89-j], all code points within it should be + * included. literal_endpoint==2 means both ends of the range used + * a literal character, not \x{foo} */ + if (literal_endpoint == 2 + && (prevvalue >= 'a' && value <= 'z') + || (prevvalue >= 'A' && value <= 'Z')) + { + _invlist_intersection(this_range, PL_ASCII, &this_range, ); + _invlist_intersection(this_range, PL_Alpha, &this_range, ); + } + _invlist_union(cp_list, this_range, &cp_list); + literal_endpoint = 0; #endif } range = 0; /* this range (if it was one) is done now */ - } + } /* End of loop through all the text within the brackets */ + + /* If the character class contains only a single element, it may be + * optimizable into another node type which is smaller and runs faster. + * Check if this is the case for this class */ + if (element_count == 1) { + U8 op = END; + U8 arg = 0; + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or + [:digit:] or \p{foo} */ + + /* Certain named classes have equivalents that can appear outside a + * character class, e.g. \w, \H. We use these instead of a + * character class. */ + switch ((I32)namedclass) { + U8 offset; + + /* The first group is for node types that depend on the charset + * modifier to the regex. We first calculate the base node + * type, and if it should be inverted */ + + case ANYOF_NALNUM: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_ALNUM: + op = ALNUM; + goto join_charset_classes; + + case ANYOF_NSPACE: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_SPACE: + op = SPACE; + goto join_charset_classes; + + case ANYOF_NDIGIT: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_DIGIT: + op = DIGIT; + + join_charset_classes: + + /* Now that we have the base node type, we take advantage + * of the enum ordering of the charset modifiers to get the + * exact node type, For example the base SPACE also has + * SPACEL, SPACEU, and SPACEA */ + + offset = get_regex_charset(RExC_flags); + + /* /aa is the same as /a for these */ + if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) { + offset = REGEX_ASCII_RESTRICTED_CHARSET; + } + else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) { + offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */ + } + op += offset; + + /* The number of varieties of each of these is the same, + * hence, so is the delta between the normal and + * complemented nodes */ + if (invert) { + op += NALNUM - ALNUM; + } + break; + /* The second group doesn't depend of the charset modifiers. + * We just have normal and complemented */ + case ANYOF_NHORIZWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_HORIZWS: + is_horizws: + op = (invert) ? NHORIZWS : HORIZWS; + break; + + case ANYOF_NVERTWS: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_VERTWS: + op = (invert) ? NVERTWS : VERTWS; + break; + + case ANYOF_MAX: + break; + + case ANYOF_NBLANK: + invert = ! invert; + /* FALLTHROUGH */ + case ANYOF_BLANK: + if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) { + goto is_horizws; + } + /* FALLTHROUGH */ + default: + /* A generic posix class. All the /a ones can be handled + * by the POSIXA opcode. And all are closed under folding + * in the ASCII range, so FOLD doesn't matter */ + if (AT_LEAST_ASCII_RESTRICTED + || (! LOC && namedclass == ANYOF_ASCII)) + { + /* The odd numbered ones are the complements of the + * next-lower even number one */ + if (namedclass % 2 == 1) { + invert = ! invert; + namedclass--; + } + arg = namedclass_to_classnum(namedclass); + op = (invert) ? NPOSIXA : POSIXA; + } + break; + } + } + else if (value == prevvalue) { + + /* Here, the class consists of just a single code point */ + + if (invert) { + if (! LOC && value == '\n') { + op = REG_ANY; /* Optimize [^\n] */ + } + } + else if (value < 256 || UTF) { + + /* Optimize a single value into an EXACTish node, but not if it + * would require converting the pattern to UTF-8. */ + op = compute_EXACTish(pRExC_state); + } + } /* Otherwise is a range */ + else if (! LOC) { /* locale could vary these */ + if (prevvalue == '0') { + if (value == '9') { + op = (invert) ? NDIGITA : DIGITA; + } + } + } + + /* Here, we have changed away from its initial value iff we found + * an optimization */ + if (op != END) { + + /* Throw away this ANYOF regnode, and emit the calculated one, + * which should correspond to the beginning, not current, state of + * the parse */ + const char * cur_parse = RExC_parse; + RExC_parse = (char *)orig_parse; + if ( SIZE_ONLY) { + if (! LOC) { + + /* To get locale nodes to not use the full ANYOF size would + * require moving the code above that writes the portions + * of it that aren't in other nodes to after this point. + * e.g. ANYOF_CLASS_SET */ + RExC_size = orig_size; + } + } + else { + RExC_emit = (regnode *)orig_emit; + } + + ret = reg_node(pRExC_state, op); + + if (PL_regkind[op] == POSIXD) { + if (! SIZE_ONLY) { + FLAGS(ret) = arg; + } + } + else if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value); + } + + RExC_parse = (char *) cur_parse; + + SvREFCNT_dec(listsv); + return ret; + } + } if (SIZE_ONLY) return ret; /****** !SIZE_ONLY AFTER HERE *********/ - /* If folding and there are code points above 255, we calculate all - * characters that could fold to or from the ones already on the list */ - if (FOLD && nonbitmap) { + /* If folding, we calculate all characters that could fold to or from the + * ones already on the list */ + if (FOLD && cp_list) { UV start, end; /* End points of code point ranges */ SV* fold_intersection = NULL; - /* This is a list of all the characters that participate in folds - * (except marks, etc in multi-char folds */ - if (! PL_utf8_foldable) { - SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0); - PL_utf8_foldable = _swash_to_invlist(swash); - SvREFCNT_dec(swash); - } + /* In the Latin1 range, the characters that can be folded-to or -from + * are precisely the alphabetic characters. If the highest code point + * is within Latin1, we can use the compiled-in list, and not have to + * go out to disk. */ + if (invlist_highest(cp_list) < 256) { + _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection); + } + else { - /* This is a hash that for a particular fold gives all characters - * that are involved in it */ - if (! PL_utf8_foldclosures) { + /* Here, there are non-Latin1 code points, so we will have to go + * fetch the list of all the characters that participate in folds + */ + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } - /* If we were unable to find any folds, then we likely won't be - * able to find the closures. So just create an empty list. - * Folding will effectively be restricted to the non-Unicode rules - * hard-coded into Perl. (This case happens legitimately during - * compilation of Perl itself before the Unicode tables are - * generated) */ - if (invlist_len(PL_utf8_foldable) == 0) { - PL_utf8_foldclosures = newHV(); - } else { - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - - /* This particular string is above \xff in both UTF-8 and - * UTFEBCDIC */ - to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); - } - } + /* This is a hash that for a particular fold gives all characters + * that are involved in it */ + if (! PL_utf8_foldclosures) { + + /* If we were unable to find any folds, then we likely won't be + * able to find the closures. So just create an empty list. + * Folding will effectively be restricted to the non-Unicode + * rules hard-coded into Perl. (This case happens legitimately + * during compilation of Perl itself before the Unicode tables + * are generated) */ + if (invlist_len(PL_utf8_foldable) == 0) { + PL_utf8_foldclosures = newHV(); + } + else { + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + + /* This particular string is above \xff in both UTF-8 + * and UTFEBCDIC */ + to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } + } - /* Only the characters in this class that participate in folds need be - * checked. Get the intersection of this class and all the possible - * characters that are foldable. This can quickly narrow down a large - * class */ - _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection); + /* Only the characters in this class that participate in folds need + * be checked. Get the intersection of this class and all the + * possible characters that are foldable. This can quickly narrow + * down a large class */ + _invlist_intersection(PL_utf8_foldable, cp_list, + &fold_intersection); + } /* Now look at the foldable characters in this class individually */ invlist_iterinit(fold_intersection); while (invlist_iternext(fold_intersection, &start, &end)) { UV j; + /* Locale folding for Latin1 characters is deferred until runtime */ + if (LOC && start < 256) { + start = 256; + } + /* Look at every character in the range */ for (j = start; j <= end; j++) { - /* Get its fold */ U8 foldbuf[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen; - const UV f = - _to_uni_fold_flags(j, foldbuf, &foldlen, - (allow_full_fold) ? FOLD_FLAGS_FULL : 0); + UV f; + + if (j < 256) { + + /* We have the latin1 folding rules hard-coded here so that + * an innocent-looking character class, like /[ks]/i won't + * have to go out to disk to find the possible matches. + * XXX It would be better to generate these via regen, in + * case a new version of the Unicode standard adds new + * mappings, though that is not really likely, and may be + * caught by the default: case of the switch below. */ + + if (PL_fold_latin1[j] != j) { + + /* ASCII is always matched; non-ASCII is matched only + * under Unicode rules */ + if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) { + cp_list = + add_cp_to_invlist(cp_list, PL_fold_latin1[j]); + } + else { + depends_list = + add_cp_to_invlist(depends_list, PL_fold_latin1[j]); + } + } + + if (HAS_NONLATIN1_FOLD_CLOSURE(j) + && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) + { + /* Certain Latin1 characters have matches outside + * Latin1, or are multi-character. To get here, 'j' is + * one of those characters. None of these matches is + * valid for ASCII characters under /aa, which is why + * the 'if' just above excludes those. The matches + * fall into three categories: + * 1) They are singly folded-to or -from an above 255 + * character, e.g., LATIN SMALL LETTER Y WITH + * DIAERESIS and LATIN CAPITAL LETTER Y WITH + * DIAERESIS; + * 2) They are part of a multi-char fold with another + * latin1 character; only LATIN SMALL LETTER + * SHARP S => "ss" fits this; + * 3) They are part of a multi-char fold with a + * character outside of Latin1, such as various + * ligatures. + * We aren't dealing fully with multi-char folds, except + * we do deal with the pattern containing a character + * that has a multi-char fold (not so much the inverse). + * For types 1) and 3), the matches only happen when the + * target string is utf8; that's not true for 2), and we + * set a flag for it. + * + * The code below adds the single fold closures for 'j' + * to the inversion list. */ + switch (j) { + case 'k': + case 'K': + cp_list = + add_cp_to_invlist(cp_list, KELVIN_SIGN); + break; + case 's': + case 'S': + cp_list = add_cp_to_invlist(cp_list, + LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + cp_list = add_cp_to_invlist(cp_list, + GREEK_CAPITAL_LETTER_MU); + cp_list = add_cp_to_invlist(cp_list, + GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + cp_list = + add_cp_to_invlist(cp_list, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + cp_list = add_cp_to_invlist(cp_list, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + cp_list = add_cp_to_invlist(cp_list, + LATIN_CAPITAL_LETTER_SHARP_S); + + /* Under /a, /d, and /u, this can match the two + * chars "ss" */ + if (! ASCII_FOLD_RESTRICTED) { + add_alternate(&unicode_alternate, + (U8 *) "ss", 2); + + /* And under /u or /a, it can match even if + * the target is not utf8 */ + if (AT_LEAST_UNI_SEMANTICS) { + ANYOF_FLAGS(ret) |= + ANYOF_NONBITMAP_NON_UTF8; + } + } + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character + * folds from code points that require UTF8 to + * express, so they can't match unless the + * target string is in UTF-8, so no action here + * is necessary, as regexec.c properly handles + * the general case for UTF-8 matching */ + break; + default: + /* Use deprecated warning to increase the + * chances of this being output */ + ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); + break; + } + } + continue; + } + + /* Here is an above Latin1 character. We don't have the rules + * hard-coded for it. First, get its fold */ + f = _to_uni_fold_flags(j, foldbuf, &foldlen, + ((allow_full_fold) ? FOLD_FLAGS_FULL : 0) + | ((LOC) + ? FOLD_FLAGS_LOCALE + : (ASCII_FOLD_RESTRICTED) + ? FOLD_FLAGS_NOMIX_ASCII + : 0)); if (foldlen > (STRLEN)UNISKIP(f)) { @@ -11764,54 +12232,25 @@ parseit: /* If any of the folded characters of this are in the * Latin1 range, tell the regex engine that this can - * match a non-utf8 target string. The only multi-byte - * fold whose source is in the Latin1 range (U+00DF) - * applies only when the target string is utf8, or - * under unicode rules */ - if (j > 255 || AT_LEAST_UNI_SEMANTICS) { - while (loc < e) { - - /* Can't mix ascii with non- under /aa */ - if (MORE_ASCII_RESTRICTED - && (isASCII(*loc) != isASCII(j))) - { - goto end_multi_fold; - } - if (UTF8_IS_INVARIANT(*loc) - || UTF8_IS_DOWNGRADEABLE_START(*loc)) - { - /* Can't mix above and below 256 under LOC - */ - if (LOC) { - goto end_multi_fold; - } - ANYOF_FLAGS(ret) - |= ANYOF_NONBITMAP_NON_UTF8; - break; - } - loc += UTF8SKIP(loc); - } - } + * match a non-utf8 target string. */ + while (loc < e) { + if (UTF8_IS_INVARIANT(*loc) + || UTF8_IS_DOWNGRADEABLE_START(*loc)) + { + ANYOF_FLAGS(ret) + |= ANYOF_NONBITMAP_NON_UTF8; + break; + } + loc += UTF8SKIP(loc); + } add_alternate(&unicode_alternate, foldbuf, foldlen); - end_multi_fold: ; - } - - /* This is special-cased, as it is the only letter which - * has both a multi-fold and single-fold in Latin1. All - * the other chars that have single and multi-folds are - * always in utf8, and the utf8 folding algorithm catches - * them */ - if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) { - stored += set_regclass_bit(pRExC_state, - ret, - LATIN_SMALL_LETTER_SHARP_S, - &l1_fold_invlist, &unicode_alternate); } } - else { - /* Single character fold. Add everything in its fold - * closure to the list that this node should match */ + else { + /* Single character fold of above Latin1. Add everything + * in its fold closure to the list that this node should + * match */ SV** listp; /* The fold closures data structure is a hash with the keys @@ -11834,94 +12273,317 @@ parseit: /* /aa doesn't allow folds between ASCII and non-; * /l doesn't allow them between above and below * 256 */ - if ((MORE_ASCII_RESTRICTED - && (isASCII(c) != isASCII(j))) - || (LOC && ((c < 256) != (j < 256)))) + if ((ASCII_FOLD_RESTRICTED + && (isASCII(c) != isASCII(j))) + || (LOC && ((c < 256) != (j < 256)))) { continue; } - if (c < 256 && AT_LEAST_UNI_SEMANTICS) { - stored += set_regclass_bit(pRExC_state, - ret, - (U8) c, - &l1_fold_invlist, &unicode_alternate); - } - /* It may be that the code point is already in - * this range or already in the bitmap, in - * which case we need do nothing */ - else if ((c < start || c > end) - && (c > 255 - || ! ANYOF_BITMAP_TEST(ret, c))) - { - nonbitmap = add_cp_to_invlist(nonbitmap, c); + /* Folds involving non-ascii Latin1 characters + * under /d are added to a separate list */ + if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS) + { + cp_list = add_cp_to_invlist(cp_list, c); + } + else { + depends_list = add_cp_to_invlist(depends_list, c); } } } } - } + } } SvREFCNT_dec(fold_intersection); } - /* Combine the two lists into one. */ - if (l1_fold_invlist) { - if (nonbitmap) { - _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap); - SvREFCNT_dec(l1_fold_invlist); - } - else { - nonbitmap = l1_fold_invlist; - } + /* And combine the result (if any) with any inversion list from posix + * classes. The lists are kept separate up to now because we don't want to + * fold the classes (folding of those is automatically handled by the swash + * fetching code) */ + if (posixes) { + if (! DEPENDS_SEMANTICS) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec(posixes); + } + else { + cp_list = posixes; + } + } + else { + /* Under /d, we put into a separate list the Latin1 things that + * match only when the target string is utf8 */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_Latin1, + &nonascii_but_latin1_properties); + _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII, + &nonascii_but_latin1_properties); + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec(posixes); + } + else { + cp_list = posixes; + } + + if (depends_list) { + _invlist_union(depends_list, nonascii_but_latin1_properties, + &depends_list); + SvREFCNT_dec(nonascii_but_latin1_properties); + } + else { + depends_list = nonascii_but_latin1_properties; + } + } } /* And combine the result (if any) with any inversion list from properties. - * The lists are kept separate up to now because we don't want to fold the - * properties */ + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated + * if a Unicode property is matched against a non-Unicode code point. But, + * we allow user-defined properties to match anything, without any warning, + * and we also suppress the warning if there is a portion of the character + * class that isn't a Unicode property, and which matches above Unicode, \W + * or [\x{110000}] for example. + * (Note that in this case, unlike the Posix one above, there is no + * , because having a Unicode property forces Unicode + * semantics */ if (properties) { - if (nonbitmap) { - _invlist_union(nonbitmap, properties, &nonbitmap); - SvREFCNT_dec(properties); - } - else { - nonbitmap = properties; - } + bool warn_super = ! has_user_defined_property; + if (cp_list) { + + /* If it matters to the final outcome, see if a non-property + * component of the class matches above Unicode. If so, the + * warning gets suppressed. This is true even if just a single + * such code point is specified, as though not strictly correct if + * another such code point is matched against, the fact that they + * are using above-Unicode code points indicates they should know + * the issues involved */ + if (warn_super) { + bool non_prop_matches_above_Unicode = + runtime_posix_matches_above_Unicode + | (invlist_highest(cp_list) > PERL_UNICODE_MAX); + if (invert) { + non_prop_matches_above_Unicode = + ! non_prop_matches_above_Unicode; + } + warn_super = ! non_prop_matches_above_Unicode; + } + + _invlist_union(properties, cp_list, &cp_list); + SvREFCNT_dec(properties); + } + else { + cp_list = properties; + } + + if (warn_super) { + ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER; + } } - /* Here, contains all the code points we can determine at - * compile time that we haven't put into the bitmap. Go through it, and - * for things that belong in the bitmap, put them there, and delete from - * */ - if (nonbitmap) { + /* Here, we have calculated what code points should be in the character + * class. + * + * Now we can see about various optimizations. Fold calculation (which we + * did above) needs to take place before inversion. Otherwise /[^k]/i + * would invert to include K, which under /i would match k, which it + * shouldn't. Therefore we can't invert folded locale now, as it won't be + * folded until runtime */ + + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known + * at compile time. Besides not inverting folded locale now, we can't invert + * if there are things such as \w, which aren't known until runtime */ + if (invert + && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS))) + && ! depends_list + && ! unicode_alternate + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + _invlist_invert(cp_list); - /* Above-ASCII code points in /d have to stay in , as they - * possibly only should match when the target string is UTF-8 */ - UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255; + /* Any swash can't be used as-is, because we've inverted things */ + if (swash) { + SvREFCNT_dec(swash); + swash = NULL; + } + + /* Clear the invert flag since have just done it here */ + invert = FALSE; + } + + /* If we didn't do folding, it's because some information isn't available + * until runtime; set the run-time fold flag for these. (We don't have to + * worry about properties folding, as that is taken care of by the swash + * fetching) */ + if (FOLD && (LOC || unicode_alternate)) + { + ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + } + + /* Some character classes are equivalent to other nodes. Such nodes take + * up less room and generally fewer operations to execute than ANYOF nodes. + * Above, we checked for and optimized into some such equivalents for + * certain common classes that are easy to test. Getting to this point in + * the code means that the class didn't get optimized there. Since this + * code is only executed in Pass 2, it is too late to save space--it has + * been allocated in Pass 1, and currently isn't given back. But turning + * things into an EXACTish node can allow the optimizer to join it to any + * adjacent such nodes. And if the class is equivalent to things like /./, + * expensive run-time swashes can be avoided. Now that we have more + * complete information, we can find things necessarily missed by the + * earlier code. I (khw) am not sure how much to look for here. It would + * be easy, but perhaps too slow, to check any candidates against all the + * node types they could possibly match using _invlistEQ(). */ + + if (cp_list + && ! unicode_alternate + && ! invert + && ! depends_list + && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS) + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + { + UV start, end; + U8 op = END; /* The optimzation node-type */ + const char * cur_parse= RExC_parse; + + invlist_iterinit(cp_list); + if (! invlist_iternext(cp_list, &start, &end)) { + + /* Here, the list is empty. This happens, for example, when a + * Unicode property is the only thing in the character class, and + * it doesn't match anything. (perluniprops.pod notes such + * properties) */ + op = OPFAIL; + } + else if (start == end) { /* The range is a single code point */ + if (! invlist_iternext(cp_list, &start, &end) + + /* Don't do this optimization if it would require changing + * the pattern to UTF-8 */ + && (start < 256 || UTF)) + { + /* Here, the list contains a single code point. Can optimize + * into an EXACT node */ + + value = start; + + if (! FOLD) { + op = EXACT; + } + else if (LOC) { + + /* A locale node under folding with one code point can be + * an EXACTFL, as its fold won't be calculated until + * runtime */ + op = EXACTFL; + } + else { + + /* Here, we are generally folding, but there is only one + * code point to match. If we have to, we use an EXACT + * node, but it would be better for joining with adjacent + * nodes in the optimization pass if we used the same + * EXACTFish node that any such are likely to be. We can + * do this iff the code point doesn't participate in any + * folds. For example, an EXACTF of a colon is the same as + * an EXACT one, since nothing folds to or from a colon. + * In the Latin1 range, being an alpha means that the + * character participates in a fold (except for the + * feminine and masculine ordinals, which I (khw) don't + * think are worrying about optimizing for). */ + if (value < 256) { + if (isALPHA_L1(value)) { + op = EXACT; + } + } + else { + if (! PL_utf8_foldable) { + SV* swash = swash_init("utf8", "_Perl_Any_Folds", + &PL_sv_undef, 1, 0); + PL_utf8_foldable = _get_swash_invlist(swash); + SvREFCNT_dec(swash); + } + if (_invlist_contains_cp(PL_utf8_foldable, value)) { + op = EXACT; + } + } + + /* If we haven't found the node type, above, it means we + * can use the prevailing one */ + if (op == END) { + op = compute_EXACTish(pRExC_state); + } + } + } + } + else if (start == 0) { + if (end == UV_MAX) { + op = SANY; + } + else if (end == '\n' - 1 + && invlist_iternext(cp_list, &start, &end) + && start == '\n' + 1 && end == UV_MAX) + { + op = REG_ANY; + } + } + + if (op != END) { + RExC_parse = (char *)orig_parse; + RExC_emit = (regnode *)orig_emit; + + ret = reg_node(pRExC_state, op); + + RExC_parse = (char *)cur_parse; + + if (PL_regkind[op] == EXACT) { + alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value); + } + + SvREFCNT_dec(listsv); + return ret; + } + } + + /* Here, contains all the code points we can determine at + * compile time that match under all conditions. Go through it, and + * for things that belong in the bitmap, put them there, and delete from + * . While we are at it, see if everything above 255 is in the + * list, and if so, set a flag to speed up execution */ + ANYOF_BITMAP_ZERO(ret); + if (cp_list) { /* This gets set if we actually need to modify things */ bool change_invlist = FALSE; UV start, end; - /* Start looking through */ - invlist_iterinit(nonbitmap); - while (invlist_iternext(nonbitmap, &start, &end)) { + /* Start looking through */ + invlist_iterinit(cp_list); + while (invlist_iternext(cp_list, &start, &end)) { UV high; int i; + if (end == UV_MAX && start <= 256) { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; + } + /* Quit if are above what we should change */ - if (start > max_cp_to_set) { + if (start > 255) { break; } change_invlist = TRUE; /* Set all the bits in the range, up to the max that we are doing */ - high = (end < max_cp_to_set) ? end : max_cp_to_set; + high = (end < 255) ? end : 255; for (i = start; i <= (int) high; i++) { if (! ANYOF_BITMAP_TEST(ret, i)) { ANYOF_BITMAP_SET(ret, i); - stored++; prevvalue = value; value = i; } @@ -11929,249 +12591,33 @@ parseit: } /* Done with loop; remove any code points that are in the bitmap from - * */ + * */ if (change_invlist) { - _invlist_subtract(nonbitmap, - (DEPENDS_SEMANTICS) - ? PL_ASCII - : PL_Latin1, - &nonbitmap); + _invlist_subtract(cp_list, PL_Latin1, &cp_list); } /* If have completely emptied it, remove it completely */ - if (invlist_len(nonbitmap) == 0) { - SvREFCNT_dec(nonbitmap); - nonbitmap = NULL; + if (invlist_len(cp_list) == 0) { + SvREFCNT_dec(cp_list); + cp_list = NULL; } } - /* Here, we have calculated what code points should be in the character - * class. does not overlap the bitmap except possibly in the - * case of DEPENDS rules. - * - * Now we can see about various optimizations. Fold calculation (which we - * did above) needs to take place before inversion. Otherwise /[^k]/i - * would invert to include K, which under /i would match k, which it - * shouldn't. */ - - /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't - * set the FOLD flag yet, so this does optimize those. It doesn't - * optimize locale. Doing so perhaps could be done as long as there is - * nothing like \w in it; some thought also would have to be given to the - * interaction with above 0x100 chars */ - if ((ANYOF_FLAGS(ret) & ANYOF_INVERT) - && ! LOC - && ! unicode_alternate - /* In case of /d, there are some things that should match only when in - * not in the bitmap, i.e., they require UTF8 to match. These are - * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this - * case, they don't require UTF8, so can invert here */ - && (! nonbitmap - || ! DEPENDS_SEMANTICS - || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - && SvCUR(listsv) == initial_listsv_len) - { - int i; - if (! nonbitmap) { - for (i = 0; i < 256; ++i) { - if (ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_CLEAR(ret, i); - } - else { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - /* The inversion means that everything above 255 is matched */ - ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; - } - else { - /* Here, also has things outside the bitmap that may overlap with - * the bitmap. We have to sync them up, so that they get inverted - * in both places. Earlier, we removed all overlaps except in the - * case of /d rules, so no syncing is needed except for this case - */ - SV *remove_list = NULL; - - if (DEPENDS_SEMANTICS) { - UV start, end; - - /* Set the bits that correspond to the ones that aren't in the - * bitmap. Otherwise, when we invert, we'll miss these. - * Earlier, we removed from the nonbitmap all code points - * < 128, so there is no extra work here */ - invlist_iterinit(nonbitmap); - while (invlist_iternext(nonbitmap, &start, &end)) { - if (start > 255) { /* The bit map goes to 255 */ - break; - } - if (end > 255) { - end = 255; - } - for (i = start; i <= (int) end; ++i) { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - } - - /* Now invert both the bitmap and the nonbitmap. Anything in the - * bitmap has to also be removed from the non-bitmap, but again, - * there should not be overlap unless is /d rules. */ - _invlist_invert(nonbitmap); - - /* Any swash can't be used as-is, because we've inverted things */ - if (swash) { - SvREFCNT_dec(swash); - swash = NULL; - } - - for (i = 0; i < 256; ++i) { - if (ANYOF_BITMAP_TEST(ret, i)) { - ANYOF_BITMAP_CLEAR(ret, i); - if (DEPENDS_SEMANTICS) { - if (! remove_list) { - remove_list = _new_invlist(2); - } - remove_list = add_cp_to_invlist(remove_list, i); - } - } - else { - ANYOF_BITMAP_SET(ret, i); - prevvalue = value; - value = i; - } - } - - /* And do the removal */ - if (DEPENDS_SEMANTICS) { - if (remove_list) { - _invlist_subtract(nonbitmap, remove_list, &nonbitmap); - SvREFCNT_dec(remove_list); - } - } - else { - /* There is no overlap for non-/d, so just delete anything - * below 256 */ - _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap); - } - } - - stored = 256 - stored; - - /* Clear the invert flag since have just done it here */ - ANYOF_FLAGS(ret) &= ~ANYOF_INVERT; - } - - /* Folding in the bitmap is taken care of above, but not for locale (for - * which we have to wait to see what folding is in effect at runtime), and - * for some things not in the bitmap (only the upper latin folds in this - * case, as all other single-char folding has been set above). Set - * run-time fold flag for these */ - if (FOLD && (LOC - || (DEPENDS_SEMANTICS - && nonbitmap - && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8)) - || unicode_alternate)) - { - ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD; + if (invert) { + ANYOF_FLAGS(ret) |= ANYOF_INVERT; } - /* A single character class can be "optimized" into an EXACTish node. - * Note that since we don't currently count how many characters there are - * outside the bitmap, we are XXX missing optimization possibilities for - * them. This optimization can't happen unless this is a truly single - * character class, which means that it can't be an inversion into a - * many-character class, and there must be no possibility of there being - * things outside the bitmap. 'stored' (only) for locales doesn't include - * \w, etc, so have to make a special test that they aren't present - * - * Similarly A 2-character class of the very special form like [bB] can be - * optimized into an EXACTFish node, but only for non-locales, and for - * characters which only have the two folds; so things like 'fF' and 'Ii' - * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE - * FI'. */ - if (! nonbitmap - && ! unicode_alternate - && SvCUR(listsv) == initial_listsv_len - && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL)) - && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - || (! ANYOF_CLASS_TEST_ANY_SET(ret))))) - || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE)) - && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)) - /* If the latest code point has a fold whose - * bit is set, it must be the only other one */ - && ((prevvalue = PL_fold_latin1[value]) != (IV)value) - && ANYOF_BITMAP_TEST(ret, prevvalue))))) - { - /* Note that the information needed to decide to do this optimization - * is not currently available until the 2nd pass, and that the actually - * used EXACTish node takes less space than the calculated ANYOF node, - * and hence the amount of space calculated in the first pass is larger - * than actually used, so this optimization doesn't gain us any space. - * But an EXACT node is faster than an ANYOF node, and can be combined - * with any adjacent EXACT nodes later by the optimizer for further - * gains. The speed of executing an EXACTF is similar to an ANYOF - * node, so the optimization advantage comes from the ability to join - * it to adjacent EXACT nodes */ - - const char * cur_parse= RExC_parse; - U8 op; - RExC_emit = (regnode *)orig_emit; - RExC_parse = (char *)orig_parse; - - if (stored == 1) { - - /* A locale node with one point can be folded; all the other cases - * with folding will have two points, since we calculate them above - */ - if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) { - op = EXACTFL; - } - else { - op = EXACT; - } - } - else { /* else 2 chars in the bit map: the folds of each other */ - - /* Use the folded value, which for the cases where we get here, - * is just the lower case of the current one (which may resolve to - * itself, or to the other one */ - value = toLOWER_LATIN1(value); - - /* To join adjacent nodes, they must be the exact EXACTish type. - * Try to use the most likely type, by using EXACTFA if possible, - * then EXACTFU if the regex calls for it, or is required because - * the character is non-ASCII. (If is ASCII, its fold is - * also ASCII for the cases where we get here.) */ - if (MORE_ASCII_RESTRICTED && isASCII(value)) { - op = EXACTFA; - } - else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) { - op = EXACTFU; - } - else { /* Otherwise, more likely to be EXACTF type */ - op = EXACTF; - } - } - - ret = reg_node(pRExC_state, op); - RExC_parse = (char *)cur_parse; - if (UTF && ! NATIVE_IS_INVARIANT(value)) { - *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value); - *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value); - STR_LEN(ret)= 2; - RExC_emit += STR_SZ(2); + /* Here, the bitmap has been populated with all the Latin1 code points that + * always match. Can now add to the overall list those that match only + * when the target string is UTF-8 (). */ + if (depends_list) { + if (cp_list) { + _invlist_union(cp_list, depends_list, &cp_list); + SvREFCNT_dec(depends_list); } else { - *STRING(ret)= (char)value; - STR_LEN(ret)= 1; - RExC_emit += STR_SZ(1); + cp_list = depends_list; } - SvREFCNT_dec(listsv); - return ret; } /* If there is a swash and more than one element, we can't use the swash in @@ -12180,8 +12626,9 @@ parseit: SvREFCNT_dec(swash); swash = NULL; } - if (! nonbitmap - && SvCUR(listsv) == initial_listsv_len + + if (! cp_list + && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION && ! unicode_alternate) { ARG_SET(ret, ANYOF_NONBITMAP_EMPTY); @@ -12197,24 +12644,24 @@ parseit: * swash is stored there now. * av[2] stores the multicharacter foldings, used later in * regexec.c:S_reginclass(). - * av[3] stores the nonbitmap inversion list for use in addition or - * instead of av[0]; not used if av[1] isn't NULL + * av[3] stores the cp_list inversion list for use in addition or + * instead of av[0]; used only if av[1] is NULL * av[4] is set if any component of the class is from a user-defined - * property; not used if av[1] isn't NULL */ + * property; used only if av[1] is NULL */ AV * const av = newAV(); SV *rv; - av_store(av, 0, (SvCUR(listsv) == initial_listsv_len) - ? &PL_sv_undef - : listsv); + av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + ? listsv + : &PL_sv_undef); if (swash) { av_store(av, 1, swash); - SvREFCNT_dec(nonbitmap); + SvREFCNT_dec(cp_list); } else { av_store(av, 1, NULL); - if (nonbitmap) { - av_store(av, 3, nonbitmap); + if (cp_list) { + av_store(av, 3, cp_list); av_store(av, 4, newSVuv(has_user_defined_property)); } } @@ -12237,6 +12684,7 @@ parseit: } return ret; } +#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION /* reg_skipcomment() @@ -12830,6 +13278,40 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) #ifdef DEBUGGING dVAR; register int k; + + /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ + static const char * const anyofs[] = { + "\\w", + "\\W", + "\\s", + "\\S", + "\\d", + "\\D", + "[:alnum:]", + "[:^alnum:]", + "[:alpha:]", + "[:^alpha:]", + "[:ascii:]", + "[:^ascii:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:graph:]", + "[:^graph:]", + "[:lower:]", + "[:^lower:]", + "[:print:]", + "[:^print:]", + "[:punct:]", + "[:^punct:]", + "[:upper:]", + "[:^upper:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" + }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -12950,39 +13432,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const U8 flags = ANYOF_FLAGS(o); int do_sep = 0; - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { - "\\w", - "\\W", - "\\s", - "\\S", - "\\d", - "\\D", - "[:alnum:]", - "[:^alnum:]", - "[:alpha:]", - "[:^alpha:]", - "[:ascii:]", - "[:^ascii:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:graph:]", - "[:^graph:]", - "[:lower:]", - "[:^lower:]", - "[:print:]", - "[:^print:]", - "[:punct:]", - "[:^punct:]", - "[:upper:]", - "[:^upper:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:space:]", - "[:^space:]", - "[:blank:]", - "[:^blank:]" - }; if (flags & ANYOF_LOCALE) sv_catpvs(sv, "{loc}"); @@ -13120,6 +13569,15 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } + else if (k == POSIXD) { + U8 index = FLAGS(o) * 2; + if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) { + Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index); + } + else { + sv_catpv(sv, anyofs[index]); + } + } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); #else @@ -13230,7 +13688,6 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) { struct regexp *ret; struct regexp *const r = (struct regexp *)SvANY(rx); - register const I32 npar = r->nparens+1; PERL_ARGS_ASSERT_REG_TEMP_COPY; @@ -13250,8 +13707,11 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) SvLEN_set(ret_x, 0); SvSTASH_set(ret_x, NULL); SvMAGIC_set(ret_x, NULL); - Newx(ret->offs, npar, regexp_paren_pair); - Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if (r->offs) { + const I32 npar = r->nparens+1; + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + } if (r->substrs) { Newx(ret->substrs, 1, struct reg_substr_data); StructCopy(r->substrs, ret->substrs, struct reg_substr_data); @@ -13321,9 +13781,6 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->data) { int n = ri->data->count; - PAD* new_comppad = NULL; - PAD* old_comppad; - PADOFFSET refcnt; while (--n >= 0) { /* If you add a ->what type here, update the comment in regcomp.h */ @@ -13338,29 +13795,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) case 'f': Safefree(ri->data->data[n]); break; - case 'p': - new_comppad = MUTABLE_AV(ri->data->data[n]); - break; - case 'o': - if (new_comppad == NULL) - Perl_croak(aTHX_ "panic: pregfree comppad"); - PAD_SAVE_LOCAL(old_comppad, - /* Watch out for global destruction's random ordering. */ - (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL - ); - OP_REFCNT_LOCK; - refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]); - OP_REFCNT_UNLOCK; - if (!refcnt) - op_free((OP_4tree*)ri->data->data[n]); - - PAD_RESTORE_LOCAL(old_comppad); - SvREFCNT_dec(MUTABLE_SV(new_comppad)); - new_comppad = NULL; - break; case 'l': case 'L': - case 'n': break; case 'T': { /* Aho Corasick add-on structure for a trie node. @@ -13579,13 +14015,11 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) for (i = 0; i < count; i++) { d->what[i] = ri->data->what[i]; switch (d->what[i]) { - /* legal options are one of: sSfpontTua - see also regcomp.h and pregfree() */ + /* see also regcomp.h and regfree_internal() */ case 'a': /* actually an AV, but the dup function is identical. */ case 'r': case 's': case 'S': - case 'p': /* actually an AV, but the dup function is identical. */ case 'u': /* actually an HV, but the dup function is identical. */ d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param); break; @@ -13596,13 +14030,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) struct regnode_charclass_class); reti->regstclass = (regnode*)d->data[i]; break; - case 'o': - /* Compiled op trees are readonly and in shared memory, - and can thus be shared without duplication. */ - OP_REFCNT_LOCK; - d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]); - OP_REFCNT_UNLOCK; - break; case 'T': /* Trie stclasses are readonly and can thus be shared * without duplication. We free the stclass in pregfree @@ -13617,7 +14044,6 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) /* Fall through */ case 'l': case 'L': - case 'n': d->data[i] = ri->data->data[i]; break; default: @@ -13727,8 +14153,6 @@ Perl_save_re_context(pTHX) Copy(&PL_reg_state, state, 1, struct re_save_state); - PL_reg_start_tmp = 0; - PL_reg_start_tmpl = 0; PL_reg_oldsaved = NULL; PL_reg_oldsavedlen = 0; PL_reg_maxiter = 0;