X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/99807a43ff8067f13ddb17de59a7e1331a0051ea..f2979eac0a03fcba1cd431ef3d46f697176ebcc8:/regcomp.c?ds=sidebyside diff --git a/regcomp.c b/regcomp.c index 4556d1a..b62c30d 100644 --- a/regcomp.c +++ b/regcomp.c @@ -184,6 +184,7 @@ struct RExC_state_t { scan_frame *frame_head; scan_frame *frame_last; U32 frame_count; + U32 strict; #ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -253,6 +254,7 @@ struct RExC_state_t { #define RExC_frame_head (pRExC_state->frame_head) #define RExC_frame_last (pRExC_state->frame_last) #define RExC_frame_count (pRExC_state->frame_count) +#define RExC_strict (pRExC_state->strict) /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set * a flag to disable back-off on the fixed/floating substrings - if it's @@ -702,12 +704,6 @@ static const scan_data_t zero_scan_data = a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \ } STMT_END - -/* Allow for side effects in s */ -#define REGC(c,s) STMT_START { \ - if (!SIZE_ONLY) *(s) = (c); else (void)(s); \ -} STMT_END - /* Macros for recording node offsets. 20001227 mjd@plover.com * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in * element 2*n-1 of the array. Element #2n holds the byte length node #n. @@ -1044,13 +1040,13 @@ S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc) ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP); ssc_anything(ssc); - /* If any portion of the regex is to operate under locale rules, - * initialization includes it. The reason this isn't done for all regexes - * is that the optimizer was written under the assumption that locale was - * all-or-nothing. Given the complexity and lack of documentation in the - * optimizer, and that there are inadequate test cases for locale, many - * parts of it may not work properly, it is safest to avoid locale unless - * necessary. */ + /* If any portion of the regex is to operate under locale rules that aren't + * fully known at compile time, initialization includes it. The reason + * this isn't done for all regexes is that the optimizer was written under + * the assumption that locale was all-or-nothing. Given the complexity and + * lack of documentation in the optimizer, and that there are inadequate + * test cases for locale, many parts of it may not work properly, it is + * safest to avoid locale unless necessary. */ if (RExC_contains_locale) { ANYOF_POSIXL_SETALL(ssc); } @@ -1879,7 +1875,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -2143,10 +2139,11 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, #endif switch (flags) { - case EXACT: break; + case EXACT: case EXACTL: break; case EXACTFA: case EXACTFU_SS: - case EXACTFU: folder = PL_fold_latin1; break; + case EXACTFU: + case EXACTFLU8: folder = PL_fold_latin1; break; case EXACTF: folder = PL_fold; break; default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] ); } @@ -2157,7 +2154,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->wordcount = word_count; RExC_rxi->data->data[ data_slot ] = (void*)trie; trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) ); - if (flags == EXACT) + if (flags == EXACT || flags == EXACTL) trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 ); trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc( trie->wordcount+1, sizeof(reg_trie_wordinfo)); @@ -3201,7 +3198,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour StructCopy(source,op,struct regnode_charclass); stclass = (regnode *)op; } - OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ + OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -3500,7 +3497,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, * this final joining, sequences could have been split over boundaries, and * hence missed). The sequences only happen in folding, hence for any * non-EXACT EXACTish node */ - if (OP(scan) != EXACT) { + if (OP(scan) != EXACT && OP(scan) != EXACTL) { U8* s0 = (U8*) STRING(scan); U8* s = s0; U8* s_end = s0 + STR_LEN(scan); @@ -4148,14 +4145,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, EXACTFU | EXACTFU EXACTFU_SS | EXACTFU EXACTFA | EXACTFA + EXACTL | EXACTL + EXACTFLU8 | EXACTFLU8 */ -#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \ - ( EXACT == (X) ) ? EXACT : \ - ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \ - ( EXACTFA == (X) ) ? EXACTFA : \ - 0 ) +#define TRIE_TYPE(X) ( ( NOTHING == (X) ) \ + ? NOTHING \ + : ( EXACT == (X) ) \ + ? EXACT \ + : ( EXACTFU == (X) || EXACTFU_SS == (X) ) \ + ? EXACTFU \ + : ( EXACTFA == (X) ) \ + ? EXACTFA \ + : ( EXACTL == (X) ) \ + ? EXACTL \ + : ( EXACTFLU8 == (X) ) \ + ? EXACTFLU8 \ + : 0 ) /* dont use tail as the end marker for this traverse */ for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) { @@ -4471,7 +4478,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, continue; } } - else if (OP(scan) == EXACT) { + else if (OP(scan) == EXACT || OP(scan) == EXACTL) { SSize_t l = STR_LEN(scan); UV uc; if (UTF) { @@ -4589,7 +4596,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, case PLUS: if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) { next = NEXTOPER(scan); - if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) { + if (OP(next) == EXACT + || OP(next) == EXACTL + || (flags & SCF_DO_STCLASS)) + { mincount = 1; maxcount = REG_INFTY; next = regnext(scan); @@ -5008,7 +5018,8 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", flags &= ~SCF_DO_STCLASS; } min++; - delta++; /* Because of the 2 char string cr-lf */ + if (delta != SSize_t_MAX) + delta++; /* Because of the 2 char string cr-lf */ if (flags & SCF_DO_SUBSTR) { /* Cannot expect anything... */ scan_commit(pRExC_state, data, minlenp, is_inf); @@ -5070,6 +5081,7 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", } break; + case ANYOFL: case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, @@ -5181,32 +5193,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { - if ( OP(scan) == UNLESSM && - scan->flags == 0 && - OP(NEXTOPER(NEXTOPER(scan))) == NOTHING && - OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED - ) { - regnode *opt; - regnode *upto= regnext(scan); - DEBUG_PARSE_r({ - DEBUG_STUDYDATA("OPFAIL",data,depth); - - /*DEBUG_PARSE_MSG("opfail");*/ - regprop(RExC_rx, RExC_mysv, upto, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, - "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n", - SvPV_nolen_const(RExC_mysv), - (IV)REG_NODE_NUM(upto), - (IV)(upto - scan) - ); - }); - OP(scan) = OPFAIL; - NEXT_OFF(scan) = upto - scan; - for (opt= scan + 1; opt < upto ; opt++) - OP(opt) = OPTIMIZED; - scan= upto; - continue; - } if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY || OP(scan) == UNLESSM ) { @@ -5632,8 +5618,11 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", { SSize_t final_minlen= min < stopmin ? min : stopmin; - if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) && (RExC_maxlen < final_minlen + delta)) { - RExC_maxlen = final_minlen + delta; + if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) { + if (final_minlen > SSize_t_MAX - delta) + RExC_maxlen = SSize_t_MAX; + else if (RExC_maxlen < final_minlen + delta) + RExC_maxlen = final_minlen + delta; } return final_minlen; } @@ -6545,6 +6534,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_uni_semantics = 0; RExC_contains_locale = 0; RExC_contains_i = 0; + RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); pRExC_state->runtime_code_qr = NULL; RExC_frame_head= NULL; RExC_frame_last= NULL; @@ -6655,10 +6645,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_recurse_count = 0; pRExC_state->code_index = 0; -#if 0 /* REGC() is (currently) a NOP at the first pass. - * Clever compilers notice this and complain. --jhi */ - REGC((U8)REG_MAGIC, (char*)RExC_emit); -#endif DEBUG_PARSE_r( PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); RExC_lastnum=0; @@ -6770,7 +6756,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, == REG_RUN_ON_COMMENT_SEEN); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); - const char *fptr = STD_PAT_MODS; /*"msix"*/ + const char *fptr = STD_PAT_MODS; /*"msixn"*/ char *p; /* Allocate for the worst case, which is all the std flags are turned * on. If more precision is desired, we could do a population count of @@ -6872,7 +6858,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_emit_bound = ri->program + RExC_size + 1; pRExC_state->code_index = 0; - REGC((U8)REG_MAGIC, (char*) RExC_emit++); + *((char*) RExC_emit++) = (char) REG_MAGIC; if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); @@ -6982,7 +6968,7 @@ reStudy: DEBUG_PEEP("first:",first,0); /* Ignore EXACT as we deal with it later. */ if (PL_regkind[OP(first)] == EXACT) { - if (OP(first) == EXACT) + if (OP(first) == EXACT || OP(first) == EXACTL) NOOP; /* Empty, get anchored substr later. */ else ri->regstclass = first; @@ -7332,7 +7318,7 @@ reStudy: && OP(regnext(first)) == END) r->extflags |= RXf_WHITE; else if ( r->extflags & RXf_SPLIT - && fop == EXACT + && (fop == EXACT || fop == EXACTL) && STR_LEN(first) == 1 && *(STRING(first)) == ' ' && OP(regnext(first)) == END ) @@ -9587,7 +9573,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) and must be globally applied -- japhy */ switch (*RExC_parse) { - /* Code for the imsx flags */ + /* Code for the imsxn flags */ CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp, x_mod_count); case LOCALE_PAT_MOD: @@ -10076,6 +10062,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case '!': /* (?!...) */ RExC_seen_zerolen++; + /* check if we're really just a "FAIL" assertion */ + --RExC_parse; + nextchar(pRExC_state); if (*RExC_parse == ')') { ret=reg_node(pRExC_state, OPFAIL); nextchar(pRExC_state); @@ -10443,7 +10432,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) goto parse_rest; } /* end switch */ } - else { /* (...) */ + else if (!(RExC_flags & RXf_PMf_NOCAPTURE)) { /* (...) */ capturing_parens: parno = RExC_npar; RExC_npar++; @@ -10465,6 +10454,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1); /* MJD */ Set_Node_Offset(ret, RExC_parse); /* MJD */ is_open = 1; + } else { + ret = NULL; } } else /* ! paren */ @@ -11066,8 +11057,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, on success. If is non-null, it means the caller can accept an input sequence - consisting of a just a single code point; <*valuep> is set to the value - of the only or first code point in the input. + consisting of just a single code point; <*valuep> is set to the value of the + only or first code point in the input. If is non-null, it means the caller can accept an input sequence consisting of one or more code points; <*substitute_parse> is a @@ -11147,17 +11138,18 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, RExC_parse++; /* Skip past the '{' */ - if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ + if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ || ! (endbrace == RExC_parse /* nothing between the {} */ - || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below - */ - && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) - */ + || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */ + && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better + error msg) */ { if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); } + RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ + if (endbrace == RExC_parse) { /* empty: \N{} */ if (node_p) { *node_p = reg_node(pRExC_state,NOTHING); @@ -11169,7 +11161,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, return 0; } - RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */ RExC_parse += 2; /* Skip past the 'U+' */ endchar = RExC_parse + strcspn(RExC_parse, ".}"); @@ -11179,7 +11170,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, has_multiple_chars = (endchar < endbrace); /* We get the first code point if we want it, and either there is only one, - * or we can accept both cases of one and more than one */ + * or we can accept both cases of one and there is more than one */ if (valuep && (substitute_parse || ! has_multiple_chars)) { STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse); I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES @@ -11228,7 +11219,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, } { - /* What is done here is to convert this to a sub-pattern of the form * \x{char1}\x{char2}... * and then either return it in <*substitute_parse> if non-null; or @@ -11346,7 +11336,9 @@ S_compute_EXACTish(RExC_state_t *pRExC_state) PERL_ARGS_ASSERT_COMPUTE_EXACTISH; if (! FOLD) { - return EXACT; + return (LOC) + ? EXACTL + : EXACT; } op = get_regex_charset(RExC_flags); @@ -11444,7 +11436,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, for those. */ && ! _invlist_contains_cp(PL_utf8_foldable, code_point)) { - OP(node) = EXACT; + OP(node) = (LOC) + ? EXACTL + : EXACT; } } else if (code_point <= MAX_UTF8_TWO_BYTE) { @@ -11657,6 +11651,7 @@ tryagain: FALSE, /* means parse the whole char class */ TRUE, /* allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ + RExC_strict, NULL); if (*RExC_parse != ']') { RExC_parse = oregcomp_parse; @@ -11892,6 +11887,7 @@ tryagain: FALSE, /* don't silence non-portable warnings. It would be a bug if these returned non-portables */ + RExC_strict, NULL); /* regclass() can only return RESTART_UTF8 if multi-char folds are allowed. */ @@ -12266,7 +12262,7 @@ tryagain: &result, &error_msg, PASS2, /* out warnings */ - FALSE, /* not strict */ + RExC_strict, TRUE, /* Output warnings for non- portables */ @@ -12295,8 +12291,8 @@ tryagain: &result, &error_msg, PASS2, /* out warnings */ - FALSE, /* not strict */ - TRUE, /* Output warnings + RExC_strict, + TRUE, /* Silence warnings for non- portables */ UTF); @@ -12329,8 +12325,8 @@ tryagain: * from \1 - \9 is a backreference, any multi-digit * escape which does not start with 0 and which when * evaluated as decimal could refer to an already - * parsed capture buffer is a backslash. Anything else - * is octal. + * parsed capture buffer is a back reference. Anything + * else is octal. * * Note this implies that \118 could be interpreted as * 118 OR as "\11" . "8" depending on whether there @@ -12438,39 +12434,64 @@ tryagain: goto loopdone; } - if (! FOLD /* The simple case, just append the literal */ - || (LOC /* Also don't fold for tricky chars under /l */ - && is_PROBLEMATIC_LOCALE_FOLD_cp(ender))) - { - if (UTF) { - const STRLEN unilen = reguni(pRExC_state, ender, s); - if (unilen > 0) { - s += unilen; - len += unilen; - } + if (! FOLD) { /* The simple case, just append the literal */ - /* The loop increments each time, as all but this - * path (and one other) through it add a single byte to - * the EXACTish node. But this one has changed len to - * be the correct final value, so subtract one to - * cancel out the increment that follows */ - len--; - } - else { - REGC((char)ender, s++); - } - - /* Can get here if folding only if is one of the /l - * characters whose fold depends on the locale. The - * occurrence of any of these indicate that we can't - * simplify things */ - if (FOLD) { - maybe_exact = FALSE; - maybe_exactfu = FALSE; + /* In the sizing pass, we need only the size of the + * character we are appending, hence we can delay getting + * its representation until PASS2. */ + if (SIZE_ONLY) { + if (UTF) { + const STRLEN unilen = UNISKIP(ender); + s += unilen; + + /* We have to subtract 1 just below (and again in + * the corresponding PASS2 code) because the loop + * increments each time, as all but this path + * (and one other) through it add a single byte to + * the EXACTish node. But these paths would change + * len to be the correct final value, so cancel out + * the increment that follows */ + len += unilen - 1; + } + else { + s++; + } + } else { /* PASS2 */ + not_fold_common: + if (UTF) { + U8 * new_s = uvchr_to_utf8((U8*)s, ender); + len += (char *) new_s - s - 1; + s = (char *) new_s; + } + else { + *(s++) = (char) ender; + } } } - else /* FOLD */ - if (! ( UTF + else if (LOC && is_PROBLEMATIC_LOCALE_FOLD_cp(ender)) { + + /* Here are folding under /l, and the code point is + * problematic. First, we know we can't simplify things */ + maybe_exact = FALSE; + maybe_exactfu = FALSE; + + /* A problematic code point in this context means that its + * fold isn't known until runtime, so we can't fold it now. + * (The non-problematic code points are the above-Latin1 + * ones that fold to also all above-Latin1. Their folds + * don't vary no matter what the locale is.) But here we + * have characters whose fold depends on the locale. + * Unlike the non-folding case above, we have to keep track + * of these in the sizing pass, so that we can make sure we + * don't split too-long nodes in the middle of a potential + * multi-char fold. And unlike the regular fold case + * handled in the else clauses below, we don't actually + * fold and don't have special cases to consider. What we + * do for both passes is the PASS2 code for non-folding */ + goto not_fold_common; + } + else /* A regular FOLD code point */ + if (! ( UTF /* See comments for join_exact() as to why we fold this * non-UTF at compile time */ || (node_type == EXACTFU @@ -12479,7 +12500,7 @@ tryagain: /* Here, are folding and are not UTF-8 encoded; therefore * the character must be in the range 0-255, and is not /l * (Not /l because we already handled these under /l in - * is_PROBLEMATIC_LOCALE_FOLD_cp */ + * is_PROBLEMATIC_LOCALE_FOLD_cp) */ if (IS_IN_SOME_FOLD_L1(ender)) { maybe_exact = FALSE; @@ -12508,8 +12529,7 @@ tryagain: * unfolded, and we have to calculate how many EXACTish * nodes it will take; and we may run out of room in a node * in the middle of a potential multi-char fold, and have - * to back off accordingly. (Hence we can't use REGC for - * the simple case just below.) */ + * to back off accordingly. */ UV folded; if (isASCII_uni(ender)) { @@ -12745,10 +12765,14 @@ tryagain: * differently depending on UTF8ness of the target string * (for /u), or depending on locale for /l */ if (maybe_exact) { - OP(ret) = EXACT; + OP(ret) = (LOC) + ? EXACTL + : EXACT; } else if (maybe_exactfu) { - OP(ret) = EXACTFU; + OP(ret) = (LOC) + ? EXACTFLU8 + : EXACTFU; } } alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender, @@ -13159,7 +13183,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, posix class */ FALSE, /* don't allow multi-char folds */ TRUE, /* silence non-portable warnings. */ - ¤t)) + TRUE, /* strict */ + ¤t + )) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", (UV) *flagp); @@ -13326,7 +13352,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, TRUE, /* means parse just the next thing */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ - ¤t)) + TRUE, /* strict */ + ¤t + )) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", (UV) *flagp); /* regclass() will return with parsing just the \ sequence, @@ -13349,7 +13377,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, only if not a posix class */ FALSE, /* don't allow multi-char folds */ FALSE, /* don't silence non-portable warnings. */ - ¤t)) + TRUE, /* strict */ + ¤t + )) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf"", (UV) *flagp); /* function call leaves parse pointing to the ']', except if we @@ -13550,7 +13580,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, TRUE, /* silence non-portable warnings. The above may very well have generated non-portable code points, but they're valid on this machine */ - NULL); + FALSE, /* similarly, no need for strict */ + NULL + ); if (!node) FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, PTR2UV(flagp)); @@ -13686,7 +13718,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool silence_non_portable, /* Don't output warnings about too large characters */ - SV** ret_invlist) /* Return an inversion list, not a node */ + const bool strict, + SV** ret_invlist /* Return an inversion list, not a node */ + ) { /* parse a bracketed class specification. Most of these will produce an * ANYOF node; but something like [a] will produce an EXACT node; [aA], an @@ -13735,6 +13769,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, separate for a while from the non-complemented versions because of complications with /d matching */ + SV* simple_posixes = NULL; /* But under some conditions, the classes can be + treated more simply than the general case, + leading to less compilation and execution + work */ UV element_count = 0; /* Number of distinct elements in the class. Optimizations may be possible if this is tiny */ AV * multi_char_matches = NULL; /* Code points that fold to more than one @@ -13743,7 +13781,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, char * stop_ptr = RExC_end; /* where to stop parsing */ const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white space? */ - const bool strict = cBOOL(ret_invlist); /* Apply strict parsing rules? */ /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the @@ -13802,7 +13839,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, DEBUG_PARSE("clas"); /* Assume we are going to generate an ANYOF node. */ - ret = reganode(pRExC_state, ANYOF, 0); + ret = reganode(pRExC_state, + (LOC) + ? ANYOFL + : ANYOF, + 0); if (SIZE_ONLY) { RExC_size += ANYOF_SKIP; @@ -13882,6 +13923,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (!range) { rangebegin = RExC_parse; element_count++; +#ifdef EBCDIC + literal_endpoint = 0; +#endif } if (UTF) { value = utf8n_to_uvchr((U8*)RExC_parse, @@ -14418,15 +14462,33 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &cp_list); } } - else { /* Garden variety class. If is NASCII, NDIGIT, ... + else if (UNI_SEMANTICS + || classnum == _CC_ASCII + || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT + || classnum == _CC_XDIGIT))) + { + /* We usually have to worry about /d and /a affecting what + * POSIX classes match, with special code needed for /d + * because we won't know until runtime what all matches. + * But there is no extra work needed under /u, and + * [:ascii:] is unaffected by /a and /d; and :digit: and + * :xdigit: don't have runtime differences under /d. So we + * can special case these, and avoid some extra work below, + * and at runtime. */ + _invlist_union_maybe_complement_2nd( + simple_posixes, + PL_XPosix_ptrs[classnum], + namedclass % 2 != 0, + &simple_posixes); + } + else { /* Garden variety class. If is NUPPER, NALPHA, ... complement and use nposixes */ SV** posixes_ptr = namedclass % 2 == 0 ? &posixes : &nposixes; - SV** source_ptr = &PL_XPosix_ptrs[classnum]; _invlist_union_maybe_complement_2nd( *posixes_ptr, - *source_ptr, + PL_XPosix_ptrs[classnum], namedclass % 2 != 0, posixes_ptr); } @@ -14468,7 +14530,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, vFAIL2utf8f( "Invalid [] range \"%"UTF8f"\"", UTF8fARG(UTF, w, rangebegin)); - range = 0; /* not a valid range */ + NOT_REACHED; /* NOT REACHED */ } } else { @@ -14833,24 +14895,29 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, op = POSIXA; } } - else if (prevvalue == 'A') { - if (value == 'Z' + else if (AT_LEAST_ASCII_RESTRICTED || ! FOLD) { + /* We can optimize A-Z or a-z, but not if they could match + * something like the KELVIN SIGN under /i (/a means they + * can't) */ + if (prevvalue == 'A') { + if (value == 'Z' #ifdef EBCDIC - && literal_endpoint == 2 + && literal_endpoint == 2 #endif - ) { - arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; - op = POSIXA; + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } } - } - else if (prevvalue == 'a') { - if (value == 'z' + else if (prevvalue == 'a') { + if (value == 'z' #ifdef EBCDIC - && literal_endpoint == 2 + && literal_endpoint == 2 #endif - ) { - arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; - op = POSIXA; + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } } } } @@ -14904,6 +14971,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec(posixes); SvREFCNT_dec(nposixes); + SvREFCNT_dec(simple_posixes); SvREFCNT_dec(cp_list); SvREFCNT_dec(cp_foldable_list); return ret; @@ -15061,6 +15129,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ + if (simple_posixes) { + _invlist_union(cp_list, simple_posixes, &cp_list); + SvREFCNT_dec_NN(simple_posixes); + } if (posixes || nposixes) { if (posixes && AT_LEAST_ASCII_RESTRICTED) { /* Under /a and /aa, nothing above ASCII matches these */ @@ -15281,7 +15353,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, value = start; if (! FOLD) { - op = EXACT; + op = (LOC) + ? EXACTL + : EXACT; } else if (LOC) { @@ -15798,17 +15872,6 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const } /* -- reguni - emit (if appropriate) a Unicode character -*/ -PERL_STATIC_INLINE STRLEN -S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s) -{ - PERL_ARGS_ASSERT_REGUNI; - - return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s); -} - -/* - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. @@ -15994,10 +16057,12 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, if ( exact ) { switch (OP(scan)) { case EXACT: + case EXACTL: case EXACTF: case EXACTFA_NO_TRIE: case EXACTFA: case EXACTFU: + case EXACTFLU8: case EXACTFU_SS: case EXACTFL: if( exact == PSEUDO ) @@ -16411,7 +16476,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ SV* bitmap_invlist; /* Will hold what the bit map contains */ - if (flags & ANYOF_LOCALE_FLAGS) + if (OP(o) == ANYOFL) sv_catpvs(sv, "{loc}"); if (flags & ANYOF_LOC_FOLD) sv_catpvs(sv, "{i}"); @@ -16452,13 +16517,12 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ sv_catpvs(sv, "{non-utf8-latin1-all}"); } - /* output information about the unicode matching */ if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) sv_catpvs(sv, "{above_bitmap_all}"); - else if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { + + if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { SV *lv; /* Set if there is something outside the bit map. */ - bool byte_output = FALSE; /* If something in the bitmap has - been output */ + bool byte_output = FALSE; /* If something has been output */ SV *only_utf8_locale; /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist' @@ -16587,21 +16651,22 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) DEBUG_COMPILE_r( { - const char * const s = SvPV_nolen_const(prog->check_substr - ? prog->check_substr : prog->check_utf8); + const char * const s = SvPV_nolen_const(RX_UTF8(r) + ? prog->check_utf8 : prog->check_substr); if (!PL_colorset) reginitcolors(); PerlIO_printf(Perl_debug_log, "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", PL_colors[4], - prog->check_substr ? "" : "utf8 ", + RX_UTF8(r) ? "utf8 " : "", PL_colors[5],PL_colors[0], s, PL_colors[1], (strlen(s) > 60 ? "..." : "")); } ); - return prog->check_substr ? prog->check_substr : prog->check_utf8; + /* use UTF8 check substring if regexp pattern itself is in UTF8 */ + return RX_UTF8(r) ? prog->check_utf8 : prog->check_substr; } /*