X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/924ba0765e4151c6d9a29ad6b3c3f97c24673477..f6555ff3f309865a88085cd93354cd93bfb96fd3:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 2bad384..e7c6662 100644 --- a/regcomp.c +++ b/regcomp.c @@ -93,6 +93,8 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define HAS_NONLATIN1_FOLD_CLOSURE(i) \ _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) +#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \ + _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) #define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) @@ -1718,7 +1720,7 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, May be the same as tail. tail : item following the branch sequence count : words in the sequence - flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/ + flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/ depth : indent depth Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node. @@ -2982,8 +2984,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, : MADE_TRIE; } -STATIC void -S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth) +STATIC regnode * +S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth) { /* The Trie is constructed and compressed now so we can build a fail array if * it's needed @@ -3021,13 +3023,26 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode U32 *fail; reg_ac_data *aho; const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T")); + regnode *stclass; GET_RE_DEBUG_FLAGS_DECL; - PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE; + PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif + if ( OP(source) == TRIE ) { + struct regnode_1 *op = (struct regnode_1 *) + PerlMemShared_calloc(1, sizeof(struct regnode_1)); + StructCopy(source,op,struct regnode_1); + stclass = (regnode *)op; + } else { + struct regnode_charclass *op = (struct regnode_charclass *) + PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); + StructCopy(source,op,struct regnode_charclass); + stclass = (regnode *)op; + } + OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */ ARG_SET( stclass, data_slot ); aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) ); @@ -3094,6 +3109,7 @@ S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode }); Safefree(q); /*RExC_seen |= REG_TRIEDFA_SEEN;*/ + return stclass; } @@ -3417,9 +3433,9 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, else { /* Here is a generic multi-char fold. */ U8* multi_end = s + len; - /* Count how many characters in it. In the case of /aa, no - * folds which contain ASCII code points are allowed, so - * check for those, and skip if found. */ + /* Count how many characters are in it. In the case of + * /aa, no folds which contain ASCII code points are + * allowed, so check for those, and skip if found. */ if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) { count = utf8_length(s, multi_end); s = multi_end; @@ -4249,11 +4265,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } flags &= ~SCF_DO_STCLASS; } - else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */ + else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is + EXACTFish */ SSize_t l = STR_LEN(scan); UV uc = *((U8*)STRING(scan)); SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2 separate code points */ + const U8 * s = (U8*)STRING(scan); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) { @@ -4261,7 +4279,6 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, scan_commit(pRExC_state, data, minlenp, is_inf); } if (UTF) { - const U8 * const s = (U8 *)STRING(scan); uc = utf8_to_uvchr_buf(s, s + l, NULL); l = utf8_length(s, s + l); } @@ -4281,71 +4298,140 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, data->longest = &(data->longest_float); } } - if (OP(scan) == EXACTFL) { - /* We don't know what the folds are; it could be anything. XXX - * Actually, we only support UTF-8 encoding for code points - * above Latin1, so we could know what those folds are. */ - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, - UV_MAX); + if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) { + ssc_clear_locale(data->start_class); } - else { /* Non-locale EXACTFish */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); - if (flags & SCF_DO_STCLASS_AND) { - ssc_clear_locale(data->start_class); + + if (! UTF) { + + /* We punt and assume can match anything if the node begins + * with a multi-character fold. Things are complicated. For + * example, /ffi/i could match any of: + * "\N{LATIN SMALL LIGATURE FFI}" + * "\N{LATIN SMALL LIGATURE FF}I" + * "F\N{LATIN SMALL LIGATURE FI}" + * plus several other things; and making sure we have all the + * possibilities is hard. */ + if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); } - if (uc < 256) { /* We know what the Latin1 folds are ... */ - if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we - know if anything folds - with this */ - EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, + else { + + /* Any Latin1 range character can potentially match any + * other depending on the locale */ + if (OP(scan) == EXACTFL) { + _invlist_union(EXACTF_invlist, PL_Latin1, + &EXACTF_invlist); + } + else { + /* But otherwise, it matches at least itself. We can + * quickly tell if it has a distinct fold, and if so, + * it matches that as well */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (IS_IN_SOME_FOLD_L1(uc)) { + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, PL_fold_latin1[uc]); - if (OP(scan) != EXACTFA) { /* The folds below aren't - legal under /iaa */ - if (isARG2_lower_or_UPPER_ARG1('s', uc)) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, - LATIN_SMALL_LETTER_SHARP_S); - } - else if (uc == LATIN_SMALL_LETTER_SHARP_S) { - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 's'); - EXACTF_invlist - = add_cp_to_invlist(EXACTF_invlist, 'S'); - } } + } - /* We also know if there are above-Latin1 code points - * that fold to this (none legal for ASCII and /iaa) */ - if ((! isASCII(uc) || OP(scan) != EXACTFA) - && HAS_NONLATIN1_FOLD_CLOSURE(uc)) - { - /* XXX We could know exactly what does fold to this - * if the reverse folds are loaded, as currently in - * S_regclass() */ - _invlist_union(EXACTF_invlist, - PL_AboveLatin1, - &EXACTF_invlist); + /* Some characters match above-Latin1 ones under /i. This + * is true of EXACTFL ones when the locale is UTF-8 */ + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc) + && (! isASCII(uc) || (OP(scan) != EXACTFA + && OP(scan) != EXACTFA_NO_TRIE))) + { + add_above_Latin1_folds(pRExC_state, + (U8) uc, + &EXACTF_invlist); + } + } + } + else { /* Pattern is UTF-8 */ + U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + STRLEN foldlen = UTF8SKIP(s); + const U8* e = s + STR_LEN(scan); + SV** listp; + + /* The only code points that aren't folded in a UTF EXACTFish + * node are are the problematic ones in EXACTFL nodes */ + if (OP(scan) == EXACTFL + && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) + { + /* We need to check for the possibility that this EXACTFL + * node begins with a multi-char fold. Therefore we fold + * the first few characters of it so that we can make that + * check */ + U8 *d = folded; + int i; + + for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) { + if (isASCII(*s)) { + *(d++) = (U8) toFOLD(*s); + s++; + } + else { + STRLEN len; + to_utf8_fold(s, d, &len); + d += len; + s += UTF8SKIP(s); } } + + /* And set up so the code below that looks in this folded + * buffer instead of the node's string */ + e = d; + foldlen = UTF8SKIP(folded); + s = folded; + } + + /* When we reach here 's' points to the fold of the first + * character(s) of the node; and 'e' points to far enough along + * the folded string to be just past any possible multi-char + * fold. 'foldlen' is the length in bytes of the first + * character in 's' + * + * Unlike the non-UTF-8 case, the macro for determining if a + * string is a multi-char fold requires all the characters to + * already be folded. This is because of all the complications + * if not. Note that they are folded anyway, except in EXACTFL + * nodes. Like the non-UTF case above, we punt if the node + * begins with a multi-char fold */ + + if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) { + EXACTF_invlist = + _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX); } - else { /* Non-locale, above Latin1. XXX We don't currently - know what participates in folds with this, so have - to assume anything could */ - - /* XXX We could know exactly what does fold to this if the - * reverse folds are loaded, as currently in S_regclass(). - * But we do know that under /iaa nothing in the ASCII - * range can participate */ - if (OP(scan) == EXACTFA) { - _invlist_union_complement_2nd(EXACTF_invlist, - PL_XPosix_ptrs[_CC_ASCII], - &EXACTF_invlist); + else { /* Single char fold */ + + /* It matches all the things that fold to it, which are + * found in PL_utf8_foldclosures (including itself) */ + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc); + if (! PL_utf8_foldclosures) { + _load_PL_utf8_foldclosures(); } - else { - EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist, - 0, UV_MAX); + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) s, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV k; + for (k = 0; k <= av_tindex(list); k++) { + SV** c_p = av_fetch(list, k, FALSE); + UV c; + assert(c_p); + + c = SvUV(*c_p); + + /* /aa doesn't allow folds between ASCII and non- */ + if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE) + && isASCII(c) != isASCII(uc)) + { + continue; + } + + EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c); + } } } } @@ -4693,13 +4779,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf - " SSize_t_MAX=%"UVdf" minnext=%"UVdf - " maxcount=%"UVdf" mincount=%"UVdf"\n", +PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf + " SSize_t_MAX=%"UVuf" minnext=%"UVuf + " maxcount=%"UVuf" mincount=%"UVuf"\n", (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, (UV)mincount); if (deltanext != SSize_t_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n", +PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif @@ -5430,7 +5516,8 @@ S_add_data(RExC_state_t* const pRExC_state, const char* const s, const U32 n) return count; } -/*XXX: todo make this not included in a non debugging perl */ +/*XXX: todo make this not included in a non debugging perl, but appears to be + * used anyway there, in 'use re' */ #ifndef PERL_IN_XSUB_RE void Perl_reginitcolors(pTHX) @@ -5633,7 +5720,7 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, /* if we know we have at least two args, create an empty string, * then concatenate args to that. For no args, return an empty string */ if (!pat && pat_count != 1) { - pat = newSVpvn("", 0); + pat = newSVpvs(""); SAVEFREESV(pat); alloced = TRUE; } @@ -6751,22 +6838,8 @@ reStudy: else if (PL_regkind[OP(first)] == TRIE && ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0) { - regnode *trie_op; - /* this can happen only on restudy */ - if ( OP(first) == TRIE ) { - struct regnode_1 *trieop = (struct regnode_1 *) - PerlMemShared_calloc(1, sizeof(struct regnode_1)); - StructCopy(first,trieop,struct regnode_1); - trie_op=(regnode *)trieop; - } else { - struct regnode_charclass *trieop = (struct regnode_charclass *) - PerlMemShared_calloc(1, sizeof(struct regnode_charclass)); - StructCopy(first,trieop,struct regnode_charclass); - trie_op=(regnode *)trieop; - } - OP(trie_op)+=2; - make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0); - ri->regstclass = trie_op; + /* this can happen only on restudy */ + ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0); } #endif else if (REGNODE_SIMPLE(OP(first))) @@ -9069,6 +9142,23 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, count += 2; } } + +void +Perl__load_PL_utf8_foldclosures (pTHX) +{ + assert(! PL_utf8_foldclosures); + + /* If the folds haven't been read in, call a fold function + * to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES_CASE+1]; + + /* This string is just a short named one above \xff */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); +} #endif #ifdef PERL_ARGS_ASSERT__INVLISTEQ @@ -9412,8 +9502,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) int internal_argval = 0; /* internal_argval is only useful if !argok */ - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent"); } while ( *RExC_parse && *RExC_parse != ')' ) { if ( *RExC_parse == ':' ) { @@ -9519,8 +9610,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) else if (*RExC_parse == '?') { /* (?...) */ bool is_logical = 0; const char * const seqstart = RExC_parse; - if (has_intervening_patws && SIZE_ONLY) { - ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated"); + if (has_intervening_patws) { + RExC_parse++; + vFAIL("In '(?...)', the '(' and '?' must be adjacent"); } RExC_parse++; @@ -9687,18 +9779,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case '@': /* (?@...) */ vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; - case '#': /* (?#...) */ - /* XXX As soon as we disallow separating the '?' and '*' (by - * spaces or (?#...) comment), it is believed that this case - * will be unreachable and can be removed. See - * [perl #117327] */ - while (*RExC_parse && *RExC_parse != ')') - RExC_parse++; - if (*RExC_parse != ')') - FAIL("Sequence (?#... not terminated"); - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; case '0' : /* (?0) */ case 'R' : /* (?R) */ if (*RExC_parse != ')') @@ -10674,7 +10754,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, * modifier. The other meaning does not, so use a temporary until we find * out which we are being called with */ p = (RExC_flags & RXf_PMf_EXTENDED) - ? regwhite( pRExC_state, RExC_parse ) + ? regpatws(pRExC_state, RExC_parse, + TRUE) /* means recognize comments */ : RExC_parse; /* Disambiguate between \N meaning a named character versus \N meaning @@ -11638,7 +11719,8 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - if ( reg_skipcomment( pRExC_state ) ) + RExC_parse = reg_skipcomment( pRExC_state, RExC_parse ); + if (RExC_parse < RExC_end) goto tryagain; } /* FALLTHROUGH */ @@ -11719,7 +11801,8 @@ tryagain: oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ switch ((U8)*p) { case '^': case '$': @@ -11947,15 +12030,6 @@ tryagain: break; default: /* A literal character */ - if (! SIZE_ONLY - && RExC_flags & RXf_PMf_EXTENDED - && ckWARN_d(WARN_DEPRECATED) - && is_PATWS_non_low_safe(p, RExC_end, UTF)) - { - vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1), - "Escape literal pattern white space under /x"); - } - normal_default: if (UTF8_IS_START(*p) && UTF) { STRLEN numlen; @@ -11973,7 +12047,8 @@ tryagain: */ if ( RExC_flags & RXf_PMf_EXTENDED) - p = regwhite( pRExC_state, p ); + p = regpatws(pRExC_state, p, + TRUE); /* means recognize comments */ /* If the next thing is a quantifier, it applies to this * character only, which means that this character has to be in @@ -12328,39 +12403,11 @@ tryagain: } STATIC char * -S_regwhite( RExC_state_t *pRExC_state, char *p ) -{ - const char *e = RExC_end; - - PERL_ARGS_ASSERT_REGWHITE; - - while (p < e) { - if (isSPACE(*p)) - ++p; - else if (*p == '#') { - bool ended = 0; - do { - if (*p++ == '\n') { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - } - else - break; - } - return p; -} - -STATIC char * -S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) +S_regpatws(pTHX_ RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) { /* Returns the next non-pattern-white space, non-comment character (the * latter only if 'recognize_comment is true) in the string p, which is - * ended by RExC_end. If there is no line break ending a comment, - * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */ + * ended by RExC_end. See also reg_skipcomment */ const char *e = RExC_end; PERL_ARGS_ASSERT_REGPATWS; @@ -12371,16 +12418,7 @@ S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment ) p += len; } else if (recognize_comment && *p == '#') { - bool ended = 0; - do { - p++; - if (is_LNBREAK_safe(p, e, UTF)) { - ended = 1; - break; - } - } while (p < e); - if (!ended) - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + p = reg_skipcomment(pRExC_state, p); } else break; @@ -12708,7 +12746,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, while (RExC_parse < RExC_end) { SV* current = NULL; RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE); /* means recognize comments */ switch (*RExC_parse) { case '?': if (RExC_parse[1] == '[') depth++, RExC_parse++; @@ -12825,7 +12863,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, /* Skip white space */ RExC_parse = regpatws(pRExC_state, RExC_parse, - TRUE); /* means recognize comments */ + TRUE /* means recognize comments */ ); if (RExC_parse >= RExC_end) { Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'"); } @@ -13154,6 +13192,74 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, } #undef IS_OPERAND +STATIC void +S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist) +{ + /* This hard-codes the Latin1/above-Latin1 folding rules, so that an + * innocent-looking character class, like /[ks]/i won't have to go out to + * disk to find the possible matches. + * + * This should be called only for a Latin1-range code points, cp, which is + * known to be involved in a fold with other code points above Latin1. It + * would give false results if /aa has been specified. Multi-char folds + * are outside the scope of this, and must be handled specially. + * + * XXX It would be better to generate these via regen, in case a new + * version of the Unicode standard adds new mappings, though that is not + * really likely, and may be caught by the default: case of the switch + * below. */ + + PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS; + + switch (cp) { + case 'k': + case 'K': + *invlist = + add_cp_to_invlist(*invlist, KELVIN_SIGN); + break; + case 's': + case 'S': + *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S); + break; + case MICRO_SIGN: + *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU); + *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU); + break; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN); + break; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + *invlist = add_cp_to_invlist(*invlist, + LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); + break; + case LATIN_SMALL_LETTER_SHARP_S: + *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S); + break; + case 'F': case 'f': + case 'I': case 'i': + case 'L': case 'l': + case 'T': case 't': + case 'A': case 'a': + case 'H': case 'h': + case 'J': case 'j': + case 'N': case 'n': + case 'W': case 'w': + case 'Y': case 'y': + /* These all are targets of multi-character folds from code points + * that require UTF8 to express, so they can't match unless the + * target string is in UTF-8, so no action here is necessary, as + * regexec.c properly handles the general case for UTF-8 matching + * and multi-char folds */ + break; + default: + /* Use deprecated warning to increase the chances of this being + * output */ + ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp); + break; + } +} + /* The names of properties whose definitions are not known at compile time are * stored in this SV, after a constant heading. So if the length has been * changed since initialization, then there is a run-time definition. */ @@ -13297,7 +13403,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */ @@ -13307,7 +13413,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_naughty++; if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } } @@ -13345,7 +13451,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (UCHARAT(RExC_parse) == ']') { @@ -13460,7 +13566,6 @@ parseit: } if (!SIZE_ONLY) { SV* invlist; - char* formatted; char* name; if (UCHARAT(RExC_parse) == '^') { @@ -13481,14 +13586,13 @@ parseit: * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - formatted = Perl_form(aTHX_ + name = savepv(Perl_form(aTHX_ "%s%.*s%s\n", (FOLD) ? "__" : "", (int)n, RExC_parse, (FOLD) ? "_i" : "" - ); - name = savepvn(formatted, strlen(formatted)); + )); /* Look up the property name, and get its swash and * inversion list, if the property is found */ @@ -13517,6 +13621,19 @@ parseit: "Property '%"UTF8f"' is unknown", UTF8fARG(UTF, n, name)); } + + /* If the property name doesn't already have a package + * name, add the current one to it so that it can be + * referred to outside it. [perl #121777] */ + if (! instr(name, "::") && PL_curstash) { + char* full_name = Perl_form(aTHX_ + "%s::%s", + HvNAME(PL_curstash), + name); + n = strlen(full_name); + Safefree(name); + name = savepvn(full_name, n); + } Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", (value == 'p' ? '+' : '!'), UTF8fARG(UTF, n, name)); @@ -13756,6 +13873,11 @@ parseit: ANYOF_POSIXL_ZERO(ret); } + /* Coverity thinks it is possible for this to be negative; both + * jhi and khw think it's not, but be safer */ + assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL) + || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); + /* See if it already matches the complement of this POSIX * class */ if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL) @@ -13856,7 +13978,7 @@ parseit: if (skip_white) { RExC_parse = regpatws(pRExC_state, RExC_parse, - FALSE /* means don't recognize comments */); + FALSE /* means don't recognize comments */ ); } if (range) { @@ -13966,7 +14088,7 @@ parseit: AV* this_array; STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen); - SV* multi_fold = sv_2mortal(newSVpvn("", 0)); + SV* multi_fold = sv_2mortal(newSVpvs("")); Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); @@ -14033,7 +14155,7 @@ parseit: && ((prevvalue >= 'a' && value <= 'z') || (prevvalue >= 'A' && value <= 'Z'))) { - _invlist_intersection(this_range, PL_ASCII, + _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII], &this_range); /* Since this above only contains ascii, the intersection of it @@ -14241,6 +14363,26 @@ parseit: op = POSIXA; } } + else if (prevvalue == 'A') { + if (value == 'Z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_UPPER; + op = POSIXA; + } + } + else if (prevvalue == 'a') { + if (value == 'z' +#ifdef EBCDIC + && literal_endpoint == 2 +#endif + ) { + arg = (FOLD) ? _CC_ALPHA : _CC_LOWER; + op = POSIXA; + } + } } /* Here, we have changed away from its initial value iff we found @@ -14336,18 +14478,7 @@ parseit: /* This is a hash that for a particular fold gives all * characters that are involved in it */ if (! PL_utf8_foldclosures) { - - /* If the folds haven't been read in, call a fold function - * to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES_CASE+1]; - - /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); - assert(PL_utf8_tofold); /* Verify that worked */ - } - PL_utf8_foldclosures - = _swash_inversion_hash(PL_utf8_tofold); + _load_PL_utf8_foldclosures(); } } @@ -14364,15 +14495,6 @@ parseit: if (j < 256) { - /* We have the latin1 folding rules hard-coded here so - * that an innocent-looking character class, like - * /[ks]/i won't have to go out to disk to find the - * possible matches. XXX It would be better to - * generate these via regen, in case a new version of - * the Unicode standard adds new mappings, though that - * is not really likely, and may be caught by the - * default: case of the switch below. */ - if (IS_IN_SOME_FOLD_L1(j)) { /* ASCII is always matched; non-ASCII is matched @@ -14389,72 +14511,12 @@ parseit: } } - if (HAS_NONLATIN1_FOLD_CLOSURE(j) + if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j) && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED)) { - /* Certain Latin1 characters have matches outside - * Latin1. To get here, is one of those - * characters. None of these matches is valid for - * ASCII characters under /aa, which is why the 'if' - * just above excludes those. These matches only - * happen when the target string is utf8. The code - * below adds the single fold closures for to the - * inversion list. */ - - switch (j) { - case 'k': - case 'K': - *use_list = - add_cp_to_invlist(*use_list, KELVIN_SIGN); - break; - case 's': - case 'S': - *use_list = add_cp_to_invlist(*use_list, - LATIN_SMALL_LETTER_LONG_S); - break; - case MICRO_SIGN: - *use_list = add_cp_to_invlist(*use_list, - GREEK_CAPITAL_LETTER_MU); - *use_list = add_cp_to_invlist(*use_list, - GREEK_SMALL_LETTER_MU); - break; - case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: - case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: - *use_list = - add_cp_to_invlist(*use_list, ANGSTROM_SIGN); - break; - case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); - break; - case LATIN_SMALL_LETTER_SHARP_S: - *use_list = add_cp_to_invlist(*use_list, - LATIN_CAPITAL_LETTER_SHARP_S); - break; - case 'F': case 'f': - case 'I': case 'i': - case 'L': case 'l': - case 'T': case 't': - case 'A': case 'a': - case 'H': case 'h': - case 'J': case 'j': - case 'N': case 'n': - case 'W': case 'w': - case 'Y': case 'y': - /* These all are targets of multi-character - * folds from code points that require UTF8 - * to express, so they can't match unless - * the target string is in UTF-8, so no - * action here is necessary, as regexec.c - * properly handles the general case for - * UTF-8 matching and multi-char folds */ - break; - default: - /* Use deprecated warning to increase the - * chances of this being output */ - ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j); - break; - } + add_above_Latin1_folds(pRExC_state, + (U8) j, + use_list); } continue; } @@ -14483,9 +14545,8 @@ parseit: for (k = 0; k <= av_tindex(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; - if (c_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } + assert(c_p); + c = SvUV(*c_p); /* /aa doesn't allow folds between ASCII and non- */ @@ -14953,35 +15014,34 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, /* reg_skipcomment() - Absorbs an /x style # comments from the input stream. - Returns true if there is more text remaining in the stream. - Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment - terminates the pattern without including a newline. + Absorbs an /x style # comment from the input stream, + returning a pointer to the first character beyond the comment, or if the + comment terminates the pattern without anything following it, this returns + one past the final character of the pattern (in other words, RExC_end) and + sets the REG_RUN_ON_COMMENT_SEEN flag. - Note its the callers responsibility to ensure that we are + Note it's the callers responsibility to ensure that we are actually in /x mode */ -STATIC bool -S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +PERL_STATIC_INLINE char* +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state, char* p) { - bool ended = 0; - PERL_ARGS_ASSERT_REG_SKIPCOMMENT; - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') { - ended = 1; - break; + assert(*p = '#'); + + while (p < RExC_end) { + if (*(++p) == '\n') { + return p+1; } - if (!ended) { - /* we ran off the end of the pattern without ending - the comment, so we have to add an \n when wrapping */ - RExC_seen |= REG_RUN_ON_COMMENT_SEEN; - return 0; - } else - return 1; + } + + /* we ran off the end of the pattern without ending the comment, so we have + * to add an \n when wrapping */ + RExC_seen |= REG_RUN_ON_COMMENT_SEEN; + return p; } /* nextchar() @@ -15019,16 +15079,14 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } if (RExC_flags & RXf_PMf_EXTENDED) { - if (isSPACE(*RExC_parse)) { - RExC_parse++; - continue; - } - else if (*RExC_parse == '#') { - if ( reg_skipcomment( pRExC_state ) ) - continue; - } + char * p = regpatws(pRExC_state, RExC_parse, + TRUE); /* means recognize comments */ + if (p != RExC_parse) { + RExC_parse = p; + continue; + } } - return retval; + return retval; } } @@ -16134,7 +16192,16 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) PerlMemShared_free(aho->fail); /* do this last!!!! */ PerlMemShared_free(ri->data->data[n]); - PerlMemShared_free(ri->regstclass); + /* we should only ever get called once, so + * assert as much, and also guard the free + * which /might/ happen twice. At the least + * it will make code anlyzers happy and it + * doesn't cost much. - Yves */ + assert(ri->regstclass); + if (ri->regstclass) { + PerlMemShared_free(ri->regstclass); + ri->regstclass = 0; + } } } break;