X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/eb055bf0f4aa39363f10d941305f5dc9c3cdbc53..d9a91485293e1414746fd028b3782f699519105e:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 23f3fe8..547b911 100644 --- a/regcomp.c +++ b/regcomp.c @@ -74,10 +74,6 @@ #define PERL_IN_REGCOMP_C #include "perl.h" -#ifndef PERL_IN_XSUB_RE -# include "INTERN.h" -#endif - #define REG_COMP_C #ifdef PERL_IN_XSUB_RE # include "re_comp.h" @@ -167,6 +163,7 @@ struct RExC_state_t { I32 seen_zerolen; regnode_offset *open_parens; /* offsets to open parens */ regnode_offset *close_parens; /* offsets to close parens */ + I32 parens_buf_size; /* #slots malloced open/close_parens */ regnode *end_op; /* END node in program */ I32 utf8; /* whether the pattern is utf8 or not */ I32 orig_utf8; /* whether the pattern was originally in utf8 */ @@ -197,6 +194,7 @@ struct RExC_state_t { scan_frame *frame_last; U32 frame_count; AV *warn_text; + HV *unlexed_names; #ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -256,6 +254,7 @@ struct RExC_state_t { #define RExC_maxlen (pRExC_state->maxlen) #define RExC_npar (pRExC_state->npar) #define RExC_total_parens (pRExC_state->total_par) +#define RExC_parens_buf_size (pRExC_state->parens_buf_size) #define RExC_nestroot (pRExC_state->nestroot) #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_utf8 (pRExC_state->utf8) @@ -284,6 +283,7 @@ struct RExC_state_t { #define RExC_warn_text (pRExC_state->warn_text) #define RExC_in_script_run (pRExC_state->in_script_run) #define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ) +#define RExC_unlexed_names (pRExC_state->unlexed_names) /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set * a flag to disable back-off on the fixed/floating substrings - if it's @@ -355,7 +355,7 @@ struct RExC_state_t { if (DEPENDS_SEMANTICS) { \ set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \ RExC_uni_semantics = 1; \ - if (RExC_seen_d_op && LIKELY(RExC_total_parens >= 0)) { \ + if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \ /* No need to restart the parse if we haven't seen \ * anything that differs between /u and /d, and no need \ * to restart immediately if we're going to reparse \ @@ -366,11 +366,10 @@ struct RExC_state_t { } \ } STMT_END -#define BRANCH_MAX_OFFSET U16_MAX #define REQUIRE_BRANCHJ(flagp, restart_retval) \ STMT_START { \ RExC_use_BRANCHJ = 1; \ - if (LIKELY(RExC_total_parens >= 0)) { \ + if (LIKELY(! IN_PARENS_PASS)) { \ /* No need to restart the parse immediately if we're \ * going to reparse anyway to count parens */ \ *flagp |= RESTART_PARSE; \ @@ -378,10 +377,19 @@ struct RExC_state_t { } \ } STMT_END +/* Until we have completed the parse, we leave RExC_total_parens at 0 or + * less. After that, it must always be positive, because the whole re is + * considered to be surrounded by virtual parens. Setting it to negative + * indicates there is some construct that needs to know the actual number of + * parens to be properly handled. And that means an extra pass will be + * required after we've counted them all */ +#define ALL_PARENS_COUNTED (RExC_total_parens > 0) #define REQUIRE_PARENS_PASS \ - STMT_START { \ - if (RExC_total_parens == 0) RExC_total_parens = -1; \ + STMT_START { /* No-op if have completed a pass */ \ + if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \ } STMT_END +#define IN_PARENS_PASS (RExC_total_parens < 0) + /* This is used to return failure (zero) early from the calling function if * various flags in 'flags' are set. Two flags always cause a return: @@ -698,7 +706,7 @@ static const scan_data_t zero_scan_data = { /* Used to point after bad bytes for an error message, but avoid skipping * past a nul byte. */ -#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1) +#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1) /* Set up to clean up after our imminent demise */ #define PREPARE_TO_DIE \ @@ -1546,6 +1554,10 @@ S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state, return TRUE; } +#define INVLIST_INDEX 0 +#define ONLY_LOCALE_MATCHES_INDEX 1 +#define DEFERRED_USER_DEFINED_INDEX 2 + STATIC SV* S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, const regnode_charclass* const node) @@ -1556,6 +1568,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * returned list must, and will, contain every code point that is a * possibility. */ + dVAR; SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; unsigned int i; @@ -1571,28 +1584,24 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, SV **const ary = AvARRAY(av); assert(RExC_rxi->data->what[n] == 's'); - if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */ - invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL)); - } - else if (ary[0] && ary[0] != &PL_sv_undef) { + if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { - /* Here, no compile-time swash, and there are things that won't be - * known until runtime -- we have to assume it could be anything */ + /* Here there are things that won't be known until runtime -- we + * have to assume it could be anything */ invlist = sv_2mortal(_new_invlist(1)); return _add_range_to_invlist(invlist, 0, UV_MAX); } - else if (ary[3] && ary[3] != &PL_sv_undef) { + else if (ary[INVLIST_INDEX]) { - /* Here no compile-time swash, and no run-time only data. Use the - * node's inversion list */ - invlist = sv_2mortal(invlist_clone(ary[3], NULL)); + /* Use the node's inversion list */ + invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL)); } /* Get the code points valid only under UTF-8 locales */ - if ((ANYOF_FLAGS(node) & ANYOFL_FOLD) - && ary[2] && ary[2] != &PL_sv_undef) + if ( (ANYOF_FLAGS(node) & ANYOFL_FOLD) + && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) { - only_utf8_locale_invlist = ary[2]; + only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX]; } } @@ -1617,15 +1626,19 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } /* Add in the points from the bit map */ - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (ANYOF_BITMAP_TEST(node, i)) { - unsigned int start = i++; + if (OP(node) != ANYOFH) { + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (ANYOF_BITMAP_TEST(node, i)) { + unsigned int start = i++; - for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) { - /* empty */ + for (; i < NUM_ANYOF_CODE_POINTS + && ANYOF_BITMAP_TEST(node, i); ++i) + { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); + new_node_has_latin1 = TRUE; } - invlist = _add_range_to_invlist(invlist, start, i-1); - new_node_has_latin1 = TRUE; } } @@ -1647,11 +1660,26 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, if (ANYOF_FLAGS(node) & ANYOF_INVERT) { _invlist_invert(invlist); } - else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) { + else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) { + if (new_node_has_latin1) { - /* Under /li, any 0-255 could fold to any other 0-255, depending on the - * locale. We can skip this if there are no 0-255 at all. */ - _invlist_union(invlist, PL_Latin1, &invlist); + /* Under /li, any 0-255 could fold to any other 0-255, depending on + * the locale. We can skip this if there are no 0-255 at all. */ + _invlist_union(invlist, PL_Latin1, &invlist); + + invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I); + invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + else { + if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) { + invlist = add_cp_to_invlist(invlist, 'I'); + } + if (_invlist_contains_cp(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE)) + { + invlist = add_cp_to_invlist(invlist, 'i'); + } + } } /* Similarly add the UTF-8 locale possible matches. These have to be @@ -1685,6 +1713,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * another SSC or a regular ANYOF class. Can create false positives. */ SV* anded_cp_list; + U8 and_with_flags = (OP(and_with) == ANYOFH) ? 0 : ANYOF_FLAGS(and_with); U8 anded_flags; PERL_ARGS_ASSERT_SSC_AND; @@ -1695,7 +1724,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(and_with)) { anded_cp_list = ((regnode_ssc *)and_with)->invlist; - anded_flags = ANYOF_FLAGS(and_with); + anded_flags = and_with_flags; /* XXX This is a kludge around what appears to be deficiencies in the * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag, @@ -1719,14 +1748,14 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with); if (OP(and_with) == ANYOFD) { - anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS; + anded_flags = and_with_flags & ANYOF_COMMON_FLAGS; } else { - anded_flags = ANYOF_FLAGS(and_with) + anded_flags = and_with_flags &( ANYOF_COMMON_FLAGS |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); - if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) { + if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) { anded_flags &= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } @@ -1766,7 +1795,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * <= (C1 & ~C2) | (P1 & ~P2) * */ - if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT) + if ((and_with_flags & ANYOF_INVERT) && ! is_ANYOF_SYNTHETIC(and_with)) { unsigned int i; @@ -1778,7 +1807,7 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, /* If either P1 or P2 is empty, the intersection will be also; can skip * the loop */ - if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) { + if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) { ANYOF_POSIXL_ZERO(ssc); } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { @@ -1838,16 +1867,16 @@ S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, else { ssc->invlist = anded_cp_list; ANYOF_POSIXL_ZERO(ssc); - if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { + if (and_with_flags & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc); } } } else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc) - || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) + || (and_with_flags & ANYOF_MATCHES_POSIXL)) { /* One or the other of P1, P2 is non-empty. */ - if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) { + if (and_with_flags & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc); } ssc_union(ssc, anded_cp_list, FALSE); @@ -1868,6 +1897,7 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, SV* ored_cp_list; U8 ored_flags; + U8 or_with_flags = (OP(or_with) == ANYOFH) ? 0 : ANYOF_FLAGS(or_with); PERL_ARGS_ASSERT_SSC_OR; @@ -1877,17 +1907,17 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * the code point inversion list and just the relevant flags */ if (is_ANYOF_SYNTHETIC(or_with)) { ored_cp_list = ((regnode_ssc*) or_with)->invlist; - ored_flags = ANYOF_FLAGS(or_with); + ored_flags = or_with_flags; } else { ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with); - ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS; + ored_flags = or_with_flags & ANYOF_COMMON_FLAGS; if (OP(or_with) != ANYOFD) { ored_flags - |= ANYOF_FLAGS(or_with) + |= or_with_flags & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP); - if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) { + if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) { ored_flags |= ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; } @@ -1914,12 +1944,12 @@ S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, * (which results in actually simpler code than the non-inverted case) * */ - if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT) + if ((or_with_flags & ANYOF_INVERT) && ! is_ANYOF_SYNTHETIC(or_with)) { /* We ignore P2, leaving P1 going forward */ } /* else Not inverted */ - else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) { + else if (or_with_flags & ANYOF_MATCHES_POSIXL) { ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc); if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) { unsigned int i; @@ -2038,7 +2068,7 @@ S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc) U32 count = 0; /* Running total of number of code points matched by 'ssc' */ UV start, end; /* Start and end points of current range in inversion - list */ + XXX outdated. UTF-8 locales are common, what about invert? list */ const U32 max_code_points = (LOC) ? 256 : (( ! UNI_SEMANTICS @@ -2090,8 +2120,7 @@ S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc) populate_ANYOF_from_invlist( (regnode *) ssc, &invlist); - set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, - NULL, NULL, NULL, FALSE); + set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL); /* Make sure is clone-safe */ ssc->invlist = NULL; @@ -2675,7 +2704,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie_words = newAV(); }); - re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1); + re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD); assert(re_trie_maxbuff); if (!SvIOK(re_trie_maxbuff)) { sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); @@ -3906,7 +3935,7 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour * that a character in the pattern corresponds to at most a single * character in the target string. (And I do mean character, and not byte * here, unlike other parts of the documentation that have never been - * updated to account for multibyte Unicode.) sharp s in EXACTF and + * updated to account for multibyte Unicode.) Sharp s in EXACTF and * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA * nodes it can match "\x{17F}\x{17F}". These, along with other ones in * EXACTFL nodes, violate the assumption, and they are the only instances @@ -4335,6 +4364,23 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, } #endif } + + if ( STR_LEN(scan) == 1 + && isALPHA_A(* STRING(scan)) + && ( OP(scan) == EXACTFAA + || ( OP(scan) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(scan))))) + { + U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */ + + /* Replace a length 1 ASCII fold pair node with an ANYOFM node, + * with the mask set to the complement of the bit that differs + * between upper and lower case, and the lowest code point of the + * pair (which the '&' forces) */ + OP(scan) = ANYOFM; + ARG_SET(scan, *STRING(scan) & mask); + FLAGS(scan) = mask; + } } #ifdef DEBUGGING @@ -4393,6 +4439,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* recursed: which subroutines have we recursed into */ /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */ { + dVAR; /* There must be at least this number of characters to match */ SSize_t min = 0; I32 pars = 0, code; @@ -5275,6 +5322,27 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, OP(next) = EXACTFU; } + if ( STR_LEN(next) == 1 + && isALPHA_A(* STRING(next)) + && ( OP(next) == EXACTFAA + || ( OP(next) == EXACTFU + && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))) + { + /* These differ in just one bit */ + U8 mask = ~ ('A' ^ 'a'); + + assert(isALPHA_A(* STRING(next))); + + /* Then replace it by an ANYOFM node, with + * the mask set to the complement of the + * bit that differs between upper and lower + * case, and the lowest code point of the + * pair (which the '&' forces) */ + OP(next) = ANYOFM; + ARG_SET(next, *STRING(next) & mask); + FLAGS(next) = mask; + } + if (flags & SCF_DO_STCLASS) { mincount = 0; maxcount = REG_INFTY; @@ -5553,9 +5621,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, STRLEN l; const char * const s = SvPV_const(data->last_found, l); SSize_t old = b - data->last_start_min; + assert(old >= 0); if (UTF) - old = utf8_hop((U8*)s, old) - (U8*)s; + old = utf8_hop_forward((U8*)s, old, + (U8 *) SvEND(data->last_found)) + - (U8*)s; l -= old; /* Get the added string: */ last_str = newSVpvn_utf8(s + old, l, UTF); @@ -5763,6 +5834,7 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", case ANYOFD: case ANYOFL: case ANYOFPOSIXL: + case ANYOFH: case ANYOF: if (flags & SCF_DO_STCLASS_AND) ssc_and(pRExC_state, data->start_class, @@ -5828,21 +5900,11 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", } break; - case NASCII: - invert = 1; - /* FALLTHROUGH */ - case ASCII: - my_invlist = invlist_clone(PL_Posix_ptrs[_CC_ASCII], NULL); - - /* This can be handled as a Posix class */ - goto join_posix_and_ascii; - case NPOSIXA: /* For these, we always know the exact set of what's matched */ invert = 1; /* FALLTHROUGH */ case POSIXA: - assert(FLAGS(scan) != _CC_ASCII); my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL); goto join_posix_and_ascii; @@ -5929,14 +5991,27 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", last, &data_fake, stopparen, recursed_depth, NULL, f, depth+1); if (scan->flags) { - if (deltanext) { - FAIL("Variable length lookbehind not implemented"); - } - else if (minnext > (I32)U8_MAX) { + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || minnext > (I32)U8_MAX + || minnext + deltanext > (I32)U8_MAX) + { FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } - scan->flags = (U8)minnext; + + /* The 'next_off' field has been repurposed to count the + * additional starting positions to try beyond the initial + * one. (This leaves it at 0 for non-variable length + * matches to avoid breakage for those not using this + * extension) */ + if (deltanext) { + scan->next_off = deltanext; + ckWARNexperimental(RExC_parse, + WARN_EXPERIMENTAL__VLB, + "Variable length lookbehind is experimental"); + } + scan->flags = (U8)minnext + deltanext; } if (data) { if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) @@ -6021,14 +6096,21 @@ Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", stopparen, recursed_depth, NULL, f, depth+1); if (scan->flags) { - if (deltanext) { - FAIL("Variable length lookbehind not implemented"); - } - else if (*minnextp > (I32)U8_MAX) { + assert(0); /* This code has never been tested since this + is normally not compiled */ + if ( deltanext < 0 + || deltanext > (I32) U8_MAX + || *minnextp > (I32)U8_MAX + || *minnextp + deltanext > (I32)U8_MAX) + { FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } - scan->flags = (U8)*minnextp; + + if (deltanext) { + scan->next_off = deltanext; + } + scan->flags = (U8)*minnextp + deltanext; } *minnextp += min; @@ -7254,6 +7336,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, OP *expr, const regexp_engine* eng, REGEXP *old_re, bool *is_bare_re, const U32 orig_rx_flags, const U32 pm_flags) { + dVAR; REGEXP *Rx; /* Capital 'R' means points to a REGEXP */ STRLEN plen; char *exp; @@ -7311,6 +7394,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } pRExC_state->warn_text = NULL; + pRExC_state->unlexed_names = NULL; pRExC_state->code_blocks = NULL; if (is_bare_re) @@ -7609,6 +7693,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_naughty = 0; RExC_npar = 1; + RExC_parens_buf_size = 0; RExC_emit_start = RExC_rxi->program; pRExC_state->code_index = 0; @@ -7618,9 +7703,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Do the parse */ if (reg(pRExC_state, 0, &flags, 1)) { - /* Success!, But if RExC_total_parens < 0, we need to redo the parse - * knowing how many parens there actually are */ - if (RExC_total_parens < 0) { + /* Success!, But we may need to redo the parse knowing how many parens + * there actually are */ + if (IN_PARENS_PASS) { flags |= RESTART_PARSE; } @@ -7662,7 +7747,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo parse\n")); } - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { /* Make enough room for all the known parens, and zero it */ Renew(RExC_open_parens, RExC_total_parens, regnode_offset); Zero(RExC_open_parens, RExC_total_parens, regnode_offset); @@ -8760,7 +8845,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { vFAIL("Reference to nonexistent named group"); } else { @@ -8775,7 +8860,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) } #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ - int num; \ if (RExC_lastparse!=RExC_parse) { \ Perl_re_printf( aTHX_ "%s", \ Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ @@ -8791,16 +8875,15 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) } else \ Perl_re_printf( aTHX_ "%16s",""); \ \ - num=REG_NODE_NUM(REGNODE_p(RExC_emit)); \ - if (RExC_lastnum!=num) \ - Perl_re_printf( aTHX_ "|%4d", num); \ + if (RExC_lastnum!=RExC_emit) \ + Perl_re_printf( aTHX_ "|%4d", RExC_emit); \ else \ Perl_re_printf( aTHX_ "|%4s",""); \ Perl_re_printf( aTHX_ "|%*s%-4s", \ (int)((depth*2)), "", \ (funcname) \ ); \ - RExC_lastnum=num; \ + RExC_lastnum=RExC_emit; \ RExC_lastparse=RExC_parse; \ }) @@ -9084,9 +9167,7 @@ Perl__new_invlist(pTHX_ IV initial_size) initial_size = 10; } - /* Allocate the initial space */ new_list = newSV_type(SVt_INVLIST); - initialize_invlist_guts(new_list, initial_size); return new_list; @@ -9342,100 +9423,6 @@ Perl__invlist_search(SV* const invlist, const UV cp) } void -Perl__invlist_populate_swatch(SV* const invlist, - const UV start, const UV end, U8* swatch) -{ - /* populates a swatch of a swash the same way swatch_get() does in utf8.c, - * but is used when the swash has an inversion list. This makes this much - * faster, as it uses a binary search instead of a linear one. This is - * intimately tied to that function, and perhaps should be in utf8.c, - * except it is intimately tied to inversion lists as well. It assumes - * that is all 0's on input */ - - UV current = start; - const IV len = _invlist_len(invlist); - IV i; - const UV * array; - - PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH; - - if (len == 0) { /* Empty inversion list */ - return; - } - - array = invlist_array(invlist); - - /* Find which element it is */ - i = _invlist_search(invlist, start); - - /* We populate from to */ - while (current < end) { - UV upper; - - /* The inversion list gives the results for every possible code point - * after the first one in the list. Only those ranges whose index is - * even are ones that the inversion list matches. For the odd ones, - * and if the initial code point is not in the list, we have to skip - * forward to the next element */ - if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) { - i++; - if (i >= len) { /* Finished if beyond the end of the array */ - return; - } - current = array[i]; - if (current >= end) { /* Finished if beyond the end of what we - are populating */ - if (LIKELY(end < UV_MAX)) { - return; - } - - /* We get here when the upper bound is the maximum - * representable on the machine, and we are looking for just - * that code point. Have to special case it */ - i = len; - goto join_end_of_list; - } - } - assert(current >= start); - - /* The current range ends one below the next one, except don't go past - * */ - i++; - upper = (i < len && array[i] < end) ? array[i] : end; - - /* Here we are in a range that matches. Populate a bit in the 3-bit U8 - * for each code point in it */ - for (; current < upper; current++) { - const STRLEN offset = (STRLEN)(current - start); - swatch[offset >> 3] |= 1 << (offset & 7); - } - - join_end_of_list: - - /* Quit if at the end of the list */ - if (i >= len) { - - /* But first, have to deal with the highest possible code point on - * the platform. The previous code assumes that is one - * beyond where we want to populate, but that is impossible at the - * platform's infinity, so have to handle it specially */ - if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1))) - { - const STRLEN offset = (STRLEN)(end - start); - swatch[offset >> 3] |= 1 << (offset & 7); - } - return; - } - - /* Advance to the next range, which will be for code points not in the - * inversion list */ - current = array[i]; - } - - return; -} - -void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) { @@ -10271,18 +10258,15 @@ Perl__invlist_invert(pTHX_ SV* const invlist) SV* Perl_invlist_clone(pTHX_ SV* const invlist, SV* new_invlist) { - /* Return a new inversion list that is a copy of the input one, which is * unchanged. The new list will not be mortal even if the old one was. */ - const STRLEN nominal_length = _invlist_len(invlist); /* Why not +1 XXX */ + const STRLEN nominal_length = _invlist_len(invlist); const STRLEN physical_length = SvCUR(invlist); const bool offset = *(get_invlist_offset_addr(invlist)); PERL_ARGS_ASSERT_INVLIST_CLONE; - /* Need to allocate extra space to accommodate Perl's addition of a - * trailing NUL to SvPV's, since it thinks they are always strings */ if (new_invlist == NULL) { new_invlist = _new_invlist(nominal_length); } @@ -10573,7 +10557,8 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) /* * As best we can, determine the characters that can match the start of - * the given EXACTF-ish node. + * the given EXACTF-ish node. This is for use in creating ssc nodes, so there + * can be false positive matches * * Returns the invlist as a new SV*; it is the caller's responsibility to * call SvREFCNT_dec() when done with it. @@ -10581,6 +10566,7 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) STATIC SV* S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { + dVAR; const U8 * s = (U8*)STRING(node); SSize_t bytelen = STR_LEN(node); UV uc; @@ -10605,9 +10591,14 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) } else { /* Any Latin1 range character can potentially match any - * other depending on the locale */ + * other depending on the locale, and in Turkic locales, U+130 and + * U+131 */ if (OP(node) == EXACTFL) { _invlist_union(invlist, PL_Latin1, &invlist); + invlist = add_cp_to_invlist(invlist, + LATIN_SMALL_LETTER_DOTLESS_I); + invlist = add_cp_to_invlist(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); } else { /* But otherwise, it matches at least itself. We can @@ -10711,6 +10702,26 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) invlist = add_cp_to_invlist(invlist, c); } + + if (OP(node) == EXACTFL) { + + /* If either [iI] are present in an EXACTFL node the above code + * should have added its normal case pair, but under a Turkish + * locale they could match instead the case pairs from it. Add + * those as potential matches as well */ + if (isALPHA_FOLD_EQ(fc, 'I')) { + invlist = add_cp_to_invlist(invlist, + LATIN_SMALL_LETTER_DOTLESS_I); + invlist = add_cp_to_invlist(invlist, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE); + } + else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) { + invlist = add_cp_to_invlist(invlist, 'I'); + } + else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { + invlist = add_cp_to_invlist(invlist, 'i'); + } + } } } @@ -10923,7 +10934,7 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) return; default: fail_modifiers: - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); @@ -11026,6 +11037,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) I32 freeze_paren = 0; I32 after_freeze = 0; I32 num; /* numeric backreferences */ + SV * max_open; /* Max number of unclosed parens */ char * parse_start = RExC_parse; /* MJD */ char * const oregcomp_parse = RExC_parse; @@ -11035,6 +11047,17 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) PERL_ARGS_ASSERT_REG; DEBUG_PARSE("reg "); + + max_open = get_sv(RE_COMPILE_RECURSION_LIMIT, GV_ADD); + assert(max_open); + if (!SvIOK(max_open)) { + sv_setiv(max_open, RE_COMPILE_RECURSION_INIT); + } + if (depth > 4 * SvIV(max_open)) { /* We increase depth by 4 for each open + paren */ + vFAIL("Too many nested open parens"); + } + *flagp = 0; /* Tentatively. */ /* Having this true makes it feasible to have a lot fewer tests for the @@ -11323,7 +11346,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) } /* End of switch */ if ( ! op ) { - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; if (has_upper || verb_len == 0) { vFAIL2utf8f( "Unknown verb pattern '%" UTF8f "'", @@ -11403,7 +11428,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) return handle_named_backref(pRExC_state, flagp, parse_start, ')'); } - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); @@ -11620,7 +11645,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until * we know, by completing the parse to get all the * groups, and then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { RExC_parse++; vFAIL("Reference to nonexistent group"); } @@ -11646,7 +11671,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { if (num >= RExC_total_parens) { RExC_parse++; vFAIL("Reference to nonexistent group"); @@ -11678,7 +11703,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) case '?': /* (??...) */ is_logical = 1; if (*RExC_parse != '{') { - RExC_parse += SKIP_IF_CHAR(RExC_parse); + RExC_parse += SKIP_IF_CHAR(RExC_parse, RExC_end); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f( "Sequence (%" UTF8f "...) not recognized", @@ -11876,7 +11901,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) insert_if_check_paren: if (UCHARAT(RExC_parse) != ')') { - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Switch condition not recognized"); } nextchar(pRExC_state); @@ -11938,7 +11965,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) #endif return ret; } - RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += UTF + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Unknown switch condition (?(...))"); } case '[': /* (?[ ... ]) */ @@ -11948,6 +11977,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RExC_parse--; /* for vFAIL to print correctly */ vFAIL("Sequence (? incomplete"); break; + + case ')': + if (RExC_strict) { /* [perl #132851] */ + ckWARNreg(RExC_parse, "Empty (?) without any modifiers"); + } + /* FALLTHROUGH */ default: /* e.g., (?i) */ RExC_parse = (char *) seqstart + 1; parse_flags: @@ -11977,34 +12012,47 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) capturing_parens: parno = RExC_npar; RExC_npar++; - if (RExC_total_parens <= 0) { + if (! ALL_PARENS_COUNTED) { /* If we are in our first pass through (and maybe only pass), * we need to allocate memory for the capturing parentheses - * data structures. Since we start at npar=1, when it reaches - * 2, for the first time it has something to put in it. Above - * 2 means we extend what we already have */ - if (RExC_npar == 2) { + * data structures. + */ + + if (!RExC_parens_buf_size) { + /* first guess at number of parens we might encounter */ + RExC_parens_buf_size = 10; + /* setup RExC_open_parens, which holds the address of each * OPEN tag, and to make things simpler for the 0 index the * start of the program - this is used later for offsets */ - Newxz(RExC_open_parens, RExC_npar, regnode_offset); + Newxz(RExC_open_parens, RExC_parens_buf_size, + regnode_offset); RExC_open_parens[0] = 1; /* +1 for REG_MAGIC */ /* setup RExC_close_parens, which holds the address of each * CLOSE tag, and to make things simpler for the 0 index * the end of the program - this is used later for offsets * */ - Newxz(RExC_close_parens, RExC_npar, regnode_offset); + Newxz(RExC_close_parens, RExC_parens_buf_size, + regnode_offset); /* we dont know where end op starts yet, so we dont need to * set RExC_close_parens[0] like we do RExC_open_parens[0] * above */ } - else { - Renew(RExC_open_parens, RExC_npar, regnode_offset); - Zero(RExC_open_parens + RExC_npar - 1, 1, regnode_offset); + else if (RExC_npar > RExC_parens_buf_size) { + I32 old_size = RExC_parens_buf_size; + + RExC_parens_buf_size *= 2; + + Renew(RExC_open_parens, RExC_parens_buf_size, + regnode_offset); + Zero(RExC_open_parens + old_size, + RExC_parens_buf_size - old_size, regnode_offset); - Renew(RExC_close_parens, RExC_npar, regnode_offset); - Zero(RExC_close_parens + RExC_npar - 1, 1, regnode_offset); + Renew(RExC_close_parens, RExC_parens_buf_size, + regnode_offset); + Zero(RExC_close_parens + old_size, + RExC_parens_buf_size - old_size, regnode_offset); } } @@ -12016,7 +12064,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Setting open paren #%" IVdf " to %d\n", 22, "| |", (int)(depth * 2 + 1), "", - (IV)parno, REG_NODE_NUM(REGNODE_p(ret)))); + (IV)parno, ret)); RExC_open_parens[parno]= ret; } @@ -12086,7 +12134,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) RETURN_FAIL_ON_RESTART(flags, flagp); FAIL2("panic: regbranch returned failure, flags=%#" UVxf, (UV) flags); } - REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ + if (! REGTAIL(pRExC_state, lastbr, br)) { /* BRANCH -> BRANCH. */ + REQUIRE_BRANCHJ(flagp, 0); + } lastbr = br; *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } @@ -12105,7 +12155,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Setting close paren #%" IVdf " to %d\n", 22, "| |", (int)(depth * 2 + 1), "", - (IV)parno, REG_NODE_NUM(REGNODE_p(ender)))); + (IV)parno, ender)); RExC_close_parens[parno]= ender; if (RExC_nestroot == parno) RExC_nestroot = 0; @@ -12139,7 +12189,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Setting close paren #0 (END) to %d\n", 22, "| |", (int)(depth * 2 + 1), "", - REG_NODE_NUM(REGNODE_p(ender)))); + ender)); RExC_close_parens[0]= ender; } @@ -12151,13 +12201,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) regprop(RExC_rx, RExC_mysv2, REGNODE_p(ender), NULL, pRExC_state); Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", SvPV_nolen_const(RExC_mysv1), - (IV)REG_NODE_NUM(REGNODE_p(lastbr)), + (IV)lastbr, SvPV_nolen_const(RExC_mysv2), - (IV)REG_NODE_NUM(REGNODE_p(ender)), + (IV)ender, (IV)(ender - lastbr) ); ); - REGTAIL(pRExC_state, lastbr, ender); + if (! REGTAIL(pRExC_state, lastbr, ender)) { + REQUIRE_BRANCHJ(flagp, 0); + } if (have_branch) { char is_nothing= 1; @@ -12168,9 +12220,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) for (br = REGNODE_p(ret); br; br = regnext(br)) { const U8 op = PL_regkind[OP(br)]; if (op == BRANCH) { - REGTAIL_STUDY(pRExC_state, - REGNODE_OFFSET(NEXTOPER(br)), - ender); + if (! REGTAIL_STUDY(pRExC_state, + REGNODE_OFFSET(NEXTOPER(br)), + ender)) + { + REQUIRE_BRANCHJ(flagp, 0); + } if ( OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != REGNODE_p(ender)) is_nothing= 0; @@ -12201,7 +12256,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(ret_as_regnode), SvPV_nolen_const(RExC_mysv2), - (IV)REG_NODE_NUM(REGNODE_p(ender)), + (IV)ender, (IV)(ender - ret) ); ); @@ -12237,7 +12292,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp, U32 depth) Set_Node_Cur_Length(REGNODE_p(ret), parse_start); Set_Node_Offset(REGNODE_p(ret), parse_start + 1); FLAGS(REGNODE_p(ret)) = flag; - REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL)); + if (! REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL))) + { + REQUIRE_BRANCHJ(flagp, 0); + } } } @@ -12331,14 +12389,12 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) /* FIXME adding one for every branch after the first is probably * excessive now we have TRIE support. (hv) */ MARK_NAUGHTY(1); - if ( chain > (SSize_t) BRANCH_MAX_OFFSET - && ! RExC_use_BRANCHJ) - { + if (! REGTAIL(pRExC_state, chain, latest)) { /* XXX We could just redo this branch, but figuring out what - * bookkeeping needs to be reset is a pain */ + * bookkeeping needs to be reset is a pain, and it's likely + * that other branches that goto END will also be too large */ REQUIRE_BRANCHJ(flagp, 0); } - REGTAIL(pRExC_state, chain, latest); } chain = latest; c++; @@ -12638,20 +12694,23 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * points) that this \N sequence matches. This is set, and the input is * parsed for errors, even if the function returns FALSE, as detailed below. * - * There are 5 possibilities here, as detailed in the next 5 paragraphs. + * There are 6 possibilities here, as detailed in the next 6 paragraphs. * * Probably the most common case is for the \N to specify a single code point. * *cp_count will be set to 1, and *code_point_p will be set to that code * point. * - * Another possibility is for the input to be an empty \N{}, which for - * backwards compatibility we accept. *cp_count will be set to 0. *node_p - * will be set to a generated NOTHING node. + * Another possibility is for the input to be an empty \N{}. This is no + * longer accepted, and will generate a fatal error. + * + * Another possibility is for a custom charnames handler to be in effect which + * translates the input name to an empty string. *cp_count will be set to 0. + * *node_p will be set to a generated NOTHING node. * * Still another possibility is for the \N to mean [^\n]. *cp_count will be * set to 0. *node_p will be set to a generated REG_ANY node. * - * The fourth possibility is that \N resolves to a sequence of more than one + * The fifth possibility is that \N resolves to a sequence of more than one * code points. *cp_count will be set to the number of code points in the * sequence. *node_p will be set to a generated node returned by this * function calling S_reg(). @@ -12659,7 +12718,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * The final possibility is that it is premature to be calling this function; * the parse needs to be restarted. This can happen when this changes from * /d to /u rules, or when the pattern needs to be upgraded to UTF-8. The - * latter occurs only when the fourth possibility would otherwise be in + * latter occurs only when the fifth possibility would otherwise be in * effect, and is because one of those code points requires the pattern to be * recompiled as UTF-8. The function returns FALSE, and sets the * RESTART_PARSE and NEED_UTF8 flags in *flagp, as appropriate. When this @@ -12676,12 +12735,11 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, * so we need a way to take a snapshot of what they resolve to at the time of * the original parse. [perl #56444]. * - * That parsing is skipped for single-quoted regexes, so we may here get - * '\N{NAME}'. This is a fatal error. These names have to be resolved by the - * parser. But if the single-quoted regex is something like '\N{U+41}', that - * is legal and handled here. The code point is Unicode, and has to be - * translated into the native character set for non-ASCII platforms. - */ + * That parsing is skipped for single-quoted regexes, so here we may get + * '\N{NAME}', which is parsed now. If the single-quoted regex is something + * like '\N{U+41}', that code point is Unicode, and has to be translated into + * the native character set for non-ASCII platforms. The other possibilities + * are already native, so no translation is done. */ char * endbrace; /* points to '}' following the name */ char* p = RExC_parse; /* Temporary */ @@ -12690,7 +12748,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, char *orig_end; char *save_start; I32 flags; - Size_t count = 0; /* code point count kept internally by this function */ GET_RE_DEBUG_FLAGS_DECL; @@ -12713,7 +12770,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, /* Disambiguate between \N meaning a named character versus \N meaning * [^\n]. The latter is assumed when the {...} following the \N is a legal - * quantifier, or there is no '{' at all */ + * quantifier, or if there is no '{' at all */ if (*p != '{' || regcurly(p)) { RExC_parse = p; if (cp_count) { @@ -12746,15 +12803,19 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, vFAIL2("Missing right brace on \\%c{}", 'N'); } - /* Here, we have decided it should be a named character or sequence */ - REQUIRE_UNI_RULES(flagp, FALSE); /* Unicode named chars imply Unicode - semantics */ + /* Here, we have decided it should be a named character or sequence. These + * imply Unicode semantics */ + REQUIRE_UNI_RULES(flagp, FALSE); - if (endbrace == RExC_parse) { /* empty: \N{} */ + /* \N{_} is what toke.c returns to us to indicate a name that evaluates to + * nothing at all (not allowed under strict) */ + if (endbrace - RExC_parse == 1 && *RExC_parse == '_') { + RExC_parse = endbrace; if (strict) { RExC_parse++; /* Position after the "}" */ vFAIL("Zero length \\N{}"); } + if (cp_count) { *cp_count = 0; } @@ -12767,15 +12828,122 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, return TRUE; } - /* If we haven't got something that begins with 'U+', then it didn't get lexed. */ - if ( endbrace - RExC_parse < 2 - || strnNE(RExC_parse, "U+", 2)) - { - RExC_parse = endbrace; /* position msg's '<--HERE' */ - vFAIL("\\N{NAME} must be resolved by the lexer"); - } + if (endbrace - RExC_parse < 2 || ! strBEGINs(RExC_parse, "U+")) { + + /* Here, the name isn't of the form U+.... This can happen if the + * pattern is single-quoted, so didn't get evaluated in toke.c. Now + * is the time to find out what the name means */ + + const STRLEN name_len = endbrace - RExC_parse; + SV * value_sv; /* What does this name evaluate to */ + SV ** value_svp; + const U8 * value; /* string of name's value */ + STRLEN value_len; /* and its length */ + + /* RExC_unlexed_names is a hash of names that weren't evaluated by + * toke.c, and their values. Make sure is initialized */ + if (! RExC_unlexed_names) { + RExC_unlexed_names = newHV(); + } + + /* If we have already seen this name in this pattern, use that. This + * allows us to only call the charnames handler once per name per + * pattern. A broken or malicious handler could return something + * different each time, which could cause the results to vary depending + * on if something gets added or subtracted from the pattern that + * causes the number of passes to change, for example */ + if ((value_svp = hv_fetch(RExC_unlexed_names, RExC_parse, + name_len, 0))) + { + value_sv = *value_svp; + } + else { /* Otherwise we have to go out and get the name */ + const char * error_msg = NULL; + value_sv = get_and_check_backslash_N_name(RExC_parse, endbrace, + UTF, + &error_msg); + if (error_msg) { + RExC_parse = endbrace; + vFAIL(error_msg); + } + + /* If no error message, should have gotten a valid return */ + assert (value_sv); + + /* Save the name's meaning for later use */ + if (! hv_store(RExC_unlexed_names, RExC_parse, name_len, + value_sv, 0)) + { + Perl_croak(aTHX_ "panic: hv_store() unexpectedly failed"); + } + } + + /* Here, we have the value the name evaluates to in 'value_sv' */ + value = (U8 *) SvPV(value_sv, value_len); + + /* See if the result is one code point vs 0 or multiple */ + if (value_len > 0 && value_len <= ((SvUTF8(value_sv)) + ? UTF8SKIP(value) + : 1)) + { + /* Here, exactly one code point. If that isn't what is wanted, + * fail */ + if (! code_point_p) { + RExC_parse = p; + return FALSE; + } + + /* Convert from string to numeric code point */ + *code_point_p = (SvUTF8(value_sv)) + ? valid_utf8_to_uvchr(value, NULL) + : *value; + + /* Have parsed this entire single code point \N{...}. *cp_count + * has already been set to 1, so don't do it again. */ + RExC_parse = endbrace; + nextchar(pRExC_state); + return TRUE; + } /* End of is a single code point */ + + /* Count the code points, if caller desires. The API says to do this + * even if we will later return FALSE */ + if (cp_count) { + *cp_count = 0; + + *cp_count = (SvUTF8(value_sv)) + ? utf8_length(value, value + value_len) + : value_len; + } + + /* Fail if caller doesn't want to handle a multi-code-point sequence. + * But don't back the pointer up if the caller wants to know how many + * code points there are (they need to handle it themselves in this + * case). */ + if (! node_p) { + if (! cp_count) { + RExC_parse = p; + } + return FALSE; + } + + /* Convert this to a sub-pattern of the form "(?: ... )", and then call + * reg recursively to parse it. That way, it retains its atomicness, + * while not having to worry about any special handling that some code + * points may have. */ + + substitute_parse = newSVpvs("?:"); + sv_catsv(substitute_parse, value_sv); + sv_catpv(substitute_parse, ")"); + +#ifdef EBCDIC + /* The value should already be native, so no need to convert on EBCDIC + * platforms.*/ + assert(! RExC_recode_x_to_native); +#endif - /* This code purposely indented below because of future changes coming */ + } + else { /* \N{U+...} */ + Size_t count = 0; /* code point count kept internally */ /* We can get to here when the input is \N{U+...} or when toke.c has * converted a name to the \N{U+...} form. This include changing a @@ -12910,6 +13078,8 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_recode_x_to_native = 1; #endif + } + /* Here, we have the string the name evaluates to, ready to be parsed, * stored in 'substitute_parse' as a series of valid "\x{...}\x{...}" * constructs. This can be called from within a substitute parse already. @@ -12973,169 +13143,6 @@ S_compute_EXACTish(RExC_state_t *pRExC_state) return op + EXACTF; } -PERL_STATIC_INLINE void -S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, - regnode_offset node, I32* flagp, STRLEN len, - UV code_point, bool downgradable) -{ - /* This knows the details about sizing an EXACTish node, setting flags for - * it (by setting <*flagp>, and potentially populating it with a single - * character. - * - * If (the length in bytes) is non-zero, this function assumes that - * the node has already been populated, and just does the sizing. In this - * case should be the final code point that has already been - * placed into the node. This value will be ignored except that under some - * circumstances <*flagp> is set based on it. - * - * If is zero, the function assumes that the node is to contain only - * the single character given by and calculates what - * should be. It populates the node's STRING with or its - * fold if folding. - * - * In both cases <*flagp> is appropriately set - * - * It knows that under FOLD, the Latin Sharp S and UTF characters above - * 255, must be folded (the former only when the rules indicate it can - * match 'ss') - * - * When it does the populating, it looks at the flag 'downgradable'. If - * true with a node that folds, it checks if the single code point - * participates in a fold, and if not downgrades the node to an EXACT. - * This helps the optimizer */ - - bool len_passed_in = cBOOL(len != 0); - U8 character[UTF8_MAXBYTES_CASE+1]; - - PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT; - - if (! len_passed_in) { - if (UTF) { - if (UVCHR_IS_INVARIANT(code_point)) { - if (LOC || ! FOLD) { /* /l defers folding until runtime */ - *character = (U8) code_point; - } - else { /* Here is /i and not /l. */ - *character = toFOLD((U8) code_point); - - /* We can downgrade to an EXACT node if this character - * isn't a folding one. Note that this assumes that - * nothing above Latin1 folds to some other invariant than - * one of these alphabetics; otherwise we would also have - * to check: - * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) - * || ASCII_FOLD_RESTRICTED)) - */ - if (downgradable && PL_fold[code_point] == code_point) { - OP(REGNODE_p(node)) = EXACT; - } - } - len = 1; - } - else if (FOLD && ( ! LOC - || ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point))) - { /* Folding, and ok to do so now */ - UV folded = _to_uni_fold_flags( - code_point, - character, - &len, - FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED) - ? FOLD_FLAGS_NOMIX_ASCII - : 0)); - if (downgradable - && folded == code_point /* This quickly rules out many - cases, avoiding the - _invlist_contains_cp() overhead - for those. */ - && ! _invlist_contains_cp(PL_in_some_fold, code_point)) - { - OP(REGNODE_p(node)) = (LOC) - ? EXACTL - : EXACT; - } - } - else if (code_point <= MAX_UTF8_TWO_BYTE) { - - /* Not folding this cp, and can output it directly */ - *character = UTF8_TWO_BYTE_HI(code_point); - *(character + 1) = UTF8_TWO_BYTE_LO(code_point); - len = 2; - } - else { - uvchr_to_utf8( character, code_point); - len = UTF8SKIP(character); - } - } /* Else pattern isn't UTF8. */ - else if (! FOLD) { - *character = (U8) code_point; - len = 1; - } /* Else is folded non-UTF8 */ -#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ - || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ - || UNICODE_DOT_DOT_VERSION > 0) - else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) { -#else - else if (1) { -#endif - *character = (U8) (DEPENDS_SEMANTICS) - ? toFOLD(code_point) - : (LOC) - ? code_point - : toLOWER_L1(code_point); - len = 1; - - /* Can turn into an EXACT node if we know the fold at compile time, - * and it folds to itself and doesn't particpate in other folds */ - if (downgradable - && ! LOC - && PL_fold_latin1[code_point] == code_point - && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point) - || (isASCII(code_point) && ASCII_FOLD_RESTRICTED))) - { - OP(REGNODE_p(node)) = EXACT; - } - } /* else is Sharp s. May need to fold it */ - else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) { - *character = 's'; - *(character + 1) = 's'; - len = 2; - } - else { - *character = LATIN_SMALL_LETTER_SHARP_S; - len = 1; - } - } - - if (downgradable) { - change_engine_size(pRExC_state, STR_SZ(len)); - } - - RExC_emit += STR_SZ(len); - STR_LEN(REGNODE_p(node)) = len; - if (! len_passed_in) { - Copy((char *) character, STRING(REGNODE_p(node)), len, char); - } - - *flagp |= HASWIDTH; - - /* A single character node is SIMPLE, except for the special-cased SHARP S - * under /di. */ - if ((len == 1 || (UTF && len == UVCHR_SKIP(code_point))) -#if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \ - || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \ - || UNICODE_DOT_DOT_VERSION > 0) - && ( code_point != LATIN_SMALL_LETTER_SHARP_S - || ! FOLD || ! DEPENDS_SEMANTICS) -#endif - ) { - *flagp |= SIMPLE; - } - - if (OP(REGNODE_p(node)) == EXACTFL) { - RExC_contains_locale = 1; - } -} - STATIC bool S_new_regcurly(const char *s, const char *e) { @@ -13269,6 +13276,7 @@ S_backref_value(char *p, char *e) STATIC regnode_offset S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { + dVAR; regnode_offset ret = 0; I32 flags = 0; char *parse_start; @@ -13441,25 +13449,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* FALLTHROUGH */ case 'b': { + U8 flags = 0; regex_charset charset = get_regex_charset(RExC_flags); RExC_seen_zerolen++; RExC_seen |= REG_LOOKBEHIND_SEEN; op = BOUND + charset; - if (op == BOUND) { - RExC_seen_d_op = TRUE; - } - else if (op == BOUNDL) { - RExC_contains_locale = 1; - } - - ret = reg_node(pRExC_state, op); - *flagp |= SIMPLE; if (RExC_parse >= RExC_end || *(RExC_parse + 1) != '{') { - FLAGS(REGNODE_p(ret)) = TRADITIONAL_BOUND; + flags = TRADITIONAL_BOUND; if (op > BOUNDA) { /* /aa is same as /a */ - OP(REGNODE_p(ret)) = BOUNDA; + op = BOUNDA; } } else { @@ -13467,9 +13467,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) char name = *RExC_parse; char * endbrace = NULL; RExC_parse += 2; - if (RExC_parse < RExC_end) { - endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); - } + endbrace = (char *) memchr(RExC_parse, '}', RExC_end - RExC_parse); if (! endbrace) { vFAIL2("Missing right brace on \\%c{}", name); @@ -13495,25 +13493,25 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { goto bad_bound_type; } - FLAGS(REGNODE_p(ret)) = GCB_BOUND; + flags = GCB_BOUND; break; case 'l': if (length != 2 || *(RExC_parse + 1) != 'b') { goto bad_bound_type; } - FLAGS(REGNODE_p(ret)) = LB_BOUND; + flags = LB_BOUND; break; case 's': if (length != 2 || *(RExC_parse + 1) != 'b') { goto bad_bound_type; } - FLAGS(REGNODE_p(ret)) = SB_BOUND; + flags = SB_BOUND; break; case 'w': if (length != 2 || *(RExC_parse + 1) != 'b') { goto bad_bound_type; } - FLAGS(REGNODE_p(ret)) = WB_BOUND; + flags = WB_BOUND; break; default: bad_bound_type: @@ -13526,8 +13524,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = endbrace; REQUIRE_UNI_RULES(flagp, 0); - if (op >= BOUNDA) { /* /aa is same as /a */ - OP(REGNODE_p(ret)) = BOUNDU; + if (op == BOUND) { + op = BOUNDU; + } + else if (op >= BOUNDA) { /* /aa is same as /a */ + op = BOUNDU; length += 4; /* Don't have to worry about UTF-8, in this message because @@ -13542,9 +13543,22 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } } + if (op == BOUND) { + RExC_seen_d_op = TRUE; + } + else if (op == BOUNDL) { + RExC_contains_locale = 1; + } + if (invert) { - OP(REGNODE_p(ret)) += NBOUND - BOUND; + op += NBOUND - BOUND; } + + ret = reg_node(pRExC_state, op); + FLAGS(REGNODE_p(ret)) = flags; + + *flagp |= SIMPLE; + goto finish_meta_pat; } @@ -13798,7 +13812,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* It might be a forward reference; we can't fail until we * know, by completing the parse to get all the groups, and * then reparsing */ - if (RExC_total_parens > 0) { + if (ALL_PARENS_COUNTED) { if (num >= RExC_total_parens) { vFAIL("Reference to nonexistent group"); } @@ -13900,7 +13914,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* We can convert EXACTF nodes to EXACTFU if they contain only * characters that match identically regardless of the target * string's UTF8ness. The reason to do this is that EXACTF is not - * trie-able, EXACTFU is. + * trie-able, EXACTFU is, and EXACTFU requires fewer operations at + * runtime. * * Similarly, we can convert EXACTFL nodes to EXACTFLU8 if they * contain only above-Latin1 characters (hence must be in UTF8), @@ -13949,7 +13964,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); - /* Here, we have a literal character. Find the maximal string of * them in the input that we can fit into a single EXACTish node. * We quit at the first non-literal or when the node gets full, or @@ -14324,7 +14338,8 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } - /* This code point means we can't simplify things */ + /* This problematic code point means we can't simplify + * things */ maybe_exactfu = FALSE; /* Here, we are adding a problematic fold character. @@ -14499,14 +14514,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * identifies, so when it is set to less than the full node, we can * skip the rest of this */ if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) { + PERL_UINT_FAST8_T backup_count = 0; const STRLEN full_len = len; assert(len >= MAX_NODE_STRING_SIZE); - /* Here, points to the final byte of the final character. - * Look backwards through the string until find a non- - * problematic character */ + /* Here, points to just beyond where we have output the + * final character of the node. Look backwards through the + * string until find a non- problematic character */ if (! UTF) { @@ -14515,13 +14531,15 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto loopdone; } - while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { } + while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { + backup_count++; + } len = s - s0 + 1; } else { /* Point to the first byte of the final character */ - s = (char *) utf8_hop((U8 *) s, -1); + s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0); while (s >= s0) { /* Search backwards until find a non-problematic char */ @@ -14557,6 +14575,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * special case the very first byte in the string, so * we don't read outside the string */ s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1); + backup_count++; } /* End of loop backwards through the string */ /* If there were only problematic characters in the string, @@ -14580,12 +14599,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } else { /* Here, the node does contain some characters that aren't - * problematic. If one such is the final character in the - * node, we are done */ - if (len == full_len) { + * problematic. If we didn't have to backup any, then the + * final character in the node is non-problematic, and we + * can take the node as-is */ + if (backup_count == 0) { goto loopdone; } - else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) { + else if (backup_count == 1) { /* If the final character is problematic, but the * penultimate is not, back-off that last character to @@ -14752,9 +14772,16 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) * sets up the bitmap and any flags, removing those code points from the * inversion list, setting it to NULL should it become completely empty */ + dVAR; + PERL_ARGS_ASSERT_POPULATE_ANYOF_FROM_INVLIST; assert(PL_regkind[OP(node)] == ANYOF); + /* There is no bitmap for this node type */ + if (OP(node) == ANYOFH) { + return; + } + ANYOF_BITMAP_ZERO(node); if (*invlist_ptr) { @@ -15860,7 +15887,9 @@ redo_curchar: RExC_parse = RExC_end; } else if (RExC_parse != save_parse) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; } vFAIL("Expecting '(?flags:(?[...'"); } @@ -16630,7 +16659,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS * characters, with the corresponding bit set if that character is in the - * list. For characters above this, a range list or swash is used. There + * list. For characters above this, an inversion list is used. There * are extra bits for \w, etc. in locale ANYOFs, as what these match is not * determinable at compile time * @@ -16642,15 +16671,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * UTF-8 */ + dVAR; UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE; IV range = 0; UV value = OOB_UNICODE, save_value = OOB_UNICODE; - regnode_offset ret; + regnode_offset ret = -1; /* Initialized to an illegal value */ STRLEN numlen; int namedclass = OOB_NAMEDCLASS; char *rangebegin = NULL; - bool need_class = 0; - SV *listsv = NULL; + SV *listsv = NULL; /* List of \p{user-defined} whose definitions + aren't available at the time this was called */ STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more than just initialized. */ SV* properties = NULL; /* Code points that match \p{} \P{} */ @@ -16679,18 +16709,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const bool skip_white = cBOOL( ret_invlist || (RExC_flags & RXf_PMf_EXTENDED_MORE)); - /* Unicode properties are stored in a swash; this holds the current one - * being parsed. If this swash is the only above-latin1 component of the - * character class, an optimization is to pass it directly on to the - * execution engine. Otherwise, it is set to NULL to indicate that there - * are other things in the class that have to be dealt with at execution - * time */ - SV* swash = NULL; /* Code points that match \p{} \P{} */ - - /* Set if a component of this character class is user-defined; just passed - * on to the engine */ - bool has_user_defined_property = FALSE; - /* inversion list of code points this node matches only when the target * string is in UTF-8. These are all non-ASCII, < 256. (Because is under * /d) */ @@ -16724,7 +16742,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, bool warn_super = ALWAYS_WARN_SUPER; const char * orig_parse = RExC_parse; - bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ /* This variable is used to mark where the end in the input is of something * that looks like a POSIX construct but isn't. During the parse, when @@ -16739,7 +16756,17 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, one. */ U8 anyof_flags = 0; /* flag bits if the node is an ANYOF-type */ U32 posixl = 0; /* bit field of posix classes matched under /l */ - bool use_anyofd = FALSE; /* ? Is this to be an ANYOFD node */ + + +/* Flags as to what things aren't knowable until runtime. (Note that these are + * mutually exclusive.) */ +#define HAS_USER_DEFINED_PROPERTY 0x01 /* /u any user-defined properties that + haven't been defined as of yet */ +#define HAS_D_RUNTIME_DEPENDENCY 0x02 /* /d if the target being matched is + UTF-8 or not */ +#define HAS_L_RUNTIME_DEPENDENCY 0x04 /* /l what the posix classes match and + what gets folded */ + U32 has_runtime_dependency = 0; /* OR of the above flags */ GET_RE_DEBUG_FLAGS_DECL; @@ -16763,7 +16790,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, allow_multi_folds = FALSE; #endif - listsv = newSVpvs_flags("# comment\n", SVs_TEMP); + /* We include the /i status at the beginning of this so that we can + * know it at runtime */ + listsv = sv_2mortal(Perl_newSVpvf(aTHX_ "#%d\n", cBOOL(FOLD))); initial_listsv_len = SvCUR(listsv); SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */ @@ -16964,6 +16993,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, "Ignoring zero length \\N{} in character class"); } else { /* cp_count > 1 */ + assert(cp_count > 1); if (! RExC_in_multi_char_class) { if (invert || range || *RExC_parse == '-') { if (strict) { @@ -17002,17 +17032,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, case 'P': { char *e; - char *i; - - /* We will handle any undefined properties ourselves */ - U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF - /* And we actually would prefer to get - * the straight inversion list of the - * swash, since we will be accessing it - * anyway, to save a little time */ - |_CORE_SWASH_INIT_ACCEPT_INVLIST; - - SvREFCNT_dec(swash); /* Free any left-overs */ /* \p means they want Unicode semantics */ REQUIRE_UNI_RULES(flagp, 0); @@ -17057,7 +17076,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* The \p isn't immediately followed by a '{' */ else if (! isALPHA(*RExC_parse)) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL2("Character following \\%c must be '{' or a " "single-character Unicode property name", (U8) value); @@ -17068,169 +17089,88 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } { char* name = RExC_parse; - char* base_name; /* name after any packages are stripped */ - char* lookup_name = NULL; - const char * const colon_colon = "::"; - bool invert; - - SV* invlist; - - /* Temporary workaround for [perl #133136]. For this - * precise input that is in the .t that is failing, load - * utf8.pm, which is what the test wants, so that that - * .t passes */ - if ( memEQs(RExC_start, e + 1 - RExC_start, - "foo\\p{Alnum}") - && ! hv_common(GvHVn(PL_incgv), - NULL, - "utf8.pm", sizeof("utf8.pm") - 1, - 0, HV_FETCH_ISEXISTS, NULL, 0)) - { - require_pv("utf8.pm"); - } - invlist = parse_uniprop_string(name, n, FOLD, &invert); - if (invlist) { - if (invert) { - value ^= 'P' ^ 'p'; - } - } - else { - /* Try to get the definition of the property into - * . If /i is in effect, the effective property - * will have its name be <__NAME_i>. The design is - * discussed in commit - * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse)); - SAVEFREEPV(name); - - for (i = RExC_parse; i < RExC_parse + n; i++) { - if (isCNTRL(*i) && *i != '\t') { - RExC_parse = e + 1; - vFAIL2("Can't find Unicode property definition \"%s\"", name); + /* Any message returned about expanding the definition */ + SV* msg = newSVpvs_flags("", SVs_TEMP); + + /* If set TRUE, the property is user-defined as opposed to + * official Unicode */ + bool user_defined = FALSE; + + SV * prop_definition = parse_uniprop_string( + name, n, UTF, FOLD, + FALSE, /* This is compile-time */ + + /* We can't defer this defn when + * the full result is required in + * this call */ + ! cBOOL(ret_invlist), + + &user_defined, + msg, + 0 /* Base level */ + ); + if (SvCUR(msg)) { /* Assumes any error causes a msg */ + assert(prop_definition == NULL); + RExC_parse = e + 1; + if (SvUTF8(msg)) { /* msg being UTF-8 makes the whole + thing so, or else the display is + mojibake */ + RExC_utf8 = TRUE; } + /* diag_listed_as: Can't find Unicode property definition "%s" in regex; marked by <-- HERE in m/%s/ */ + vFAIL2utf8f("%" UTF8f, UTF8fARG(SvUTF8(msg), + SvCUR(msg), SvPVX(msg))); } - if (FOLD) { - lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name)); + if (! is_invlist(prop_definition)) { - /* The function call just below that uses this can fail - * to return, leaking memory if we don't do this */ - SAVEFREEPV(lookup_name); - } - - /* Look up the property name, and get its swash and - * inversion list, if the property is found */ - swash = _core_swash_init("utf8", - (lookup_name) - ? lookup_name - : name, - &PL_sv_undef, - 1, /* binary */ - 0, /* not tr/// */ - NULL, /* No inversion list */ - &swash_init_flags - ); - if (! swash || ! (invlist = _get_swash_invlist(swash))) { - HV* curpkg = (IN_PERL_COMPILETIME) - ? PL_curstash - : CopSTASH(PL_curcop); - UV final_n = n; - bool has_pkg; - - if (swash) { /* Got a swash but no inversion list. - Something is likely wrong that will - be sorted-out later */ - SvREFCNT_dec_NN(swash); - swash = NULL; - } - - /* Here didn't find it. It could be a an error (like a - * typo) in specifying a Unicode property, or it could - * be a user-defined property that will be available at - * run-time. The names of these must begin with 'In' - * or 'Is' (after any packages are stripped off). So - * if not one of those, or if we accept only - * compile-time properties, is an error; otherwise add - * it to the list for run-time look up. */ - if ((base_name = rninstr(name, name + n, - colon_colon, colon_colon + 2))) - { /* Has ::. We know this must be a user-defined - property */ - base_name += 2; - final_n -= base_name - name; - has_pkg = TRUE; + /* Here, the definition isn't known, so we have gotten + * returned a string that will be evaluated if and when + * encountered at runtime. We add it to the list of + * such properties, along with whether it should be + * complemented or not */ + if (value == 'P') { + sv_catpvs(listsv, "!"); } else { - base_name = name; - has_pkg = FALSE; - } - - if ( final_n < 3 - || base_name[0] != 'I' - || (base_name[1] != 's' && base_name[1] != 'n') - || ret_invlist) - { - const char * const msg - = (has_pkg) - ? "Illegal user-defined property name" - : "Can't find Unicode property definition"; - RExC_parse = e + 1; - - /* diag_listed_as: Can't find Unicode property definition "%s" */ - vFAIL3utf8f("%s \"%" UTF8f "\"", - msg, UTF8fARG(UTF, n, name)); + sv_catpvs(listsv, "+"); } + sv_catsv(listsv, prop_definition); - /* If the property name doesn't already have a package - * name, add the current one to it so that it can be - * referred to outside it. [perl #121777] */ - if (! has_pkg && curpkg) { - char* pkgname = HvNAME(curpkg); - if (memNEs(pkgname, HvNAMELEN(curpkg), "main")) { - char* full_name = Perl_form(aTHX_ - "%s::%s", - pkgname, - name); - n = strlen(full_name); - name = savepvn(full_name, n); - SAVEFREEPV(name); - } - } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n", - (value == 'p' ? '+' : '!'), - (FOLD) ? "__" : "", - UTF8fARG(UTF, n, name), - (FOLD) ? "_i" : ""); - has_user_defined_property = TRUE; - optimizable = FALSE; /* Will have to leave this an - ANYOF node */ + has_runtime_dependency |= HAS_USER_DEFINED_PROPERTY; /* We don't know yet what this matches, so have to flag * it */ anyof_flags |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } else { + assert (prop_definition && is_invlist(prop_definition)); - /* Here, did get the swash and its inversion list. If - * the swash is from a user-defined property, then this - * whole character class should be regarded as such */ - if (swash_init_flags - & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY) + /* Here we do have the complete property definition + * + * Temporary workaround for [perl #133136]. For this + * precise input that is in the .t that is failing, + * load utf8.pm, which is what the test wants, so that + * that .t passes */ + if ( memEQs(RExC_start, e + 1 - RExC_start, + "foo\\p{Alnum}") + && ! hv_common(GvHVn(PL_incgv), + NULL, + "utf8.pm", sizeof("utf8.pm") - 1, + 0, HV_FETCH_ISEXISTS, NULL, 0)) { - has_user_defined_property = TRUE; + require_pv("utf8.pm"); } - } - } - if (invlist) { - if (! has_user_defined_property && + + if (! user_defined && /* We warn on matching an above-Unicode code point * if the match would return true, except don't * warn for \p{All}, which has exactly one element * = 0 */ - (_invlist_contains_cp(invlist, 0x110000) - && (! (_invlist_len(invlist) == 1 - && *invlist_array(invlist) == 0)))) + (_invlist_contains_cp(prop_definition, 0x110000) + && (! (_invlist_len(prop_definition) == 1 + && *invlist_array(prop_definition) == 0)))) { warn_super = TRUE; } @@ -17238,23 +17178,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Invert if asking for the complement */ if (value == 'P') { _invlist_union_complement_2nd(properties, - invlist, + prop_definition, &properties); - - /* The swash can't be used as-is, because we've - * inverted things; delay removing it to here after - * have copied its invlist above */ - if (! swash) { - SvREFCNT_dec_NN(invlist); - } - SvREFCNT_dec(swash); - swash = NULL; } else { - _invlist_union(properties, invlist, &properties); - if (! swash) { - SvREFCNT_dec_NN(invlist); - } + _invlist_union(properties, prop_definition, &properties); } } } @@ -17325,7 +17253,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse += numlen; if (numlen != 3) { if (strict) { - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; vFAIL("Need exactly 3 octal digits"); } else if ( numlen < 3 /* like \08, \178 */ @@ -17398,43 +17328,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SV* scratch_list = NULL; /* What the Posix classes (like \w, [:space:]) match in locale - * isn't knowable under locale until actual match time. Room - * must be reserved (one time per outer bracketed class) to - * store such classes. The space will contain a bit for each - * named class that is to be matched against. This isn't - * needed for \p{} and pseudo-classes, as they are not affected - * by locale, and hence are dealt with separately */ - if (! need_class) { - need_class = 1; - anyof_flags |= ANYOF_MATCHES_POSIXL; - - /* We can't change this into some other type of node - * (unless this is the only element, in which case there - * are nodes that mean exactly this) as has runtime - * dependencies */ - optimizable = FALSE; - } - - /* Coverity thinks it is possible for this to be negative; both - * jhi and khw think it's not, but be safer */ - assert(! (anyof_flags & ANYOF_MATCHES_POSIXL) - || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0); - - /* See if it already matches the complement of this POSIX - * class */ - if ( (anyof_flags & ANYOF_MATCHES_POSIXL) - && POSIXL_TEST(posixl, namedclass + ((namedclass % 2) - ? -1 - : 1))) - { - posixl_matches_all = TRUE; - break; /* No need to continue. Since it matches both - e.g., \w and \W, it matches everything, and the - bracketed class can be optimized into qr/./s */ - } - - /* Add this class to those that should be checked at runtime */ + * isn't knowable under locale until actual match time. A + * special node is used for these which has extra space for a + * bitmap, with a bit reserved for each named class that is to + * be matched against. This isn't needed for \p{} and + * pseudo-classes, as they are not affected by locale, and + * hence are dealt with separately */ POSIXL_SET(posixl, namedclass); + has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; + anyof_flags |= ANYOF_MATCHES_POSIXL; /* The above-Latin1 characters are not subject to locale rules. * Just add them to the unconditionally-matched list */ @@ -17621,7 +17523,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * * See [perl #89750] */ if (FOLD && allow_multi_folds && value == prevvalue) { - if (value == LATIN_SMALL_LETTER_SHARP_S + if ( value == LATIN_SMALL_LETTER_SHARP_S || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold, value))) { @@ -17963,8 +17865,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Our calculated list will be for Unicode rules. For locale * matching, we have to keep a separate list that is consulted at - * runtime only when the locale indicates Unicode rules. For - * non-locale, we just use the general list */ + * runtime only when the locale indicates Unicode rules (and we + * don't include potential matches in the ASCII/Latin1 range, as + * any code point could fold to any other, based on the run-time + * locale). For non-locale, we just use the general list */ if (LOC) { use_list = &only_utf8_locale_list; } @@ -17996,7 +17900,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, if (j < 256) { - if (IS_IN_SOME_FOLD_L1(j)) { + /* Under /l, we don't know what code points below 256 + * fold to, except we do know the MICRO SIGN folds to + * an above-255 character if the locale is UTF-8, so we + * add it to the special list (in *use_list) Otherwise + * we know now what things can match, though some folds + * are valid under /d only if the target is UTF-8. + * Those go in a separate list */ + if ( IS_IN_SOME_FOLD_L1(j) + && ! (LOC && j != MICRO_SIGN)) + { /* ASCII is always matched; non-ASCII is matched * only under Unicode rules (which could happen @@ -18087,8 +18000,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* And combine the result (if any) with any inversion lists from posix * classes. The lists are kept separate up to now because we don't want to - * fold the classes (folding of those is automatically handled by the swash - * fetching code) */ + * fold the classes */ if (simple_posixes) { /* These are the classes known to be unaffected by /a, /aa, and /d */ if (cp_list) { @@ -18128,7 +18040,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * the target string is in UTF-8. But things like \W match all the * upper Latin1 characters if the target string is not in UTF-8. * - * Handle the case where there something like \W separately */ + * Handle the case with something like \W separately */ if (nposixes) { SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1, NULL); @@ -18202,9 +18114,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec(nonascii_but_latin1_properties); - /* Get rid of any characters that we now know are matched - * unconditionally from the conditional list, which may make - * that list empty */ + /* Get rid of any characters from the conditional list that we + * now know are matched unconditionally, which may make that + * list empty */ _invlist_subtract(upper_latin1_only_utf8_matches, cp_list, &upper_latin1_only_utf8_matches); @@ -18269,10 +18181,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * folded until runtime */ /* If we didn't do folding, it's because some information isn't available - * until runtime; set the run-time fold flag for these. (We don't have to - * worry about properties folding, as that is taken care of by the swash - * fetching). We know to set the flag if we have a non-NULL list for UTF-8 - * locales, or the class matches at least one 0-255 range code point */ + * until runtime; set the run-time fold flag for these We know to set the + * flag if we have a non-NULL list for UTF-8 locales, or the class matches + * at least one 0-255 range code point */ if (LOC && FOLD) { /* Some things on the list might be unconditionally included because of @@ -18287,7 +18198,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, only_utf8_locale_list = NULL; } } - if (only_utf8_locale_list) { + if ( only_utf8_locale_list + || (cp_list && ( _invlist_contains_cp(cp_list, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) + || _invlist_contains_cp(cp_list, LATIN_SMALL_LETTER_DOTLESS_I)))) + { + has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; anyof_flags |= ANYOFL_FOLD | ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD; @@ -18297,6 +18212,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, invlist_iterinit(cp_list); if (invlist_iternext(cp_list, &start, &end) && start < 256) { anyof_flags |= ANYOFL_FOLD; + has_runtime_dependency |= HAS_L_RUNTIME_DEPENDENCY; } invlist_iterfinish(cp_list); } @@ -18305,336 +18221,814 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, && ( upper_latin1_only_utf8_matches || (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))) { - use_anyofd = TRUE; RExC_seen_d_op = TRUE; - optimizable = FALSE; + has_runtime_dependency |= HAS_D_RUNTIME_DEPENDENCY; } - /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known - * at compile time. Besides not inverting folded locale now, we can't - * invert if there are things such as \w, which aren't known until runtime - * */ + /* Optimize inverted patterns (e.g. [^a-z]) when everything is known at + * compile time. */ if ( cp_list && invert - && ! use_anyofd - && ! (anyof_flags & (ANYOF_LOCALE_FLAGS)) - && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) + && ! has_runtime_dependency) { _invlist_invert(cp_list); - /* Any swash can't be used as-is, because we've inverted things */ - if (swash) { - SvREFCNT_dec_NN(swash); - swash = NULL; - } - /* Clear the invert flag since have just done it here */ invert = FALSE; } if (ret_invlist) { *ret_invlist = cp_list; - SvREFCNT_dec(swash); return RExC_emit; } + /* All possible optimizations below still have these characteristics. + * (Multi-char folds aren't SIMPLE, but they don't get this far in this + * routine) */ + *flagp |= HASWIDTH|SIMPLE; + + if (anyof_flags & ANYOF_LOCALE_FLAGS) { + RExC_contains_locale = 1; + } + /* Some character classes are equivalent to other nodes. Such nodes take - * up less room and generally fewer operations to execute than ANYOF nodes. - * */ + * up less room, and some nodes require fewer operations to execute, than + * ANYOF nodes. EXACTish nodes may be joinable with adjacent nodes to + * improve efficiency. */ if (optimizable) { - int posix_class = -1; /* Illegal value */ - U8 ANYOFM_mask = 0xFF; - UV start, end; + PERL_UINT_FAST8_T i; + Size_t partial_cp_count = 0; + UV start[MAX_FOLD_FROMS+1] = { 0 }; /* +1 for the folded-to char */ + UV end[MAX_FOLD_FROMS+1] = { 0 }; - if (UNLIKELY(posixl_matches_all)) { - ret = reg_node(pRExC_state, SANY); - goto not_anyof; - } + if (cp_list) { /* Count the code points in enough ranges that we would + see all the ones possible in any fold in this version + of Unicode */ - if (cp_list && ! invert) { invlist_iterinit(cp_list); - if (! invlist_iternext(cp_list, &start, &end)) { + for (i = 0; i <= MAX_FOLD_FROMS; i++) { + if (! invlist_iternext(cp_list, &start[i], &end[i])) { + break; + } + partial_cp_count += end[i] - start[i] + 1; + } + + invlist_iterfinish(cp_list); + } - /* Here, the list is empty. This happens, for example, when a - * Unicode property that doesn't match anything is the only - * element in the character class (perluniprops.pod notes such - * properties). */ + /* If we know at compile time that this matches every possible code + * point, any run-time dependencies don't matter */ + if (start[0] == 0 && end[0] == UV_MAX) { + if (invert) { ret = reganode(pRExC_state, OPFAIL, 0); - *flagp |= HASWIDTH|SIMPLE; + } + else { + ret = reg_node(pRExC_state, SANY); + MARK_NAUGHTY(1); + } + goto not_anyof; + } + + /* Similarly, for /l posix classes, if both a class and its + * complement match, any run-time dependencies don't matter */ + if (posixl) { + for (namedclass = 0; namedclass < ANYOF_POSIXL_MAX; + namedclass += 2) + { + if ( POSIXL_TEST(posixl, namedclass) /* class */ + && POSIXL_TEST(posixl, namedclass + 1)) /* its complement */ + { + if (invert) { + ret = reganode(pRExC_state, OPFAIL, 0); + } + else { + ret = reg_node(pRExC_state, SANY); + MARK_NAUGHTY(1); + } + goto not_anyof; + } + } + /* For well-behaved locales, some classes are subsets of others, + * so complementing the subset and including the non-complemented + * superset should match everything, like [\D[:alnum:]], and + * [[:^alpha:][:alnum:]], but some implementations of locales are + * buggy, and khw thinks its a bad idea to have optimization change + * behavior, even if it avoids an OS bug in a given case */ + +#define isSINGLE_BIT_SET(n) isPOWER_OF_2(n) + + /* If is a single posix /l class, can optimize to just that op. + * Such a node will not match anything in the Latin1 range, as that + * is not determinable until runtime, but will match whatever the + * class does outside that range. (Note that some classes won't + * match anything outside the range, like [:ascii:]) */ + if ( isSINGLE_BIT_SET(posixl) + && (partial_cp_count == 0 || start[0] > 255)) + { + U8 classnum; + SV * class_above_latin1 = NULL; + bool already_inverted; + bool are_equivalent; + + /* Compute which bit is set, which is the same thing as, e.g., + * ANYOF_CNTRL. From + * https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn + * */ + static const int MultiplyDeBruijnBitPosition2[32] = + { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + + namedclass = MultiplyDeBruijnBitPosition2[(posixl + * 0x077CB531U) >> 27]; + classnum = namedclass_to_classnum(namedclass); + + /* The named classes are such that the inverted number is one + * larger than the non-inverted one */ + already_inverted = namedclass + - classnum_to_namedclass(classnum); + + /* Create an inversion list of the official property, inverted + * if the constructed node list is inverted, and restricted to + * only the above latin1 code points, which are the only ones + * known at compile time */ + _invlist_intersection_maybe_complement_2nd( + PL_AboveLatin1, + PL_XPosix_ptrs[classnum], + already_inverted, + &class_above_latin1); + are_equivalent = _invlistEQ(class_above_latin1, cp_list, + FALSE); + SvREFCNT_dec_NN(class_above_latin1); + + if (are_equivalent) { + + /* Resolve the run-time inversion flag with this possibly + * inverted class */ + invert = invert ^ already_inverted; + + ret = reg_node(pRExC_state, + POSIXL + invert * (NPOSIXL - POSIXL)); + FLAGS(REGNODE_p(ret)) = classnum; + goto not_anyof; + } + } + } + + /* khw can't think of any other possible transformation involving + * these. */ + if (has_runtime_dependency & HAS_USER_DEFINED_PROPERTY) { + goto is_anyof; + } + + if (! has_runtime_dependency) { + + /* If the list is empty, nothing matches. This happens, for + * example, when a Unicode property that doesn't match anything is + * the only element in the character class (perluniprops.pod notes + * such properties). */ + if (partial_cp_count == 0) { + if (invert) { + ret = reg_node(pRExC_state, SANY); + } + else { + ret = reganode(pRExC_state, OPFAIL, 0); + } + goto not_anyof; } - if (start == end) { /* The range is a single code point */ - if (! invlist_iternext(cp_list, &start, &end) + /* If matches everything but \n */ + if ( start[0] == 0 && end[0] == '\n' - 1 + && start[1] == '\n' + 1 && end[1] == UV_MAX) + { + assert (! invert); + ret = reg_node(pRExC_state, REG_ANY); + MARK_NAUGHTY(1); + goto not_anyof; + } + } - /* Don't do this optimization if it would require - * changing the pattern to UTF-8 */ - && (start < 256 || UTF)) - { - /* Here, the list contains a single code point. Can - * optimize into an EXACTish node */ + /* Next see if can optimize classes that contain just a few code points + * into an EXACTish node. The reason to do this is to let the + * optimizer join this node with adjacent EXACTish ones. + * + * An EXACTFish node can be generated even if not under /i, and vice + * versa. But care must be taken. An EXACTFish node has to be such + * that it only matches precisely the code points in the class, but we + * want to generate the least restrictive one that does that, to + * increase the odds of being able to join with an adjacent node. For + * example, if the class contains [kK], we have to make it an EXACTFAA + * node to prevent the KELVIN SIGN from matching. Whether we are under + * /i or not is irrelevant in this case. Less obvious is the pattern + * qr/[\x{02BC}]n/i. U+02BC is MODIFIER LETTER APOSTROPHE. That is + * supposed to match the single character U+0149 LATIN SMALL LETTER N + * PRECEDED BY APOSTROPHE. And so even though there is no simple fold + * that includes \X{02BC}, there is a multi-char fold that does, and so + * the node generated for it must be an EXACTFish one. On the other + * hand qr/:/i should generate a plain EXACT node since the colon + * participates in no fold whatsoever, and having it EXACT tells the + * optimizer the target string cannot match unless it has a colon in + * it. + * + * We don't typically generate an EXACTish node if doing so would + * require changing the pattern to UTF-8, as that affects /d and + * otherwise is slower. However, under /i, not changing to UTF-8 can + * miss some potential multi-character folds. We calculate the + * EXACTish node, and then decide if something would be missed if we + * don't upgrade */ + if ( ! posixl + && ! invert + + /* Only try if there are no more code points in the class than + * in the max possible fold */ + && partial_cp_count > 0 && partial_cp_count <= MAX_FOLD_FROMS + 1 + + && (start[0] < 256 || UTF || FOLD)) + { + if (partial_cp_count == 1 && ! upper_latin1_only_utf8_matches) + { + /* We can always make a single code point class into an + * EXACTish node. */ + + if (LOC) { + + /* Here is /l: Use EXACTL, except /li indicates EXACTFL, + * as that means there is a fold not known until runtime so + * shows as only a single code point here. */ + op = (FOLD) ? EXACTFL : EXACTL; + } + else if (! FOLD) { /* Not /l and not /i */ + op = (start[0] < 256) ? EXACT : EXACT_ONLY8; + } + else if (start[0] < 256) { /* /i, not /l, and the code point is + small */ + + /* Under /i, it gets a little tricky. A code point that + * doesn't participate in a fold should be an EXACT node. + * We know this one isn't the result of a simple fold, or + * there'd be more than one code point in the list, but it + * could be part of a multi- character fold. In that case + * we better not create an EXACT node, as we would wrongly + * be telling the optimizer that this code point must be in + * the target string, and that is wrong. This is because + * if the sequence around this code point forms a + * multi-char fold, what needs to be in the string could be + * the code point that folds to the sequence. + * + * This handles the case of below-255 code points, as we + * have an easy look up for those. The next clause handles + * the above-256 one */ + op = IS_IN_SOME_FOLD_L1(start[0]) + ? EXACTFU + : EXACT; + } + else { /* /i, larger code point. Since we are under /i, and + have just this code point, we know that it can't + fold to something else, so PL_InMultiCharFold + applies to it */ + op = _invlist_contains_cp(PL_InMultiCharFold, + start[0]) + ? EXACTFU_ONLY8 + : EXACT_ONLY8; + } + + value = start[0]; + } + else if ( ! (has_runtime_dependency & ~HAS_D_RUNTIME_DEPENDENCY) + && _invlist_contains_cp(PL_in_some_fold, start[0])) + { + /* Here, the only runtime dependency, if any, is from /d, and + * the class matches more than one code point, and the lowest + * code point participates in some fold. It might be that the + * other code points are /i equivalent to this one, and hence + * they would representable by an EXACTFish node. Above, we + * eliminated classes that contain too many code points to be + * EXACTFish, with the test for MAX_FOLD_FROMS + * + * First, special case the ASCII fold pairs, like 'B' and 'b'. + * We do this because we have EXACTFAA at our disposal for the + * ASCII range */ + if (partial_cp_count == 2 && isASCII(start[0])) { + + /* The only ASCII characters that participate in folds are + * alphabetics */ + assert(isALPHA(start[0])); + if ( end[0] == start[0] /* First range is a single + character, so 2nd exists */ + && isALPHA_FOLD_EQ(start[0], start[1])) + { - value = start; + /* Here, is part of an ASCII fold pair */ - if (! FOLD) { - op = (LOC) - ? EXACTL - : EXACT; - } - else if (LOC) { + if ( ASCII_FOLD_RESTRICTED + || HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(start[0])) + { + /* If the second clause just above was true, it + * means we can't be under /i, or else the list + * would have included more than this fold pair. + * Therefore we have to exclude the possibility of + * whatever else it is that folds to these, by + * using EXACTFAA */ + op = EXACTFAA; + } + else if (HAS_NONLATIN1_FOLD_CLOSURE(start[0])) { + + /* Here, there's no simple fold that start[0] is part + * of, but there is a multi-character one. If we + * are not under /i, we want to exclude that + * possibility; if under /i, we want to include it + * */ + op = (FOLD) ? EXACTFU : EXACTFAA; + } + else { + + /* Here, the only possible fold start[0] particpates in + * is with start[1]. /i or not isn't relevant */ + op = EXACTFU; + } - /* A locale node under folding with one code point can - * be an EXACTFL, as its fold won't be calculated until - * runtime */ - op = EXACTFL; + value = toFOLD(start[0]); + } + } + else if ( ! upper_latin1_only_utf8_matches + || ( _invlist_len(upper_latin1_only_utf8_matches) + == 2 + && PL_fold_latin1[ + invlist_highest(upper_latin1_only_utf8_matches)] + == start[0])) + { + /* Here, the smallest character is non-ascii or there are + * more than 2 code points matched by this node. Also, we + * either don't have /d UTF-8 dependent matches, or if we + * do, they look like they could be a single character that + * is the fold of the lowest one in the always-match list. + * This test quickly excludes most of the false positives + * when there are /d UTF-8 depdendent matches. These are + * like LATIN CAPITAL LETTER A WITH GRAVE matching LATIN + * SMALL LETTER A WITH GRAVE iff the target string is + * UTF-8. (We don't have to worry above about exceeding + * the array bounds of PL_fold_latin1[] because any code + * point in 'upper_latin1_only_utf8_matches' is below 256.) + * + * EXACTFAA would apply only to pairs (hence exactly 2 code + * points) in the ASCII range, so we can't use it here to + * artificially restrict the fold domain, so we check if + * the class does or does not match some EXACTFish node. + * Further, if we aren't under /i, and and the folded-to + * character is part of a multi-character fold, we can't do + * this optimization, as the sequence around it could be + * that multi-character fold, and we don't here know the + * context, so we have to assume it is that multi-char + * fold, to prevent potential bugs. + * + * To do the general case, we first find the fold of the + * lowest code point (which may be higher than the lowest + * one), then find everything that folds to it. (The data + * structure we have only maps from the folded code points, + * so we have to do the earlier step.) */ + + Size_t foldlen; + U8 foldbuf[UTF8_MAXBYTES_CASE]; + UV folded = _to_uni_fold_flags(start[0], + foldbuf, &foldlen, 0); + unsigned int first_fold; + const unsigned int * remaining_folds; + Size_t folds_to_this_cp_count = _inverse_folds( + folded, + &first_fold, + &remaining_folds); + Size_t folds_count = folds_to_this_cp_count + 1; + SV * fold_list = _new_invlist(folds_count); + unsigned int i; + + /* If there are UTF-8 dependent matches, create a temporary + * list of what this node matches, including them. */ + SV * all_cp_list = NULL; + SV ** use_this_list = &cp_list; + + if (upper_latin1_only_utf8_matches) { + all_cp_list = _new_invlist(0); + use_this_list = &all_cp_list; + _invlist_union(cp_list, + upper_latin1_only_utf8_matches, + use_this_list); } - else { - /* Here, we are generally folding, but there is only - * one code point to match. If we have to, we use an - * EXACT node, but it would be better for joining with - * adjacent nodes in the optimization phase if we used - * the same EXACTFish node that any such are likely to - * be. We can do this iff the code point doesn't - * participate in any folds. For example, an EXACTF of - * a colon is the same as an EXACT one, since nothing - * folds to or from a colon. */ - if (value < 256) { - if (IS_IN_SOME_FOLD_L1(value)) { - op = EXACT; - } + /* Having gotten everything that participates in the fold + * containing the lowest code point, we turn that into an + * inversion list, making sure everything is included. */ + fold_list = add_cp_to_invlist(fold_list, start[0]); + fold_list = add_cp_to_invlist(fold_list, folded); + if (folds_to_this_cp_count > 0) { + fold_list = add_cp_to_invlist(fold_list, first_fold); + for (i = 0; i + 1 < folds_to_this_cp_count; i++) { + fold_list = add_cp_to_invlist(fold_list, + remaining_folds[i]); } - else { - if (_invlist_contains_cp(PL_in_some_fold, value)) { - op = EXACT; + } + + /* If the fold list is identical to what's in this ANYOF + * node, the node can be represented by an EXACTFish one + * instead */ + if (_invlistEQ(*use_this_list, fold_list, + 0 /* Don't complement */ ) + ) { + + /* But, we have to be careful, as mentioned above. + * Just the right sequence of characters could match + * this if it is part of a multi-character fold. That + * IS what we want if we are under /i. But it ISN'T + * what we want if not under /i, as it could match when + * it shouldn't. So, when we aren't under /i and this + * character participates in a multi-char fold, we + * don't optimize into an EXACTFish node. So, for each + * case below we have to check if we are folding + * and if not, if it is not part of a multi-char fold. + * */ + if (start[0] > 255) { /* Highish code point */ + if (FOLD || ! _invlist_contains_cp( + PL_InMultiCharFold, folded)) + { + op = (LOC) + ? EXACTFLU8 + : (ASCII_FOLD_RESTRICTED) + ? EXACTFAA + : EXACTFU_ONLY8; + value = folded; } + } /* Below, the lowest code point < 256 */ + else if ( FOLD + && folded == 's' + && DEPENDS_SEMANTICS) + { /* An EXACTF node containing a single character + 's', can be an EXACTFU if it doesn't get + joined with an adjacent 's' */ + op = EXACTFU_S_EDGE; + value = folded; } + else if ( FOLD + || ! HAS_NONLATIN1_FOLD_CLOSURE(start[0])) + { + if (upper_latin1_only_utf8_matches) { + op = EXACTF; - /* If we haven't found the node type, above, it means - * we can use the prevailing one */ - if (op == END) { - op = compute_EXACTish(pRExC_state); + /* We can't use the fold, as that only matches + * under UTF-8 */ + value = start[0]; + } + else if ( UNLIKELY(start[0] == MICRO_SIGN) + && ! UTF) + { /* EXACTFUP is a special node for this + character */ + op = (ASCII_FOLD_RESTRICTED) + ? EXACTFAA + : EXACTFUP; + value = MICRO_SIGN; + } + else if ( ASCII_FOLD_RESTRICTED + && ! isASCII(start[0])) + { /* For ASCII under /iaa, we can use EXACTFU + below */ + op = EXACTFAA; + value = folded; + } + else { + op = EXACTFU; + value = folded; + } } } - } - } /* End of first range contains just a single code point */ - else if (start == 0) { - if (end == UV_MAX) { - op = SANY; - *flagp |= HASWIDTH|SIMPLE; - MARK_NAUGHTY(1); - } - else if (end == '\n' - 1 - && invlist_iternext(cp_list, &start, &end) - && start == '\n' + 1 && end == UV_MAX) - { - op = REG_ANY; - *flagp |= HASWIDTH|SIMPLE; - MARK_NAUGHTY(1); + + SvREFCNT_dec_NN(fold_list); + SvREFCNT_dec(all_cp_list); } } - invlist_iterfinish(cp_list); if (op != END) { - ret = reg_node(pRExC_state, op); - if (PL_regkind[op] == EXACT) { - alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value, - TRUE /* downgradable to EXACT */ - ); - } - goto not_anyof; - } - /* Here, didn't find an optimization. See if this matches any - * of the POSIX classes. First try ASCII */ + /* Here, we have calculated what EXACTish node we would use. + * But we don't use it if it would require converting the + * pattern to UTF-8, unless not using it could cause us to miss + * some folds (hence be buggy) */ - if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 0)) { - ret = reg_node(pRExC_state, ASCII); - *flagp |= HASWIDTH|SIMPLE; - goto not_anyof; + if (! UTF && value > 255) { + SV * in_multis = NULL; + + assert(FOLD); + + /* If there is no code point that is part of a multi-char + * fold, then there aren't any matches, so we don't do this + * optimization. Otherwise, it could match depending on + * the context around us, so we do upgrade */ + _invlist_intersection(PL_InMultiCharFold, cp_list, &in_multis); + if (UNLIKELY(_invlist_len(in_multis) != 0)) { + REQUIRE_UTF8(flagp); + } + else { + op = END; + } } - if (_invlistEQ(cp_list, PL_XPosix_ptrs[_CC_ASCII], 1)) { - ret = reg_node(pRExC_state, NASCII); - *flagp |= HASWIDTH|SIMPLE; + if (op != END) { + U8 len = (UTF) ? UVCHR_SKIP(value) : 1; + + ret = regnode_guts(pRExC_state, op, len, "exact"); + FILL_NODE(ret, op); + RExC_emit += 1 + STR_SZ(len); + STR_LEN(REGNODE_p(ret)) = len; + if (len == 1) { + *STRING(REGNODE_p(ret)) = value; + } + else { + uvchr_to_utf8((U8 *) STRING(REGNODE_p(ret)), value); + } goto not_anyof; } + } + } - /* Then try the other POSIX classes. The POSIXA ones are - * about the same speed as ANYOF ops, but take less room; - * the ones that have above-Latin1 code point matches are - * somewhat faster than ANYOF. */ + if (! has_runtime_dependency) { + + /* See if this can be turned into an ANYOFM node. Think about the + * bit patterns in two different bytes. In some positions, the + * bits in each will be 1; and in other positions both will be 0; + * and in some positions the bit will be 1 in one byte, and 0 in + * the other. Let 'n' be the number of positions where the bits + * differ. We create a mask which has exactly 'n' 0 bits, each in + * a position where the two bytes differ. Now take the set of all + * bytes that when ANDed with the mask yield the same result. That + * set has 2**n elements, and is representable by just two 8 bit + * numbers: the result and the mask. Importantly, matching the set + * can be vectorized by creating a word full of the result bytes, + * and a word full of the mask bytes, yielding a significant speed + * up. Here, see if this node matches such a set. As a concrete + * example consider [01], and the byte representing '0' which is + * 0x30 on ASCII machines. It has the bits 0011 0000. Take the + * mask 1111 1110. If we AND 0x31 and 0x30 with that mask we get + * 0x30. Any other bytes ANDed yield something else. So [01], + * which is a common usage, is optimizable into ANYOFM, and can + * benefit from the speed up. We can only do this on UTF-8 + * invariant bytes, because they have the same bit patterns under + * UTF-8 as not. */ + PERL_UINT_FAST8_T inverted = 0; +#ifdef EBCDIC + const PERL_UINT_FAST8_T max_permissible = 0xFF; +#else + const PERL_UINT_FAST8_T max_permissible = 0x7F; +#endif + /* If doesn't fit the criteria for ANYOFM, invert and try again. + * If that works we will instead later generate an NANYOFM, and + * invert back when through */ + if (invlist_highest(cp_list) > max_permissible) { + _invlist_invert(cp_list); + inverted = 1; + } - for (posix_class = 0; - posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; - posix_class++) - { - int try_inverted; + if (invlist_highest(cp_list) <= max_permissible) { + UV this_start, this_end; + UV lowest_cp = UV_MAX; /* inited to suppress compiler warn */ + U8 bits_differing = 0; + Size_t full_cp_count = 0; + bool first_time = TRUE; - for (try_inverted = 0; try_inverted < 2; try_inverted++) - { + /* Go through the bytes and find the bit positions that differ + * */ + invlist_iterinit(cp_list); + while (invlist_iternext(cp_list, &this_start, &this_end)) { + unsigned int i = this_start; - /* Check if matches POSIXA, normal or inverted */ - if (PL_Posix_ptrs[posix_class]) { - if (_invlistEQ(cp_list, - PL_Posix_ptrs[posix_class], - try_inverted)) - { - ret = reg_node(pRExC_state, (try_inverted) - ? NPOSIXA - : POSIXA); - FLAGS(REGNODE_p(ret)) = posix_class; - *flagp |= HASWIDTH|SIMPLE; - goto not_anyof; - } - } + if (first_time) { + if (! UVCHR_IS_INVARIANT(i)) { + goto done_anyofm; + } - /* Check if matches POSIXU, normal or inverted */ - if (_invlistEQ(cp_list, - PL_XPosix_ptrs[posix_class], - try_inverted)) - { - ret = reg_node(pRExC_state, (try_inverted) - ? NPOSIXU - : POSIXU); + first_time = FALSE; + lowest_cp = this_start; - FLAGS(REGNODE_p(ret)) = posix_class; - *flagp |= HASWIDTH|SIMPLE; - goto not_anyof; - } + /* We have set up the code point to compare with. + * Don't compare it with itself */ + i++; + } + + /* Find the bit positions that differ from the lowest code + * point in the node. Keep track of all such positions by + * OR'ing */ + for (; i <= this_end; i++) { + if (! UVCHR_IS_INVARIANT(i)) { + goto done_anyofm; } + + bits_differing |= i ^ lowest_cp; } - /* If it didn't match a POSIX class, it might be able to be - * turned into an ANYOFM node. Compare two different bytes, - * bit-by-bit. In some positions, the bits in each will be 1; - * and in other positions both will be 0; and in some positions - * the bit will be 1 in one byte, and 0 in the other. Let 'n' - * be the number of positions where the bits differ. We create - * a mask which has exactly 'n' 0 bits, each in a position - * where the two bytes differ. Now take the set of all bytes - * that when ANDed with the mask yield the same result. That - * set has 2**n elements, and is representable by just two 8 - * bit numbers: the result and the mask. Importantly, matching - * the set can be vectorized by creating a word full of the - * result bytes, and a word full of the mask bytes, yielding a - * significant speed up. Here, see if this node matches such a - * set. As a concrete example consider [01], and the byte - * representing '0' which is 0x30 on ASCII machines. It has - * the bits 0011 0000. Take the mask 1111 1110. If we AND - * 0x31 and 0x30 with that mask we get 0x30. Any other bytes - * ANDed yield something else. So [01], which is a common - * usage, is optimizable into ANYOFM, and can benefit from the - * speed up. We can only do this on UTF-8 invariant bytes, - * because the variance would throw this off. */ + full_cp_count += this_end - this_start + 1; + } + invlist_iterfinish(cp_list); + + /* At the end of the loop, we count how many bits differ from + * the bits in lowest code point, call the count 'd'. If the + * set we found contains 2**d elements, it is the closure of + * all code points that differ only in those bit positions. To + * convince yourself of that, first note that the number in the + * closure must be a power of 2, which we test for. The only + * way we could have that count and it be some differing set, + * is if we got some code points that don't differ from the + * lowest code point in any position, but do differ from each + * other in some other position. That means one code point has + * a 1 in that position, and another has a 0. But that would + * mean that one of them differs from the lowest code point in + * that position, which possibility we've already excluded. */ + if ( (inverted || full_cp_count > 1) + && full_cp_count == 1U << PL_bitcount[bits_differing]) { - PERL_UINT_FAST8_T inverted = 0; -#ifdef EBCDIC - const PERL_UINT_FAST8_T max_permissible = 0xFF; -#else - const PERL_UINT_FAST8_T max_permissible = 0x7F; -#endif - if (invlist_highest(cp_list) > max_permissible) { - _invlist_invert(cp_list); - inverted = 1; - } + U8 ANYOFM_mask; + + op = ANYOFM + inverted;; + + /* We need to make the bits that differ be 0's */ + ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS */ + + /* The argument is the lowest code point */ + ret = reganode(pRExC_state, op, lowest_cp); + FLAGS(REGNODE_p(ret)) = ANYOFM_mask; + } + } + done_anyofm: + + if (inverted) { + _invlist_invert(cp_list); + } - if (invlist_highest(cp_list) <= max_permissible) { - Size_t cp_count = 0; - bool first_time = TRUE; - unsigned int lowest_cp = 0xFF; - U8 bits_differing = 0; + if (op != END) { + goto not_anyof; + } + } + + if (! (anyof_flags & ANYOF_LOCALE_FLAGS)) { + PERL_UINT_FAST8_T type; + SV * intersection = NULL; + SV* d_invlist = NULL; + + /* See if this matches any of the POSIX classes. The POSIXA and + * POSIXD ones are about the same speed as ANYOF ops, but take less + * room; the ones that have above-Latin1 code point matches are + * somewhat faster than ANYOF. */ + + for (type = POSIXA; type >= POSIXD; type--) { + int posix_class; + + if (type == POSIXL) { /* But not /l posix classes */ + continue; + } + + for (posix_class = 0; + posix_class <= _HIGHEST_REGCOMP_DOT_H_SYNC; + posix_class++) + { + SV** our_code_points = &cp_list; + SV** official_code_points; + int try_inverted; + + if (type == POSIXA) { + official_code_points = &PL_Posix_ptrs[posix_class]; + } + else { + official_code_points = &PL_XPosix_ptrs[posix_class]; + } - /* Only needed on EBCDIC, as there, variants and non- are mixed - * together. Could #ifdef it out on ASCII, but probably the - * compiler will optimize it out */ - bool has_variant = FALSE; + /* Skip non-existent classes of this type. e.g. \v only + * has an entry in PL_XPosix_ptrs */ + if (! *official_code_points) { + continue; + } - /* Go through the bytes and find the bit positions that differ */ - invlist_iterinit(cp_list); - while (invlist_iternext(cp_list, &start, &end)) { - unsigned int i = start; + /* Try both the regular class, and its inversion */ + for (try_inverted = 0; try_inverted < 2; try_inverted++) { + bool this_inverted = invert ^ try_inverted; - cp_count += end - start + 1; + if (type != POSIXD) { - if (first_time) { - if (! UVCHR_IS_INVARIANT(i)) { - has_variant = TRUE; + /* This class that isn't /d can't match if we have + * /d dependencies */ + if (has_runtime_dependency + & HAS_D_RUNTIME_DEPENDENCY) + { continue; } + } + else /* is /d */ if (! this_inverted) { - first_time = FALSE; - lowest_cp = start; + /* /d classes don't match anything non-ASCII below + * 256 unconditionally (which cp_list contains) */ + _invlist_intersection(cp_list, PL_UpperLatin1, + &intersection); + if (_invlist_len(intersection) != 0) { + continue; + } - i++; + SvREFCNT_dec(d_invlist); + d_invlist = invlist_clone(cp_list, NULL); + + /* But under UTF-8 it turns into using /u rules. + * Add the things it matches under these conditions + * so that we check below that these are identical + * to what the tested class should match */ + if (upper_latin1_only_utf8_matches) { + _invlist_union( + d_invlist, + upper_latin1_only_utf8_matches, + &d_invlist); + } + our_code_points = &d_invlist; } - - /* Find the bit positions that differ from the lowest - * code point in the node. Keep track of all such - * positions by OR'ing */ - for (; i <= end; i++) { - if (! UVCHR_IS_INVARIANT(i)) { - has_variant = TRUE; + else { /* POSIXD, inverted. If this doesn't have this + flag set, it isn't /d. */ + if (! (anyof_flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) + { continue; } + our_code_points = &cp_list; + } - bits_differing |= i ^ lowest_cp; + /* Here, have weeded out some things. We want to see + * if the list of characters this node contains + * ('*our_code_points') precisely matches those of the + * class we are currently checking against + * ('*official_code_points'). */ + if (_invlistEQ(*our_code_points, + *official_code_points, + try_inverted)) + { + /* Here, they precisely match. Optimize this ANYOF + * node into its equivalent POSIX one of the + * correct type, possibly inverted */ + ret = reg_node(pRExC_state, (try_inverted) + ? type + NPOSIXA + - POSIXA + : type); + FLAGS(REGNODE_p(ret)) = posix_class; + SvREFCNT_dec(d_invlist); + SvREFCNT_dec(intersection); + goto not_anyof; } } - invlist_iterfinish(cp_list); - - /* At the end of the loop, we count how many bits differ - * from the bits in lowest code point, call the count 'd'. - * If the set we found contains 2**d elements, it is the - * closure of all code points that differ only in those bit - * positions. To convince yourself of that, first note - * that the number in the closure must be a power of 2, - * which we test for. The only way we could have that - * count and it be some differing set, is if we got some - * code points that don't differ from the lowest code point - * in any position, but do differ from each other in some - * other position. That means one code point has a 1 in - * that position, and another has a 0. But that would mean - * that one of them differs from the lowest code point in - * that position, which possibility we've already excluded. - * */ - if ( ! has_variant - && cp_count == 1U << PL_bitcount[bits_differing]) - { - assert(inverted || cp_count > 1); - op = ANYOFM + inverted;; + } + } + SvREFCNT_dec(d_invlist); + SvREFCNT_dec(intersection); + } - /* We need to make the bits that differ be 0's */ - ANYOFM_mask = ~ bits_differing; /* This goes into FLAGS - */ + /* If didn't find an optimization and there is no need for a + * bitmap, optimize to indicate that */ + if ( start[0] >= NUM_ANYOF_CODE_POINTS + && ! LOC + && ! upper_latin1_only_utf8_matches + && anyof_flags == 0) + { + UV highest_cp = invlist_highest(cp_list); + + /* If the lowest and highest code point in the class have the same + * UTF-8 first byte, then all do, and we can store that byte for + * regexec.c to use so that it can more quickly scan the target + * string for potential matches for this class. We co-opt the the + * flags field for this. Zero means, they don't have the same + * first byte. We do accept here very large code points (for + * future use), but don't bother with this optimization for them, + * as it would cause other complications */ + if (highest_cp > IV_MAX) { + anyof_flags = 0; + } + else { + U8 low_utf8[UTF8_MAXBYTES+1]; + U8 high_utf8[UTF8_MAXBYTES+1]; - /* The argument is the lowest code point */ - ret = reganode(pRExC_state, op, lowest_cp); - FLAGS(REGNODE_p(ret)) = ANYOFM_mask; + (void) uvchr_to_utf8(low_utf8, start[0]); + (void) uvchr_to_utf8(high_utf8, invlist_highest(cp_list)); - *flagp |= HASWIDTH|SIMPLE; - } - } - if (inverted) { - _invlist_invert(cp_list); - } - if (op != END) { - goto not_anyof; - } + anyof_flags = (low_utf8[0] == high_utf8[0]) + ? low_utf8[0] + : 0; } + + op = ANYOFH; } } /* End of seeing if can optimize it into a different node */ - /* It's going to be an ANYOF node. */ - op = (use_anyofd) - ? ANYOFD - : ((posixl) - ? ANYOFPOSIXL - : ((LOC) - ? ANYOFL - : ANYOF)); + is_anyof: /* It's going to be an ANYOF node. */ + if (op != ANYOFH) { + op = (has_runtime_dependency & HAS_D_RUNTIME_DEPENDENCY) + ? ANYOFD + : ((posixl) + ? ANYOFPOSIXL + : ((LOC) + ? ANYOFL + : ANYOF)); + } + ret = regnode_guts(pRExC_state, op, regarglen[op], "anyof"); FILL_NODE(ret, op); /* We set the argument later */ RExC_emit += 1 + regarglen[op]; @@ -18673,29 +19067,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, ANYOF_FLAGS(REGNODE_p(ret)) |= ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP; } - /* If there is a swash and more than one element, we can't use the swash in - * the optimization below. */ - if (swash && element_count > 1) { - SvREFCNT_dec_NN(swash); - swash = NULL; - } - - /* Note that the optimization of using 'swash' if it is the only thing in - * the class doesn't have us change swash at all, so it can include things - * that are also in the bitmap; otherwise we have purposely deleted that - * duplicate information */ set_ANYOF_arg(pRExC_state, REGNODE_p(ret), cp_list, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION) ? listsv : NULL, - only_utf8_locale_list, - swash, has_user_defined_property); - - *flagp |= HASWIDTH|SIMPLE; - - if (ANYOF_FLAGS(REGNODE_p(ret)) & ANYOF_LOCALE_FLAGS) { - RExC_contains_locale = 1; - } - + only_utf8_locale_list); return ret; not_anyof: @@ -18705,7 +19080,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, Set_Node_Offset_Length(REGNODE_p(ret), orig_parse - RExC_start, RExC_parse - orig_parse);; - SvREFCNT_dec_NN(cp_list);; + SvREFCNT_dec(cp_list);; return ret; } @@ -18716,31 +19091,21 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, regnode* const node, SV* const cp_list, SV* const runtime_defns, - SV* const only_utf8_locale_list, - SV* const swash, - const bool has_user_defined_property) + SV* const only_utf8_locale_list) { /* Sets the arg field of an ANYOF-type node 'node', using information about * the node passed-in. If there is nothing outside the node's bitmap, the * arg is set to ANYOF_ONLY_HAS_BITMAP. Otherwise, it sets the argument to * the count returned by add_data(), having allocated and stored an array, - * av, that that count references, as follows: - * av[0] stores the character class description in its textual form. - * This is used later (regexec.c:Perl_regclass_swash()) to - * initialize the appropriate swash, and is also useful for dumping - * the regnode. This is set to &PL_sv_undef if the textual - * description is not needed at run-time (as happens if the other - * elements completely define the class) - * av[1] if &PL_sv_undef, is a placeholder to later contain the swash - * computed from av[0]. But if no further computation need be done, - * the swash is stored here now (and av[0] is &PL_sv_undef). - * av[2] stores the inversion list of code points that match only if the - * current locale is UTF-8 - * av[3] stores the cp_list inversion list for use in addition or instead - * of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef. - * (Otherwise everything needed is already in av[0] and av[1]) - * av[4] is set if any component of the class is from a user-defined - * property; used only if av[3] exists */ + * av, as follows: + * + * av[0] stores the inversion list defining this class as far as known at + * this time, or PL_sv_undef if nothing definite is now known. + * av[1] stores the inversion list of code points that match only if the + * current locale is UTF-8, or if none, PL_sv_undef if there is an + * av[2], or no entry otherwise. + * av[2] stores the list of user-defined properties whose subroutine + * definitions aren't known at this time, or no entry if none. */ UV n; @@ -18755,26 +19120,16 @@ S_set_ANYOF_arg(pTHX_ RExC_state_t* const pRExC_state, AV * const av = newAV(); SV *rv; - av_store(av, 0, (runtime_defns) - ? SvREFCNT_inc(runtime_defns) : &PL_sv_undef); - if (swash) { - assert(cp_list); - av_store(av, 1, swash); - SvREFCNT_dec_NN(cp_list); - } - else { - av_store(av, 1, &PL_sv_undef); - if (cp_list) { - av_store(av, 3, cp_list); - av_store(av, 4, newSVuv(has_user_defined_property)); - } - } + if (cp_list) { + av_store(av, INVLIST_INDEX, cp_list); + } if (only_utf8_locale_list) { - av_store(av, 2, only_utf8_locale_list); + av_store(av, ONLY_LOCALE_MATCHES_INDEX, only_utf8_locale_list); } - else { - av_store(av, 2, &PL_sv_undef); + + if (runtime_defns) { + av_store(av, DEFERRED_USER_DEFINED_INDEX, SvREFCNT_inc(runtime_defns)); } rv = newRV_noinc(MUTABLE_SV(av)); @@ -18795,14 +19150,15 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, { /* For internal core use only. - * Returns the swash for the input 'node' in the regex 'prog'. - * If is 'true', will attempt to create the swash if not already - * done. + * Returns the inversion list for the input 'node' in the regex 'prog'. + * If is 'true', will attempt to create the inversion list if not + * already done. * If is non-null, will return the printable contents of the - * swash. This can be used to get debugging information even before the - * swash exists, by calling this function with 'doinit' set to false, in - * which case the components that will be used to eventually create the - * swash are returned (in a printable form). + * property definition. This can be used to get debugging information + * even before the inversion list exists, by calling this function with + * 'doinit' set to false, in which case the components that will be used + * to eventually create the inversion list are returned (in a printable + * form). * If is not NULL, it is where this routine is to * store an inversion list of code points that should match only if the * execution-time locale is a UTF-8 one. @@ -18810,18 +19166,17 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, * inversion list of the code points that would be instead returned in * if this were NULL. Thus, what gets output in * when this parameter is used, is just the non-code point data that - * will go into creating the swash. This currently should be just + * will go into creating the inversion list. This currently should be just * user-defined properties whose definitions were not known at compile * time. Using this parameter allows for easier manipulation of the - * swash's data by the caller. It is illegal to call this function with - * this parameter set, but not + * inversion list's data by the caller. It is illegal to call this + * function with this parameter set, but not * * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note - * that, in spite of this function's name, the swash it returns may include - * the bitmap data as well */ + * that, in spite of this function's name, the inversion list it returns + * may include the bitmap data as well */ - SV *sw = NULL; - SV *si = NULL; /* Input swash initialization string */ + SV *si = NULL; /* Input initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog, progi); @@ -18837,69 +19192,73 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, SV * const rv = MUTABLE_SV(data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); - U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - si = *ary; /* ary[0] = the string to initialize the swash with */ + invlist = ary[INVLIST_INDEX]; - if (av_tindex_skip_len_mg(av) >= 2) { - if (only_utf8_locale_ptr - && ary[2] - && ary[2] != &PL_sv_undef) - { - *only_utf8_locale_ptr = ary[2]; - } - else { - assert(only_utf8_locale_ptr); - *only_utf8_locale_ptr = NULL; - } - - /* Elements 3 and 4 are either both present or both absent. [3] - * is any inversion list generated at compile time; [4] - * indicates if that inversion list has any user-defined - * properties in it. */ - if (av_tindex_skip_len_mg(av) >= 3) { - invlist = ary[3]; - if (SvUV(ary[4])) { - swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + if (av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX) { + *only_utf8_locale_ptr = ary[ONLY_LOCALE_MATCHES_INDEX]; + } + + if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) { + si = ary[DEFERRED_USER_DEFINED_INDEX]; + } + + if (doinit && (si || invlist)) { + if (si) { + bool user_defined; + SV * msg = newSVpvs_flags("", SVs_TEMP); + + SV * prop_definition = handle_user_defined_property( + "", 0, FALSE, /* There is no \p{}, \P{} */ + SvPVX_const(si)[1] - '0', /* /i or not has been + stored here for just + this occasion */ + TRUE, /* run time */ + FALSE, /* This call must find the defn */ + si, /* The property definition */ + &user_defined, + msg, + 0 /* base level call */ + ); + + if (SvCUR(msg)) { + assert(prop_definition == NULL); + + Perl_croak(aTHX_ "%" UTF8f, + UTF8fARG(SvUTF8(msg), SvCUR(msg), SvPVX(msg))); } - } - else { - invlist = NULL; - } - } - /* Element [1] is reserved for the set-up swash. If already there, - * return it; if not, create it and store it there */ - if (ary[1] && SvROK(ary[1])) { - sw = ary[1]; - } - else if (doinit && ((si && si != &PL_sv_undef) - || (invlist && invlist != &PL_sv_undef))) { - assert(si); - sw = _core_swash_init("utf8", /* the utf8 package */ - "", /* nameless */ - si, - 1, /* binary */ - 0, /* not from tr/// */ - invlist, - &swash_init_flags); - (void)av_store(av, 1, sw); + if (invlist) { + _invlist_union(invlist, prop_definition, &invlist); + SvREFCNT_dec_NN(prop_definition); + } + else { + invlist = prop_definition; + } + + STATIC_ASSERT_STMT(ONLY_LOCALE_MATCHES_INDEX == 1 + INVLIST_INDEX); + STATIC_ASSERT_STMT(DEFERRED_USER_DEFINED_INDEX == 1 + ONLY_LOCALE_MATCHES_INDEX); + + av_store(av, INVLIST_INDEX, invlist); + av_fill(av, (ary[ONLY_LOCALE_MATCHES_INDEX]) + ? ONLY_LOCALE_MATCHES_INDEX: + INVLIST_INDEX); + si = NULL; + } } } } - /* If requested, return a printable version of what this swash matches */ + /* If requested, return a printable version of what this ANYOF node matches + * */ if (listsvp) { SV* matches_string = NULL; - /* The swash should be used, if possible, to get the data, as it - * contains the resolved data. But this function can be called at - * compile-time, before everything gets resolved, in which case we - * return the currently best available information, which is the string - * that will eventually be used to do that resolving, 'si' */ - if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) - && (si && si != &PL_sv_undef)) - { + /* This function can be called at compile-time, before everything gets + * resolved, in which case we return the currently best available + * information, which is the string that will eventually be used to do + * that resolving, 'si' */ + if (si) { /* Here, we only have 'si' (and possibly some passed-in data in * 'invlist', which is handled below) If the caller only wants * 'si', use that. */ @@ -18992,12 +19351,10 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, if (SvCUR(matches_string)) { /* Get rid of trailing blank */ SvCUR_set(matches_string, SvCUR(matches_string) - 1); } - } /* end of has an 'si' but no swash */ + } /* end of has an 'si' */ } - /* If we have a swash in place, its equivalent inversion list was above - * placed into 'invlist'. If not, this variable may contain a stored - * inversion list which is information beyond what is in 'si' */ + /* Add the stuff that's already known */ if (invlist) { /* Again, if the caller doesn't want the output inversion list, put @@ -19021,7 +19378,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, *listsvp = matches_string; } - return sw; + return invlist; } #endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */ @@ -19134,7 +19491,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) || UTF8_IS_INVARIANT(*RExC_parse) || UTF8_IS_START(*RExC_parse)); - RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; + RExC_parse += (UTF) + ? UTF8_SAFE_SKIP(RExC_parse, RExC_end) + : 1; skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force /x */ ); @@ -19144,6 +19503,9 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) STATIC void S_change_engine_size(pTHX_ RExC_state_t *pRExC_state, const Ptrdiff_t size) { + /* 'size' is the delta to add or subtract from the current memory allocated + * to the regex engine being constructed */ + PERL_ARGS_ASSERT_CHANGE_ENGINE_SIZE; RExC_size += size; @@ -19301,7 +19663,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, src = REGNODE_p(RExC_emit); RExC_emit += size; dst = REGNODE_p(RExC_emit); - if (RExC_open_parens) { + + /* If we are in a "count the parentheses" pass, the numbers are unreliable, + * and [perl #133871] shows this can lead to problems, so skip this + * realignment of parens until a later pass when they are reliable */ + if (! IN_PARENS_PASS && RExC_open_parens) { int paren; /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ /* remember that RExC_npar is rex->nparens + 1, @@ -19374,10 +19740,13 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, const U8 op, } /* -- regtail - set the next-pointer at the end of a node chain of p to val. +- regtail - set the next-pointer at the end of a node chain of p to val. If + that value won't fit in the space available, instead returns FALSE. + (Except asserts if we can't fit in the largest space the regex + engine is designed for.) - SEE ALSO: regtail_study */ -STATIC void +STATIC bool S_regtail(pTHX_ RExC_state_t * pRExC_state, const regnode_offset p, const regnode_offset val, @@ -19399,7 +19768,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, DEBUG_PARSE_MSG((scan==p ? "tail" : "")); regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n", - SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(REGNODE_p(scan)), + SvPV_nolen_const(RExC_mysv), scan, (temp == NULL ? "->" : ""), (temp == NULL ? PL_reg_name[OP(REGNODE_p(val))] : "") ); @@ -19410,11 +19779,21 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, } if (reg_off_by_arg[OP(REGNODE_p(scan))]) { + assert(val - scan <= U32_MAX); ARG_SET(REGNODE_p(scan), val - scan); } else { + if (val - scan > U16_MAX) { + /* Since not all callers check the return value, populate this with + * something that won't loop and will likely lead to a crash if + * execution continues */ + NEXT_OFF(REGNODE_p(scan)) = U16_MAX; + return FALSE; + } NEXT_OFF(REGNODE_p(scan)) = val - scan; } + + return TRUE; } #ifdef DEBUGGING @@ -19431,10 +19810,14 @@ that it is purely analytical. Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used to control which is which. +This used to return a value that was ignored. It was a problem that it is +#ifdef'd to be another function that didn't return a value. khw has changed it +so both currently return a pass/fail return. + */ /* TODO: All four parms should be const */ -STATIC U8 +STATIC bool S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, const regnode_offset val, U32 depth) { @@ -19458,7 +19841,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, bool unfolded_multi_char; /* Unexamined in this routine */ if (join_exact(pRExC_state, scan, &min, &unfolded_multi_char, 1, REGNODE_p(val), depth+1)) - return EXACT; + return TRUE; /* Was return EXACT */ } #endif if ( exact ) { @@ -19490,7 +19873,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, regprop(RExC_rx, RExC_mysv, REGNODE_p(scan), NULL, pRExC_state); Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n", SvPV_nolen_const(RExC_mysv), - REG_NODE_NUM(REGNODE_p(scan)), + scan, PL_reg_name[exact]); }); if (temp == NULL) @@ -19503,18 +19886,23 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode_offset p, Perl_re_printf( aTHX_ "~ attach to %s (%" IVdf ") offset to %" IVdf "\n", SvPV_nolen_const(RExC_mysv), - (IV)REG_NODE_NUM(REGNODE_p(val)), + (IV)val, (IV)(val - scan) ); }); if (reg_off_by_arg[OP(REGNODE_p(scan))]) { + assert(val - scan <= U32_MAX); ARG_SET(REGNODE_p(scan), val - scan); } else { + if (val - scan > U16_MAX) { + NEXT_OFF(REGNODE_p(scan)) = U16_MAX; + return FALSE; + } NEXT_OFF(REGNODE_p(scan)) = val - scan; } - return exact; + return TRUE; /* Was 'return exact' */ } #endif @@ -19776,6 +20164,7 @@ void Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo, const RExC_state_t *pRExC_state) { #ifdef DEBUGGING + dVAR; int k; RXi_GET_DECL(prog, progi); GET_RE_DEBUG_FLAGS_DECL; @@ -19925,7 +20314,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* 2: embedded, otherwise 1 */ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { - const U8 flags = ANYOF_FLAGS(o); + const U8 flags = (OP(o) == ANYOFH) ? 0 : ANYOF_FLAGS(o); bool do_sep = FALSE; /* Do we need to separate various components of the output? */ /* Set if there is still an unresolved user-defined property */ @@ -19981,42 +20370,46 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* Ready to start outputting. First, the initial left bracket */ Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - /* Then all the things that could fit in the bitmap */ - do_sep = put_charclass_bitmap_innards(sv, - ANYOF_BITMAP(o), - bitmap_range_not_in_bitmap, - only_utf8_locale_invlist, - o, - - /* Can't try inverting for a - * better display if there are - * things that haven't been - * resolved */ - unresolved != NULL); - SvREFCNT_dec(bitmap_range_not_in_bitmap); - - /* If there are user-defined properties which haven't been defined yet, - * output them. If the result is not to be inverted, it is clearest to - * output them in a separate [] from the bitmap range stuff. If the - * result is to be complemented, we have to show everything in one [], - * as the inversion applies to the whole thing. Use {braces} to - * separate them from anything in the bitmap and anything above the - * bitmap. */ - if (unresolved) { - if (inverted) { - if (! do_sep) { /* If didn't output anything in the bitmap */ - sv_catpvs(sv, "^"); + if (OP(o) != ANYOFH) { + /* Then all the things that could fit in the bitmap */ + do_sep = put_charclass_bitmap_innards(sv, + ANYOF_BITMAP(o), + bitmap_range_not_in_bitmap, + only_utf8_locale_invlist, + o, + + /* Can't try inverting for a + * better display if there + * are things that haven't + * been resolved */ + unresolved != NULL); + SvREFCNT_dec(bitmap_range_not_in_bitmap); + + /* If there are user-defined properties which haven't been defined + * yet, output them. If the result is not to be inverted, it is + * clearest to output them in a separate [] from the bitmap range + * stuff. If the result is to be complemented, we have to show + * everything in one [], as the inversion applies to the whole + * thing. Use {braces} to separate them from anything in the + * bitmap and anything above the bitmap. */ + if (unresolved) { + if (inverted) { + if (! do_sep) { /* If didn't output anything in the bitmap + */ + sv_catpvs(sv, "^"); + } + sv_catpvs(sv, "{"); } - sv_catpvs(sv, "{"); - } - else if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], PL_colors[0]); - } - sv_catsv(sv, unresolved); - if (inverted) { - sv_catpvs(sv, "}"); + else if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s", PL_colors[1], + PL_colors[0]); + } + sv_catsv(sv, unresolved); + if (inverted) { + sv_catpvs(sv, "}"); + } + do_sep = ! inverted; } - do_sep = ! inverted; } /* And, finally, add the above-the-bitmap stuff */ @@ -20075,6 +20468,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + if (OP(o) == ANYOFH && FLAGS(o) != 0) { + Perl_sv_catpvf(aTHX_ sv, " (First UTF-8 byte=\\x%02x)", FLAGS(o)); + } + + SvREFCNT_dec(unresolved); } else if (k == ANYOFM) { @@ -20117,8 +20515,13 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ assert(FLAGS(o) < C_ARRAY_LENGTH(bounds)); sv_catpv(sv, bounds[FLAGS(o)]); } - else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) - Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags)); + else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) { + Perl_sv_catpvf(aTHX_ sv, "[%d", -(o->flags)); + if (o->next_off) { + Perl_sv_catpvf(aTHX_ sv, "..-%d", o->flags - o->next_off); + } + Perl_sv_catpvf(aTHX_ sv, "]"); + } else if (OP(o) == SBOL) Perl_sv_catpvf(aTHX_ sv, " /%s/", o->flags ? "\\A" : "^"); @@ -21064,6 +21467,7 @@ S_put_charclass_bitmap_innards_common(pTHX_ * output would have been only the inversion indicator '^', NULL is instead * returned. */ + dVAR; SV * output; PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; @@ -21167,6 +21571,8 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, * whether the class itself is to be inverted. However, there are some * cases where it can't try inverting, as what actually matches isn't known * until runtime, and hence the inversion isn't either. */ + + dVAR; bool inverting_allowed = ! force_as_is_display; int i; @@ -21561,6 +21967,17 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, void Perl_init_uniprops(pTHX) { + dVAR; + + PL_user_def_props = newHV(); + +#ifdef USE_ITHREADS + + HvSHAREKEYS_off(PL_user_def_props); + PL_user_def_props_aTHX = aTHX; + +#endif + /* Set up the inversion list global variables */ PL_XPosix_ptrs[_CC_ASCII] = _new_invlist_C_array(uni_prop_ptrs[UNI_ASCII]); @@ -21618,8 +22035,10 @@ Perl_init_uniprops(pTHX) PL_in_some_fold = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]); PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ UNI__PERL_FOLDS_TO_MULTI_CHAR]); - PL_InMultiCharFold = _new_invlist_C_array(UNI__PERL_IS_IN_MULTI_CHAR_FOLD_invlist); - PL_NonFinalFold = _new_invlist_C_array(UNI__PERL_NON_FINAL_FOLDS_invlist); + PL_InMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[ + UNI__PERL_IS_IN_MULTI_CHAR_FOLD]); + PL_NonFinalFold = _new_invlist_C_array(uni_prop_ptrs[ + UNI__PERL_NON_FINAL_FOLDS]); PL_utf8_toupper = _new_invlist_C_array(Uppercase_Mapping_invlist); PL_utf8_tolower = _new_invlist_C_array(Lowercase_Mapping_invlist); @@ -21628,6 +22047,8 @@ Perl_init_uniprops(pTHX) PL_utf8_tosimplefold = _new_invlist_C_array(Simple_Case_Folding_invlist); PL_utf8_foldclosures = _new_invlist_C_array(_Perl_IVCF_invlist); PL_utf8_mark = _new_invlist_C_array(uni_prop_ptrs[UNI_M]); + PL_CCC_non0_non230 = _new_invlist_C_array(_Perl_CCC_non0_non230_invlist); + PL_Private_Use = _new_invlist_C_array(uni_prop_ptrs[UNI_CO]); #ifdef UNI_XIDC /* The below are used only by deprecated functions. They could be removed */ @@ -21637,39 +22058,451 @@ Perl_init_uniprops(pTHX) #endif } -SV * -Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, - const bool to_fold, bool * invert) +#if 0 + +This code was mainly added for backcompat to give a warning for non-portable +code points in user-defined properties. But experiments showed that the +warning in earlier perls were only omitted on overflow, which should be an +error, so there really isnt a backcompat issue, and actually adding the +warning when none was present before might cause breakage, for little gain. So +khw left this code in, but not enabled. Tests were never added. + +embed.fnc entry: +Ei |const char *|get_extended_utf8_msg|const UV cp + +PERL_STATIC_INLINE const char * +S_get_extended_utf8_msg(pTHX_ const UV cp) { - /* Parse the interior meat of \p{} passed to this in 'name' with length - * 'name_len', and return an inversion list if a property with 'name' is - * found, or NULL if not. 'name' point to the input with leading and - * trailing space trimmed. 'to_fold' indicates if /i is in effect. + U8 dummy[UTF8_MAXBYTES + 1]; + HV *msgs; + SV **msg; + + uvchr_to_utf8_flags_msgs(dummy, cp, UNICODE_WARN_PERL_EXTENDED, + &msgs); + + msg = hv_fetchs(msgs, "text", 0); + assert(msg); + + (void) sv_2mortal((SV *) msgs); + + return SvPVX(*msg); +} + +#endif + +SV * +Perl_handle_user_defined_property(pTHX_ + + /* Parses the contents of a user-defined property definition; returning the + * expanded definition if possible. If so, the return is an inversion + * list. * - * When the return is an inversion list, '*invert' will be set to a boolean - * indicating if it should be inverted or not + * If there are subroutines that are part of the expansion and which aren't + * known at the time of the call to this function, this returns what + * parse_uniprop_string() returned for the first one encountered. * - * This currently doesn't handle all cases. A NULL return indicates the - * caller should try a different approach - */ + * If an error was found, NULL is returned, and 'msg' gets a suitable + * message appended to it. (Appending allows the back trace of how we got + * to the faulty definition to be displayed through nested calls of + * user-defined subs.) + * + * The caller IS responsible for freeing any returned SV. + * + * The syntax of the contents is pretty much described in perlunicode.pod, + * but we also allow comments on each line */ + + const char * name, /* Name of property */ + const STRLEN name_len, /* The name's length in bytes */ + const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ + const bool to_fold, /* ? Is this under /i */ + const bool runtime, /* ? Are we in compile- or run-time */ + const bool deferrable, /* Is it ok for this property's full definition + to be deferred until later? */ + SV* contents, /* The property's definition */ + bool *user_defined_ptr, /* This will be set TRUE as we wouldn't be + getting called unless this is thought to be + a user-defined property */ + SV * msg, /* Any error or warning msg(s) are appended to + this */ + const STRLEN level) /* Recursion level of this call */ +{ + STRLEN len; + const char * string = SvPV_const(contents, len); + const char * const e = string + len; + const bool is_contents_utf8 = cBOOL(SvUTF8(contents)); + const STRLEN msgs_length_on_entry = SvCUR(msg); + + const char * s0 = string; /* Points to first byte in the current line + being parsed in 'string' */ + const char overflow_msg[] = "Code point too large in \""; + SV* running_definition = NULL; + + PERL_ARGS_ASSERT_HANDLE_USER_DEFINED_PROPERTY; + + *user_defined_ptr = TRUE; + + /* Look at each line */ + while (s0 < e) { + const char * s; /* Current byte */ + char op = '+'; /* Default operation is 'union' */ + IV min = 0; /* range begin code point */ + IV max = -1; /* and range end */ + SV* this_definition; + + /* Skip comment lines */ + if (*s0 == '#') { + s0 = strchr(s0, '\n'); + if (s0 == NULL) { + break; + } + s0++; + continue; + } - char* lookup_name; - bool stricter = FALSE; - bool is_nv_type = FALSE; /* nv= or numeric_value=, or possibly one - of the cjk numeric properties (though - it requires extra effort to compile - them) */ - unsigned int i; - unsigned int j = 0, lookup_len; - int equals_pos = -1; /* Where the '=' is found, or negative if none */ - int slash_pos = -1; /* Where the '/' is found, or negative if none */ - int table_index = 0; - bool starts_with_In_or_Is = FALSE; - Size_t lookup_offset = 0; + /* For backcompat, allow an empty first line */ + if (*s0 == '\n') { + s0++; + continue; + } + + /* First character in the line may optionally be the operation */ + if ( *s0 == '+' + || *s0 == '!' + || *s0 == '-' + || *s0 == '&') + { + op = *s0++; + } + + /* If the line is one or two hex digits separated by blank space, its + * a range; otherwise it is either another user-defined property or an + * error */ + + s = s0; + + if (! isXDIGIT(*s)) { + goto check_if_property; + } + + do { /* Each new hex digit will add 4 bits. */ + if (min > ( (IV) MAX_LEGAL_CP >> 4)) { + s = strchr(s, '\n'); + if (s == NULL) { + s = e; + } + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpv(msg, overflow_msg); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, + UTF8fARG(is_contents_utf8, s - s0, s0)); + sv_catpvs(msg, "\""); + goto return_msg; + } + + /* Accumulate this digit into the value */ + min = (min << 4) + READ_XDIGIT(s); + } while (isXDIGIT(*s)); + + while (isBLANK(*s)) { s++; } + + /* We allow comments at the end of the line */ + if (*s == '#') { + s = strchr(s, '\n'); + if (s == NULL) { + s = e; + } + s++; + } + else if (s < e && *s != '\n') { + if (! isXDIGIT(*s)) { + goto check_if_property; + } + + /* Look for the high point of the range */ + max = 0; + do { + if (max > ( (IV) MAX_LEGAL_CP >> 4)) { + s = strchr(s, '\n'); + if (s == NULL) { + s = e; + } + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpv(msg, overflow_msg); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, + UTF8fARG(is_contents_utf8, s - s0, s0)); + sv_catpvs(msg, "\""); + goto return_msg; + } + + max = (max << 4) + READ_XDIGIT(s); + } while (isXDIGIT(*s)); + + while (isBLANK(*s)) { s++; } + + if (*s == '#') { + s = strchr(s, '\n'); + if (s == NULL) { + s = e; + } + } + else if (s < e && *s != '\n') { + goto check_if_property; + } + } + + if (max == -1) { /* The line only had one entry */ + max = min; + } + else if (max < min) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Illegal range in \""); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, + UTF8fARG(is_contents_utf8, s - s0, s0)); + sv_catpvs(msg, "\""); + goto return_msg; + } + +#if 0 /* See explanation at definition above of get_extended_utf8_msg() */ + + if ( UNICODE_IS_PERL_EXTENDED(min) + || UNICODE_IS_PERL_EXTENDED(max)) + { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + + /* If both code points are non-portable, warn only on the lower + * one. */ + sv_catpv(msg, get_extended_utf8_msg( + (UNICODE_IS_PERL_EXTENDED(min)) + ? min : max)); + sv_catpvs(msg, " in \""); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, + UTF8fARG(is_contents_utf8, s - s0, s0)); + sv_catpvs(msg, "\""); + } + +#endif + + /* Here, this line contains a legal range */ + this_definition = sv_2mortal(_new_invlist(2)); + this_definition = _add_range_to_invlist(this_definition, min, max); + goto calculate; + + check_if_property: + + /* Here it isn't a legal range line. See if it is a legal property + * line. First find the end of the meat of the line */ + s = strpbrk(s, "#\n"); + if (s == NULL) { + s = e; + } + + /* Ignore trailing blanks in keeping with the requirements of + * parse_uniprop_string() */ + s--; + while (s > s0 && isBLANK_A(*s)) { + s--; + } + s++; + + this_definition = parse_uniprop_string(s0, s - s0, + is_utf8, to_fold, runtime, + deferrable, + user_defined_ptr, msg, + (name_len == 0) + ? level /* Don't increase level + if input is empty */ + : level + 1 + ); + if (this_definition == NULL) { + goto return_msg; /* 'msg' should have had the reason appended to + it by the above call */ + } + + if (! is_invlist(this_definition)) { /* Unknown at this time */ + return newSVsv(this_definition); + } + + if (*s != '\n') { + s = strchr(s, '\n'); + if (s == NULL) { + s = e; + } + } + + calculate: + + switch (op) { + case '+': + _invlist_union(running_definition, this_definition, + &running_definition); + break; + case '-': + _invlist_subtract(running_definition, this_definition, + &running_definition); + break; + case '&': + _invlist_intersection(running_definition, this_definition, + &running_definition); + break; + case '!': + _invlist_union_complement_2nd(running_definition, + this_definition, &running_definition); + break; + default: + Perl_croak(aTHX_ "panic: %s: %d: Unexpected operation %d", + __FILE__, __LINE__, op); + break; + } + + /* Position past the '\n' */ + s0 = s + 1; + } /* End of loop through the lines of 'contents' */ + + /* Here, we processed all the lines in 'contents' without error. If we + * didn't add any warnings, simply return success */ + if (msgs_length_on_entry == SvCUR(msg)) { + + /* If the expansion was empty, the answer isn't nothing: its an empty + * inversion list */ + if (running_definition == NULL) { + running_definition = _new_invlist(1); + } + + return running_definition; + } + + /* Otherwise, add some explanatory text, but we will return success */ + + return_msg: + + if (name_len > 0) { + sv_catpvs(msg, " in expansion of "); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); + } + + return running_definition; +} + +/* As explained below, certain operations need to take place in the first + * thread created. These macros switch contexts */ +#ifdef USE_ITHREADS +# define DECLARATION_FOR_GLOBAL_CONTEXT \ + PerlInterpreter * save_aTHX = aTHX; +# define SWITCH_TO_GLOBAL_CONTEXT \ + PERL_SET_CONTEXT((aTHX = PL_user_def_props_aTHX)) +# define RESTORE_CONTEXT PERL_SET_CONTEXT((aTHX = save_aTHX)); +# define CUR_CONTEXT aTHX +# define ORIGINAL_CONTEXT save_aTHX +#else +# define DECLARATION_FOR_GLOBAL_CONTEXT +# define SWITCH_TO_GLOBAL_CONTEXT NOOP +# define RESTORE_CONTEXT NOOP +# define CUR_CONTEXT NULL +# define ORIGINAL_CONTEXT NULL +#endif + +STATIC void +S_delete_recursion_entry(pTHX_ void *key) +{ + /* Deletes the entry used to detect recursion when expanding user-defined + * properties. This is a function so it can be set up to be called even if + * the program unexpectedly quits */ + + dVAR; + SV ** current_entry; + const STRLEN key_len = strlen((const char *) key); + DECLARATION_FOR_GLOBAL_CONTEXT; + + SWITCH_TO_GLOBAL_CONTEXT; + + /* If the entry is one of these types, it is a permanent entry, and not the + * one used to detect recursions. This function should delete only the + * recursion entry */ + current_entry = hv_fetch(PL_user_def_props, (const char *) key, key_len, 0); + if ( current_entry + && ! is_invlist(*current_entry) + && ! SvPOK(*current_entry)) + { + (void) hv_delete(PL_user_def_props, (const char *) key, key_len, + G_DISCARD); + } + + RESTORE_CONTEXT; +} + +SV * +Perl_parse_uniprop_string(pTHX_ + + /* Parse the interior of a \p{}, \P{}. Returns its definition if knowable + * now. If so, the return is an inversion list. + * + * If the property is user-defined, it is a subroutine, which in turn + * may call other subroutines. This function will call the whole nest of + * them to get the definition they return; if some aren't known at the time + * of the call to this function, the fully qualified name of the highest + * level sub is returned. It is an error to call this function at runtime + * without every sub defined. + * + * If an error was found, NULL is returned, and 'msg' gets a suitable + * message appended to it. (Appending allows the back trace of how we got + * to the faulty definition to be displayed through nested calls of + * user-defined subs.) + * + * The caller should NOT try to free any returned inversion list. + * + * Other parameters will be set on return as described below */ + + const char * const name, /* The first non-blank in the \p{}, \P{} */ + const Size_t name_len, /* Its length in bytes, not including any + trailing space */ + const bool is_utf8, /* ? Is 'name' encoded in UTF-8 */ + const bool to_fold, /* ? Is this under /i */ + const bool runtime, /* TRUE if this is being called at run time */ + const bool deferrable, /* TRUE if it's ok for the definition to not be + known at this call */ + bool *user_defined_ptr, /* Upon return from this function it will be + set to TRUE if any component is a + user-defined property */ + SV * msg, /* Any error or warning msg(s) are appended to + this */ + const STRLEN level) /* Recursion level of this call */ +{ + dVAR; + char* lookup_name; /* normalized name for lookup in our tables */ + unsigned lookup_len; /* Its length */ + bool stricter = FALSE; /* Some properties have stricter name + normalization rules, which we decide upon + based on parsing */ + + /* nv= or numeric_value=, or possibly one of the cjk numeric properties + * (though it requires extra effort to download them from Unicode and + * compile perl to know about them) */ + bool is_nv_type = FALSE; + + unsigned int i, j = 0; + int equals_pos = -1; /* Where the '=' is found, or negative if none */ + int slash_pos = -1; /* Where the '/' is found, or negative if none */ + int table_index = 0; /* The entry number for this property in the table + of all Unicode property names */ + bool starts_with_In_or_Is = FALSE; /* ? Does the name start with 'In' or + 'Is' */ + Size_t lookup_offset = 0; /* Used to ignore the first few characters of + the normalized name in certain situations */ + Size_t non_pkg_begin = 0; /* Offset of first byte in 'name' that isn't + part of a package name */ + bool could_be_user_defined = TRUE; /* ? Could this be a user-defined + property rather than a Unicode + one. */ + SV * prop_definition = NULL; /* The returned definition of 'name' or NULL + if an error. If it is an inversion list, + it is the definition. Otherwise it is a + string containing the fully qualified sub + name of 'name' */ + bool invert_return = FALSE; /* ? Do we need to complement the result before + returning it */ PERL_ARGS_ASSERT_PARSE_UNIPROP_STRING; - /* The input will be modified into 'lookup_name' */ + /* The input will be normalized into 'lookup_name' */ Newx(lookup_name, name_len, char); SAVEFREEPV(lookup_name); @@ -21677,40 +22510,86 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, for (i = 0; i < name_len; i++) { char cur = name[i]; - /* These characters can be freely ignored in most situations. Later it - * may turn out we shouldn't have ignored them, and we have to reparse, - * but we don't have enough information yet to make that decision */ - if (cur == '-' || cur == '_' || isSPACE_A(cur)) { + /* Most of the characters in the input will be of this ilk, being parts + * of a name */ + if (isIDCONT_A(cur)) { + + /* Case differences are ignored. Our lookup routine assumes + * everything is lowercase, so normalize to that */ + if (isUPPER_A(cur)) { + lookup_name[j++] = toLOWER_A(cur); + continue; + } + + if (cur == '_') { /* Don't include these in the normalized name */ + continue; + } + + lookup_name[j++] = cur; + + /* The first character in a user-defined name must be of this type. + * */ + if (i - non_pkg_begin == 0 && ! isIDFIRST_A(cur)) { + could_be_user_defined = FALSE; + } + continue; } - /* Case differences are also ignored. Our lookup routine assumes - * everything is lowercase */ - if (isUPPER_A(cur)) { - lookup_name[j++] = toLOWER(cur); + /* Here, the character is not something typically in a name, But these + * two types of characters (and the '_' above) can be freely ignored in + * most situations. Later it may turn out we shouldn't have ignored + * them, and we have to reparse, but we don't have enough information + * yet to make that decision */ + if (cur == '-' || isSPACE_A(cur)) { + could_be_user_defined = FALSE; continue; } - /* A double colon is either an error, or a package qualifier to a - * subroutine user-defined property; neither of which do we currently - * handle - * - * But a single colon is a synonym for '=' */ - if (cur == ':') { - if (i < name_len - 1 && name[i+1] == ':') { - return NULL; - } - cur = '='; + /* An equals sign or single colon mark the end of the first part of + * the property name */ + if ( cur == '=' + || (cur == ':' && (i >= name_len - 1 || name[i+1] != ':'))) + { + lookup_name[j++] = '='; /* Treat the colon as an '=' */ + equals_pos = j; /* Note where it occurred in the input */ + could_be_user_defined = FALSE; + break; } /* Otherwise, this character is part of the name. */ lookup_name[j++] = cur; - /* Only the equals sign needs further processing */ - if (cur == '=') { - equals_pos = j; /* Note where it occurred in the input */ - break; + /* Here it isn't a single colon, so if it is a colon, it must be a + * double colon */ + if (cur == ':') { + + /* A double colon should be a package qualifier. We note its + * position and continue. Note that one could have + * pkg1::pkg2::...::foo + * so that the position at the end of the loop will be just after + * the final qualifier */ + + i++; + non_pkg_begin = i + 1; + lookup_name[j++] = ':'; } + else { /* Only word chars (and '::') can be in a user-defined name */ + could_be_user_defined = FALSE; + } + } /* End of parsing through the lhs of the property name (or all of it if + no rhs) */ + +#define STRLENs(s) (sizeof("" s "") - 1) + + /* If there is a single package name 'utf8::', it is ambiguous. It could + * be for a user-defined property, or it could be a Unicode property, as + * all of them are considered to be for that package. For the purposes of + * parsing the rest of the property, strip it off */ + if (non_pkg_begin == STRLENs("utf8::") && memBEGINPs(name, name_len, "utf8::")) { + lookup_name += STRLENs("utf8::"); + j -= STRLENs("utf8::"); + equals_pos -= STRLENs("utf8::"); } /* Here, we are either done with the whole property name, if it was simple; @@ -21727,17 +22606,184 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, } } - /* Certain properties need special handling. They may optionally be - * prefixed by 'is'. Ignore that prefix for the purposes of checking - * if this is one of those properties */ + /* Most punctuation after the equals indicates a subpattern, like + * \p{foo=/bar/} */ + if ( isPUNCT_A(name[i]) + && name[i] != '-' + && name[i] != '+' + && name[i] != '_' + && name[i] != '{') + { + /* Find the property. The table includes the equals sign, so we + * use 'j' as-is */ + table_index = match_uniprop((U8 *) lookup_name, j); + if (table_index) { + const char * const * prop_values + = UNI_prop_value_ptrs[table_index]; + SV * subpattern; + Size_t subpattern_len; + REGEXP * subpattern_re; + char open = name[i++]; + char close; + const char * pos_in_brackets; + bool escaped = 0; + + /* A backslash means the real delimitter is the next character. + * */ + if (open == '\\') { + open = name[i++]; + escaped = 1; + } + + /* This data structure is constructed so that the matching + * closing bracket is 3 past its matching opening. The second + * set of closing is so that if the opening is something like + * ']', the closing will be that as well. Something similar is + * done in toke.c */ + pos_in_brackets = strchr("([<)]>)]>", open); + close = (pos_in_brackets) ? pos_in_brackets[3] : open; + + if ( name[name_len-1] != close + || (escaped && name[name_len-2] != '\\')) + { + sv_catpvs(msg, "Unicode property wildcard not terminated"); + goto append_name_to_msg; + } + + Perl_ck_warner_d(aTHX_ + packWARN(WARN_EXPERIMENTAL__UNIPROP_WILDCARDS), + "The Unicode property wildcards feature is experimental"); + + /* Now create and compile the wildcard subpattern. Use /iaa + * because nothing outside of ASCII will match, and it the + * property values should all match /i. Note that when the + * pattern fails to compile, our added text to the user's + * pattern will be displayed to the user, which is not so + * desirable. */ + subpattern_len = name_len - i - 1 - escaped; + subpattern = Perl_newSVpvf(aTHX_ "(?iaa:%.*s)", + (unsigned) subpattern_len, + name + i); + subpattern = sv_2mortal(subpattern); + subpattern_re = re_compile(subpattern, 0); + assert(subpattern_re); /* Should have died if didn't compile + successfully */ + + /* For each legal property value, see if the supplied pattern + * matches it. */ + while (*prop_values) { + const char * const entry = *prop_values; + const Size_t len = strlen(entry); + SV* entry_sv = newSVpvn_flags(entry, len, SVs_TEMP); + + if (pregexec(subpattern_re, + (char *) entry, + (char *) entry + len, + (char *) entry, 0, + entry_sv, + 0)) + { /* Here, matched. Add to the returned list */ + Size_t total_len = j + len; + SV * sub_invlist = NULL; + char * this_string; + + /* We know this is a legal \p{property=value}. Call + * the function to return the list of code points that + * match it */ + Newxz(this_string, total_len + 1, char); + Copy(lookup_name, this_string, j, char); + my_strlcat(this_string, entry, total_len + 1); + SAVEFREEPV(this_string); + sub_invlist = parse_uniprop_string(this_string, + total_len, + is_utf8, + to_fold, + runtime, + deferrable, + user_defined_ptr, + msg, + level + 1); + _invlist_union(prop_definition, sub_invlist, + &prop_definition); + } + + prop_values++; /* Next iteration, look at next propvalue */ + } /* End of looking through property values; (the data + structure is terminated by a NULL ptr) */ + + SvREFCNT_dec_NN(subpattern_re); + + if (prop_definition) { + return prop_definition; + } + + sv_catpvs(msg, "No Unicode property value wildcard matches:"); + goto append_name_to_msg; + } + + /* Here's how khw thinks we should proceed to handle the properties + * not yet done: Bidi Mirroring Glyph + Bidi Paired Bracket + Case Folding (both full and simple) + Decomposition Mapping + Equivalent Unified Ideograph + Name + Name Alias + Lowercase Mapping (both full and simple) + NFKC Case Fold + Titlecase Mapping (both full and simple) + Uppercase Mapping (both full and simple) + * Move the part that looks at the property values into a perl + * script, like utf8_heavy.pl is done. This makes things somewhat + * easier, but most importantly, it avoids always adding all these + * strings to the memory usage when the feature is little-used. + * + * The property values would all be concatenated into a single + * string per property with each value on a separate line, and the + * code point it's for on alternating lines. Then we match the + * user's input pattern m//mg, without having to worry about their + * uses of '^' and '$'. Only the values that aren't the default + * would be in the strings. Code points would be in UTF-8. The + * search pattern that we would construct would look like + * (?: \n (code-point_re) \n (?aam: user-re ) \n ) + * And so $1 would contain the code point that matched the user-re. + * For properties where the default is the code point itself, such + * as any of the case changing mappings, the string would otherwise + * consist of all Unicode code points in UTF-8 strung together. + * This would be impractical. So instead, examine their compiled + * pattern, looking at the ssc. If none, reject the pattern as an + * error. Otherwise run the pattern against every code point in + * the ssc. The ssc is kind of like tr18's 3.9 Possible Match Sets + * And it might be good to create an API to return the ssc. + * + * For the name properties, a new function could be created in + * charnames which essentially does the same thing as above, + * sharing Name.pl with the other charname functions. Don't know + * about loose name matching, or algorithmically determined names. + * Decomposition.pl similarly. + * + * It might be that a new pattern modifier would have to be + * created, like /t for resTricTed, which changed the behavior of + * some constructs in their subpattern, like \A. */ + } /* End of is a wildcard subppattern */ + + + /* Certain properties whose values are numeric need special handling. + * They may optionally be prefixed by 'is'. Ignore that prefix for the + * purposes of checking if this is one of those properties */ if (memBEGINPs(lookup_name, name_len, "is")) { lookup_offset = 2; } - /* Then check if it is one of these properties. This is hard-coded - * because easier this way, and the list is unlikely to change. There - * are several properties like this in the Unihan DB, which is unlikely - * to be compiled, and they all end with 'numeric'. The interiors + /* Then check if it is one of these specially-handled properties. The + * possibilities are hard-coded because easier this way, and the list + * is unlikely to change. + * + * All numeric value type properties are of this ilk, and are also + * special in a different way later on. So find those first. There + * are several numeric value type properties in the Unihan DB (which is + * unlikely to be compiled with perl, but we handle it here in case it + * does get compiled). They all end with 'numeric'. The interiors * aren't checked for the precise property. This would stop working if * a cjk property were to be created that ended with 'numeric' and * wasn't a numeric type */ @@ -21765,15 +22811,14 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, { unsigned int k; - /* What makes these properties special is that the stuff after the - * '=' is a number. Therefore, we can't throw away '-' - * willy-nilly, as those could be a minus sign. Other stricter + /* Since the stuff after the '=' is a number, we can't throw away + * '-' willy-nilly, as those could be a minus sign. Other stricter * rules also apply. However, these properties all can have the * rhs not be a number, in which case they contain at least one * alphabetic. In those cases, the stricter rules don't apply. * But the numeric type properties can have the alphas [Ee] to * signify an exponent, and it is still a number with stricter - * rules. So look for an alpha that signifys not-strict */ + * rules. So look for an alpha that signifies not-strict */ stricter = TRUE; for (k = i; k < name_len; k++) { if ( isALPHA_A(name[k]) @@ -21801,7 +22846,7 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, * zeros, or between the final leading zero and the first other * digit */ for (; i < name_len - 1; i++) { - if ( name[i] != '0' + if ( name[i] != '0' && (name[i] != '_' || ! isDIGIT_A(name[i+1]))) { break; @@ -21811,9 +22856,8 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, } else { /* No '=' */ - /* We are now in a position to determine if this property should have - * been parsed using stricter rules. Only a few are like that, and - * unlikely to change. */ + /* Only a few properties without an '=' should be parsed with stricter + * rules. The list is unlikely to change. */ if ( memBEGINPs(lookup_name, j, "perl") && memNEs(lookup_name + 4, j - 4, "space") && memNEs(lookup_name + 4, j - 4, "word")) @@ -21908,33 +22952,309 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, { lookup_name[j++] = '&'; } - else if (name_len > 2 && name[0] == 'I' && ( name[1] == 'n' - || name[1] == 's')) - { - - /* Also, if the original input began with 'In' or 'Is', it could be a - * subroutine call instead of a property names, which currently isn't - * handled by this function. Subroutine calls can't happen if there is - * an '=' in the name */ - if (equals_pos < 0 && get_cvn_flags(name, name_len, GV_NOTQUAL) != NULL) - { - return NULL; - } + /* If the original input began with 'In' or 'Is', it could be a subroutine + * call to a user-defined property instead of a Unicode property name. */ + if ( non_pkg_begin + name_len > 2 + && name[non_pkg_begin+0] == 'I' + && (name[non_pkg_begin+1] == 'n' || name[non_pkg_begin+1] == 's')) + { starts_with_In_or_Is = TRUE; } + else { + could_be_user_defined = FALSE; + } + + if (could_be_user_defined) { + CV* user_sub; + + /* Here, the name could be for a user defined property, which are + * implemented as subs. */ + user_sub = get_cvn_flags(name, name_len, 0); + if (user_sub) { + + /* Here, there is a sub by the correct name. Normally we call it + * to get the property definition */ + dSP; + SV * user_sub_sv = MUTABLE_SV(user_sub); + SV * error; /* Any error returned by calling 'user_sub' */ + SV * fq_name; /* Fully qualified property name */ + SV * placeholder; + char to_fold_string[] = "0:"; /* The 0 gets overwritten with the + actual value */ + SV ** saved_user_prop_ptr; /* Hash entry for this property */ + + /* How many times to retry when another thread is in the middle of + * expanding the same definition we want */ + PERL_INT_FAST8_T retry_countdown = 10; + + DECLARATION_FOR_GLOBAL_CONTEXT; + + /* If we get here, we know this property is user-defined */ + *user_defined_ptr = TRUE; + + /* We refuse to call a tainted subroutine; returning an error + * instead */ + if (TAINT_get) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Insecure user-defined property"); + goto append_name_to_msg; + } + + /* In principal, we only call each subroutine property definition + * once during the life of the program. This guarantees that the + * property definition never changes. The results of the single + * sub call are stored in a hash, which is used instead for future + * references to this property. The property definition is thus + * immutable. But, to allow the user to have a /i-dependent + * definition, we call the sub once for non-/i, and once for /i, + * should the need arise, passing the /i status as a parameter. + * + * We start by constructing the hash key name, consisting of the + * fully qualified subroutine name */ + fq_name = sv_2mortal(newSV(10)); /* 10 is just a guess */ + (void) cv_name(user_sub, fq_name, 0); + + /* But precede the sub name in the key with the /i status, so that + * there is a key for /i and a different key for non-/i */ + to_fold_string[0] = to_fold + '0'; + sv_insert(fq_name, 0, 0, to_fold_string, 2); + + /* We only call the sub once throughout the life of the program + * (with the /i, non-/i exception noted above). That means the + * hash must be global and accessible to all threads. It is + * created at program start-up, before any threads are created, so + * is accessible to all children. But this creates some + * complications. + * + * 1) The keys can't be shared, or else problems arise; sharing is + * turned off at hash creation time + * 2) All SVs in it are there for the remainder of the life of the + * program, and must be created in the same interpreter context + * as the hash, or else they will be freed from the wrong pool + * at global destruction time. This is handled by switching to + * the hash's context to create each SV going into it, and then + * immediately switching back + * 3) All accesses to the hash must be controlled by a mutex, to + * prevent two threads from getting an unstable state should + * they simultaneously be accessing it. The code below is + * crafted so that the mutex is locked whenever there is an + * access and unlocked only when the next stable state is + * achieved. + * + * The hash stores either the definition of the property if it was + * valid, or, if invalid, the error message that was raised. We + * use the type of SV to distinguish. + * + * There's also the need to guard against the definition expansion + * from infinitely recursing. This is handled by storing the aTHX + * of the expanding thread during the expansion. Again the SV type + * is used to distinguish this from the other two cases. If we + * come to here and the hash entry for this property is our aTHX, + * it means we have recursed, and the code assumes that we would + * infinitely recurse, so instead stops and raises an error. + * (Any recursion has always been treated as infinite recursion in + * this feature.) + * + * If instead, the entry is for a different aTHX, it means that + * that thread has gotten here first, and hasn't finished expanding + * the definition yet. We just have to wait until it is done. We + * sleep and retry a few times, returning an error if the other + * thread doesn't complete. */ + + re_fetch: + USER_PROP_MUTEX_LOCK; + + /* If we have an entry for this key, the subroutine has already + * been called once with this /i status. */ + saved_user_prop_ptr = hv_fetch(PL_user_def_props, + SvPVX(fq_name), SvCUR(fq_name), 0); + if (saved_user_prop_ptr) { - lookup_len = j; /* Use a more mnemonic name starting here */ + /* If the saved result is an inversion list, it is the valid + * definition of this property */ + if (is_invlist(*saved_user_prop_ptr)) { + prop_definition = *saved_user_prop_ptr; + + /* The SV in the hash won't be removed until global + * destruction, so it is stable and we can unlock */ + USER_PROP_MUTEX_UNLOCK; + + /* The caller shouldn't try to free this SV */ + return prop_definition; + } + + /* Otherwise, if it is a string, it is the error message + * that was returned when we first tried to evaluate this + * property. Fail, and append the message */ + if (SvPOK(*saved_user_prop_ptr)) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catsv(msg, *saved_user_prop_ptr); + + /* The SV in the hash won't be removed until global + * destruction, so it is stable and we can unlock */ + USER_PROP_MUTEX_UNLOCK; + + return NULL; + } + + assert(SvIOK(*saved_user_prop_ptr)); + + /* Here, we have an unstable entry in the hash. Either another + * thread is in the middle of expanding the property's + * definition, or we are ourselves recursing. We use the aTHX + * in it to distinguish */ + if (SvIV(*saved_user_prop_ptr) != PTR2IV(CUR_CONTEXT)) { + + /* Here, it's another thread doing the expanding. We've + * looked as much as we are going to at the contents of the + * hash entry. It's safe to unlock. */ + USER_PROP_MUTEX_UNLOCK; + + /* Retry a few times */ + if (retry_countdown-- > 0) { + PerlProc_sleep(1); + goto re_fetch; + } + + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Timeout waiting for another thread to " + "define"); + goto append_name_to_msg; + } + + /* Here, we are recursing; don't dig any deeper */ + USER_PROP_MUTEX_UNLOCK; + + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, + "Infinite recursion in user-defined property"); + goto append_name_to_msg; + } + + /* Here, this thread has exclusive control, and there is no entry + * for this property in the hash. So we have the go ahead to + * expand the definition ourselves. */ + + ENTER; + + /* Create a temporary placeholder in the hash to detect recursion + * */ + SWITCH_TO_GLOBAL_CONTEXT; + placeholder= newSVuv(PTR2IV(ORIGINAL_CONTEXT)); + (void) hv_store_ent(PL_user_def_props, fq_name, placeholder, 0); + RESTORE_CONTEXT; + + /* Now that we have a placeholder, we can let other threads + * continue */ + USER_PROP_MUTEX_UNLOCK; + + /* Make sure the placeholder always gets destroyed */ + SAVEDESTRUCTOR_X(S_delete_recursion_entry, SvPVX(fq_name)); + + PUSHMARK(SP); + SAVETMPS; + + /* Call the user's function, with the /i status as a parameter. + * Note that we have gone to a lot of trouble to keep this call + * from being within the locked mutex region. */ + XPUSHs(boolSV(to_fold)); + PUTBACK; + + (void) call_sv(user_sub_sv, G_EVAL|G_SCALAR); + + SPAGAIN; + + error = ERRSV; + if (SvTRUE(error)) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Error \""); + sv_catsv(msg, error); + sv_catpvs(msg, "\""); + if (name_len > 0) { + sv_catpvs(msg, " in expansion of "); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, + name_len, + name)); + } + + (void) POPs; + prop_definition = NULL; + } + else { /* G_SCALAR guarantees a single return value */ + + /* The contents is supposed to be the expansion of the property + * definition. Call a function to check for valid syntax and + * handle it */ + prop_definition = handle_user_defined_property(name, name_len, + is_utf8, to_fold, runtime, + deferrable, + POPs, user_defined_ptr, + msg, + level); + } + + /* Here, we have the results of the expansion. Replace the + * placeholder with them. We need exclusive access to the hash, + * and we can't let anyone else in, between when we delete the + * placeholder and add the permanent entry */ + USER_PROP_MUTEX_LOCK; + + S_delete_recursion_entry(aTHX_ SvPVX(fq_name)); + + if (! prop_definition || is_invlist(prop_definition)) { + + /* If we got success we use the inversion list defining the + * property; otherwise use the error message */ + SWITCH_TO_GLOBAL_CONTEXT; + (void) hv_store_ent(PL_user_def_props, + fq_name, + ((prop_definition) + ? newSVsv(prop_definition) + : newSVsv(msg)), + 0); + RESTORE_CONTEXT; + } + + /* All done, and the hash now has a permanent entry for this + * property. Give up exclusive control */ + USER_PROP_MUTEX_UNLOCK; + + FREETMPS; + LEAVE; + + if (prop_definition) { + + /* If the definition is for something not known at this time, + * we toss it, and go return the main property name, as that's + * the one the user will be aware of */ + if (! is_invlist(prop_definition)) { + SvREFCNT_dec_NN(prop_definition); + goto definition_deferred; + } + + sv_2mortal(prop_definition); + } + + /* And return */ + return prop_definition; + + } /* End of calling the subroutine for the user-defined property */ + } /* End of it could be a user-defined property */ + + /* Here it wasn't a user-defined property that is known at this time. See + * if it is a Unicode property */ + + lookup_len = j; /* This is a more mnemonic name than 'j' */ /* Get the index into our pointer table of the inversion list corresponding * to the property */ table_index = match_uniprop((U8 *) lookup_name, lookup_len); - /* If it didn't find the property */ + /* If it didn't find the property ... */ if (table_index == 0) { - /* If didn't find the property, we try again stripping off any initial - * 'In' or 'Is' */ + /* Try again stripping off any initial 'In' or 'Is' */ if (starts_with_In_or_Is) { lookup_name += 2; lookup_len -= 2; @@ -21947,14 +23267,28 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, if (table_index == 0) { char * canonical; - /* If not found, and not a numeric type property, isn't a legal - * property */ + /* Here, we didn't find it. If not a numeric type property, and + * can't be a user-defined one, it isn't a legal property */ if (! is_nv_type) { - return NULL; - } + if (! could_be_user_defined) { + goto failed; + } - /* But the numeric type properties need more work to decide. What - * we do is make sure we have the number in canonical form and look + /* Here, the property name is legal as a user-defined one. At + * compile time, it might just be that the subroutine for that + * property hasn't been encountered yet, but at runtime, it's + * an error to try to use an undefined one */ + if (! deferrable) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Unknown user-defined property name"); + goto append_name_to_msg; + } + + goto definition_deferred; + } /* End of isn't a numeric type property */ + + /* The numeric type properties need more work to decide. What we + * do is make sure we have the number in canonical form and look * that up. */ if (slash_pos < 0) { /* No slash */ @@ -21970,13 +23304,14 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, lookup_len - equals_pos) != lookup_name + lookup_len) { - return NULL; + goto failed; } - /* If the value is an integer, the canonical value is integral */ + /* If the value is an integer, the canonical value is integral + * */ if (Perl_ceil(value) == value) { canonical = Perl_form(aTHX_ "%.*s%.0" NVff, - equals_pos, lookup_name, value); + equals_pos, lookup_name, value); } else { /* Otherwise, it is %e with a known precision */ char * exp_ptr; @@ -22038,12 +23373,12 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, /* Convert the numerator to numeric */ end_ptr = this_lookup_name + slash_pos; if (! grok_atoUV(this_lookup_name, &numerator, &end_ptr)) { - return NULL; + goto failed; } /* It better have included all characters before the slash */ if (*end_ptr != '/') { - return NULL; + goto failed; } /* Set to look at just the denominator */ @@ -22053,7 +23388,7 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, /* Convert the denominator to numeric */ if (! grok_atoUV(this_lookup_name, &denominator, &end_ptr)) { - return NULL; + goto failed; } /* It better be the rest of the characters, and don't divide by @@ -22061,7 +23396,7 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, if ( end_ptr != this_lookup_name + lookup_len || denominator == 0) { - return NULL; + goto failed; } /* Get the greatest common denominator using @@ -22077,11 +23412,11 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, /* If already in lowest possible terms, we have already tried * looking this up */ if (gcd == 1) { - return NULL; + goto failed; } - /* Reduce the rational, which should put it in canonical form. - * Then look it up */ + /* Reduce the rational, which should put it in canonical form + * */ numerator /= gcd; denominator /= gcd; @@ -22092,26 +23427,23 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, /* Here, we have the number in canonical form. Try that */ table_index = match_uniprop((U8 *) canonical, strlen(canonical)); if (table_index == 0) { - return NULL; + goto failed; } - } - } + } /* End of still didn't find the property in our table */ + } /* End of didn't find the property in our table */ - /* The return is an index into a table of ptrs. A negative return - * signifies that the real index is the absolute value, but the result - * needs to be inverted */ + /* Here, we have a non-zero return, which is an index into a table of ptrs. + * A negative return signifies that the real index is the absolute value, + * but the result needs to be inverted */ if (table_index < 0) { - *invert = TRUE; + invert_return = TRUE; table_index = -table_index; } - else { - *invert = FALSE; - } /* Out-of band indices indicate a deprecated property. The proper index is * modulo it with the table size. And dividing by the table size yields - * an offset into a table constructed to contain the corresponding warning - * message */ + * an offset into a table constructed by regen/mk_invlists.pl to contain + * the corresponding warning message */ if (table_index > MAX_UNI_KEYWORD_INDEX) { Size_t warning_offset = table_index / MAX_UNI_KEYWORD_INDEX; table_index %= MAX_UNI_KEYWORD_INDEX; @@ -22145,7 +23477,118 @@ Perl_parse_uniprop_string(pTHX_ const char * const name, const Size_t name_len, } /* Create and return the inversion list */ - return _new_invlist_C_array(uni_prop_ptrs[table_index]); + prop_definition =_new_invlist_C_array(uni_prop_ptrs[table_index]); + sv_2mortal(prop_definition); + + + /* See if there is a private use override to add to this definition */ + { + COPHH * hinthash = (IN_PERL_COMPILETIME) + ? CopHINTHASH_get(&PL_compiling) + : CopHINTHASH_get(PL_curcop); + SV * pu_overrides = cophh_fetch_pv(hinthash, "private_use", 0, 0); + + if (UNLIKELY(pu_overrides && SvPOK(pu_overrides))) { + + /* See if there is an element in the hints hash for this table */ + SV * pu_lookup = Perl_newSVpvf(aTHX_ "%d=", table_index); + const char * pos = strstr(SvPVX(pu_overrides), SvPVX(pu_lookup)); + + if (pos) { + bool dummy; + SV * pu_definition; + SV * pu_invlist; + SV * expanded_prop_definition = + sv_2mortal(invlist_clone(prop_definition, NULL)); + + /* If so, it's definition is the string from here to the next + * \a character. And its format is the same as a user-defined + * property */ + pos += SvCUR(pu_lookup); + pu_definition = newSVpvn(pos, strchr(pos, '\a') - pos); + pu_invlist = handle_user_defined_property(lookup_name, + lookup_len, + 0, /* Not UTF-8 */ + 0, /* Not folded */ + runtime, + deferrable, + pu_definition, + &dummy, + msg, + level); + if (TAINT_get) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Insecure private-use override"); + goto append_name_to_msg; + } + + /* For now, as a safety measure, make sure that it doesn't + * override non-private use code points */ + _invlist_intersection(pu_invlist, PL_Private_Use, &pu_invlist); + + /* Add it to the list to be returned */ + _invlist_union(prop_definition, pu_invlist, + &expanded_prop_definition); + prop_definition = expanded_prop_definition; + Perl_ck_warner_d(aTHX_ packWARN(WARN_EXPERIMENTAL__PRIVATE_USE), "The private_use feature is experimental"); + } + } + } + + if (invert_return) { + _invlist_invert(prop_definition); + } + return prop_definition; + + + failed: + if (non_pkg_begin != 0) { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Illegal user-defined property name"); + } + else { + if (SvCUR(msg) > 0) sv_catpvs(msg, "; "); + sv_catpvs(msg, "Can't find Unicode property definition"); + } + /* FALLTHROUGH */ + + append_name_to_msg: + { + const char * prefix = (runtime && level == 0) ? " \\p{" : " \""; + const char * suffix = (runtime && level == 0) ? "}" : "\""; + + sv_catpv(msg, prefix); + Perl_sv_catpvf(aTHX_ msg, "%" UTF8f, UTF8fARG(is_utf8, name_len, name)); + sv_catpv(msg, suffix); + } + + return NULL; + + definition_deferred: + + /* Here it could yet to be defined, so defer evaluation of this + * until its needed at runtime. */ + prop_definition = newSVpvs_flags("", SVs_TEMP); + + /* To avoid any ambiguity, the package is always specified. + * Use the current one if it wasn't included in our input */ + if (non_pkg_begin == 0) { + const HV * pkg = (IN_PERL_COMPILETIME) + ? PL_curstash + : CopSTASH(PL_curcop); + const char* pkgname = HvNAME(pkg); + + Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f, + UTF8fARG(is_utf8, strlen(pkgname), pkgname)); + sv_catpvs(prop_definition, "::"); + } + + Perl_sv_catpvf(aTHX_ prop_definition, "%" UTF8f, + UTF8fARG(is_utf8, name_len, name)); + sv_catpvs(prop_definition, "\n"); + + *user_defined_ptr = TRUE; + return prop_definition; } #endif