X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/fc2ed17c144434ee3dde93c77bd1cd59b52da134..bec88f1bea9be699db294f4c38b20fef00a7f605:/regcomp.c diff --git a/regcomp.c b/regcomp.c index a30f2a2..e57f233 100644 --- a/regcomp.c +++ b/regcomp.c @@ -105,6 +105,10 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define MIN(a,b) ((a) < (b) ? (a) : (b)) #endif +#ifndef MAX +#define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + /* this is a chain of data about sub patterns we are processing that need to be handled separately/specially in study_chunk. Its so we can simulate recursion without losing state. */ @@ -164,7 +168,7 @@ struct RExC_state_t { I32 seen_zerolen; regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ - regnode *opend; /* END node in program */ + regnode *end_op; /* END node in program */ I32 utf8; /* whether the pattern is utf8 or not */ I32 orig_utf8; /* whether the pattern was originally in utf8 */ /* XXX use this for future optimisation of case @@ -175,7 +179,7 @@ struct RExC_state_t { HV *paren_names; /* Paren names */ regnode **recurse; /* Recurse regops */ - I32 recurse_count; /* Number of recurse regops */ + I32 recurse_count; /* Number of recurse regops we have generated */ U8 *study_chunk_recursed; /* bitmap of which subs we have moved through */ U32 study_chunk_recursed_bytes; /* bytes in bitmap */ @@ -195,7 +199,7 @@ struct RExC_state_t { scan_frame *frame_head; scan_frame *frame_last; U32 frame_count; - U32 strict; + AV *warn_text; #ifdef ADD_TO_REGEXEC char *starttry; /* -Dr: where regtry was called. */ #define RExC_starttry (pRExC_state->starttry) @@ -218,6 +222,8 @@ struct RExC_state_t { #endif bool seen_unfolded_sharp_s; + bool strict; + bool study_started; }; #define RExC_flags (pRExC_state->flags) @@ -265,7 +271,7 @@ struct RExC_state_t { #define RExC_orig_utf8 (pRExC_state->orig_utf8) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) -#define RExC_opend (pRExC_state->opend) +#define RExC_end_op (pRExC_state->end_op) #define RExC_paren_names (pRExC_state->paren_names) #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) @@ -284,6 +290,8 @@ struct RExC_state_t { #define RExC_frame_last (pRExC_state->frame_last) #define RExC_frame_count (pRExC_state->frame_count) #define RExC_strict (pRExC_state->strict) +#define RExC_study_started (pRExC_state->study_started) +#define RExC_warn_text (pRExC_state->warn_text) /* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set * a flag to disable back-off on the fixed/floating substrings - if it's @@ -498,7 +506,20 @@ static const scan_data_t zero_scan_data = #define SF_HAS_PAR 0x0080 #define SF_IN_PAR 0x0100 #define SF_HAS_EVAL 0x0200 + + +/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the + * longest substring in the pattern. When it is not set the optimiser keeps + * track of position, but does not keep track of the actual strings seen, + * + * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but + * /foo/i will not. + * + * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble" + * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be + * turned off because of the alternation (BRANCH). */ #define SCF_DO_SUBSTR 0x0400 + #define SCF_DO_STCLASS_AND 0x0800 #define SCF_DO_STCLASS_OR 0x1000 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) @@ -894,52 +915,78 @@ static const scan_data_t zero_scan_data = #define EXPERIMENTAL_INPLACESCAN #endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/ -#define DEBUG_RExC_seen() \ +#ifdef DEBUGGING +int +Perl_re_printf(pTHX_ const char *fmt, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_PRINTF; + va_start(ap, fmt); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} + +int +Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) +{ + va_list ap; + int result; + PerlIO *f= Perl_debug_log; + PERL_ARGS_ASSERT_RE_INDENTF; + va_start(ap, depth); + PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, ""); + result = PerlIO_vprintf(f, fmt, ap); + va_end(ap); + return result; +} +#endif /* DEBUGGING */ + +#define DEBUG_RExC_seen() \ DEBUG_OPTIMISE_MORE_r({ \ - PerlIO_printf(Perl_debug_log,"RExC_seen: "); \ + Perl_re_printf( aTHX_ "RExC_seen: "); \ \ if (RExC_seen & REG_ZERO_LEN_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \ + Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \ \ if (RExC_seen & REG_LOOKBEHIND_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \ + Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \ \ if (RExC_seen & REG_GPOS_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \ + Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \ \ if (RExC_seen & REG_RECURSE_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \ + Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \ \ - if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \ + if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \ + Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \ \ if (RExC_seen & REG_VERBARG_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \ + Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \ \ if (RExC_seen & REG_CUTGROUP_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \ + Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \ \ if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \ + Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \ \ if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \ - \ - if (RExC_seen & REG_GOSTART_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \ + Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \ \ - if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ - PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \ + if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \ + Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \ \ - PerlIO_printf(Perl_debug_log,"\n"); \ + Perl_re_printf( aTHX_ "\n"); \ }); #define DEBUG_SHOW_STUDY_FLAG(flags,flag) \ - if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag) + if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag) #define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \ if ( ( flags ) ) { \ - PerlIO_printf(Perl_debug_log, "%s", open_str); \ + Perl_re_printf( aTHX_ "%s", open_str); \ DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \ DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \ DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \ @@ -955,29 +1002,28 @@ static const scan_data_t zero_scan_data = DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \ DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \ DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \ - PerlIO_printf(Perl_debug_log, "%s", close_str); \ + Perl_re_printf( aTHX_ "%s", close_str); \ } #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ - PerlIO_printf(Perl_debug_log, \ - "%*s" str "Pos:%"IVdf"/%"IVdf \ + Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \ " Flags: 0x%"UVXf, \ - (int)(depth)*2, "", \ + depth, \ (IV)((data)->pos_min), \ (IV)((data)->pos_delta), \ (UV)((data)->flags) \ ); \ DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \ - PerlIO_printf(Perl_debug_log, \ + Perl_re_printf( aTHX_ \ " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ (IV)((data)->whilem_c), \ (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ is_inf ? "INF " : "" \ ); \ if ((data)->last_found) \ - PerlIO_printf(Perl_debug_log, \ + Perl_re_printf( aTHX_ \ "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ SvPVX_const((data)->last_found), \ @@ -994,9 +1040,10 @@ DEBUG_OPTIMISE_MORE_r(if(data){ \ (IV)((data)->offset_float_min), \ (IV)((data)->offset_float_max) \ ); \ - PerlIO_printf(Perl_debug_log,"\n"); \ + Perl_re_printf( aTHX_ "\n"); \ }); + /* ========================================================= * BEGIN edit_distance stuff. * @@ -1236,8 +1283,8 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); - ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ - _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + /* mortalize so won't leak */ + ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ } @@ -1356,7 +1403,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, * returned list must, and will, contain every code point that is a * possibility. */ - SV* invlist = sv_2mortal(_new_invlist(0)); + SV* invlist = NULL; SV* only_utf8_locale_invlist = NULL; unsigned int i; const U32 n = ARG(node); @@ -1378,6 +1425,7 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Here, no compile-time swash, and there are things that won't be * known until runtime -- we have to assume it could be anything */ + invlist = sv_2mortal(_new_invlist(1)); return _add_range_to_invlist(invlist, 0, UV_MAX); } else if (ary[3] && ary[3] != &PL_sv_undef) { @@ -1395,6 +1443,10 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, } } + if (! invlist) { + invlist = sv_2mortal(_new_invlist(0)); + } + /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS * code points, and an inversion list for the others, but if there are code * points that should match only conditionally on the target string being @@ -1414,7 +1466,12 @@ S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state, /* Add in the points from the bit map */ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { if (ANYOF_BITMAP_TEST(node, i)) { - invlist = add_cp_to_invlist(invlist, i); + unsigned int start = i++; + + for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); new_node_has_latin1 = TRUE; } } @@ -1937,14 +1994,13 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, PERL_ARGS_ASSERT_DUMP_TRIE; - PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ", - (int)depth * 2 + 2,"", - "Match","Base","Ofs" ); + Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ", + depth+1, "Match","Base","Ofs" ); for( state = 0 ; state < trie->uniquecharcount ; state++ ) { SV ** const tmp = av_fetch( revcharmap, state, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + Perl_re_printf( aTHX_ "%*s", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], @@ -1954,27 +2010,25 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, ); } } - PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------", - (int)depth * 2 + 2,""); + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State|-----------------------", depth+1); for( state = 0 ; state < trie->uniquecharcount ; state++ ) - PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------"); - PerlIO_printf( Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "%.*s", colwidth, "--------"); + Perl_re_printf( aTHX_ "\n"); for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", - (int)depth * 2 + 2,"", (UV)state); + Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state); if ( trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, " W%4X", - trie->states[ state ].wordnum ); + Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); } else { - PerlIO_printf( Perl_debug_log, "%6s", "" ); + Perl_re_printf( aTHX_ "%6s", "" ); } - PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base ); + Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base ); if ( base ) { U32 ofs = 0; @@ -1985,7 +2039,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, != state)) ofs++; - PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs); + Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { if ( ( base + ofs >= trie->uniquecharcount ) @@ -1994,28 +2048,27 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, && trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) { - PerlIO_printf( Perl_debug_log, "%*"UVXf, - colwidth, - (UV)trie->trans[ base + ofs - - trie->uniquecharcount ].next ); + Perl_re_printf( aTHX_ "%*"UVXf, colwidth, + (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next + ); } else { - PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." ); + Perl_re_printf( aTHX_ "%*s",colwidth," ." ); } } - PerlIO_printf( Perl_debug_log, "]"); + Perl_re_printf( aTHX_ "]"); } - PerlIO_printf( Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); } - PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", - (int)depth*2, ""); + Perl_re_indentf( aTHX_ "word_info N:(prev,len)=", + depth); for (word=1; word <= trie->wordcount; word++) { - PerlIO_printf(Perl_debug_log, " %d:(%d,%d)", + Perl_re_printf( aTHX_ " %d:(%d,%d)", (int)word, (int)(trie->wordinfo[word].prev), (int)(trie->wordinfo[word].len)); } - PerlIO_printf(Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); } /* Dumps a fully constructed but uncompressed trie in list form. @@ -2036,19 +2089,20 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST; /* print out the table precompression. */ - PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s", - (int)depth * 2 + 2,"", (int)depth * 2 + 2,"", - "------:-----+-----------------\n" ); + Perl_re_indentf( aTHX_ "State :Word | Transition Data\n", + depth+1 ); + Perl_re_indentf( aTHX_ "%s", + depth+1, "------:-----+-----------------\n" ); for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :", - (int)depth * 2 + 2,"", (UV)state ); + Perl_re_indentf( aTHX_ " %4"UVXf" :", + depth+1, (UV)state ); if ( ! trie->states[ state ].wordnum ) { - PerlIO_printf( Perl_debug_log, "%5s| ",""); + Perl_re_printf( aTHX_ "%5s| ",""); } else { - PerlIO_printf( Perl_debug_log, "W%4x| ", + Perl_re_printf( aTHX_ "W%4x| ", trie->states[ state ].wordnum ); } @@ -2056,7 +2110,7 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ", + Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, @@ -2068,11 +2122,11 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, (UV)TRIE_LIST_ITEM(state,charid).newstate ); if (!(charid % 10)) - PerlIO_printf(Perl_debug_log, "\n%*s| ", + Perl_re_printf( aTHX_ "\n%*s| ", (int)((depth * 2) + 14), ""); } } - PerlIO_printf( Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); } } @@ -2100,12 +2154,12 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, that they are identical. */ - PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" ); + Perl_re_indentf( aTHX_ "Char : ", depth+1 ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { SV ** const tmp = av_fetch( revcharmap, charid, 0); if ( tmp ) { - PerlIO_printf( Perl_debug_log, "%*s", + Perl_re_printf( aTHX_ "%*s", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, PL_colors[0], PL_colors[1], @@ -2116,32 +2170,33 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, } } - PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" ); + Perl_re_printf( aTHX_ "\n"); + Perl_re_indentf( aTHX_ "State+-", depth+1 ); for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) { - PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------"); + Perl_re_printf( aTHX_ "%.*s", colwidth,"--------"); } - PerlIO_printf( Perl_debug_log, "\n" ); + Perl_re_printf( aTHX_ "\n" ); for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ "%4"UVXf" : ", + depth+1, (UV)TRIE_NODENUM( state ) ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); if (v) - PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v ); + Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v ); else - PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." ); + Perl_re_printf( aTHX_ "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", + Perl_re_printf( aTHX_ " (%4"UVXf")\n", (UV)trie->trans[ state ].check ); } else { - PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", + Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } @@ -2453,9 +2508,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT); } DEBUG_TRIE_COMPILE_r({ - PerlIO_printf( Perl_debug_log, - "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ + "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n", + depth+1, REG_NODE_NUM(startbranch),REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth); }); @@ -2494,8 +2549,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - const U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); + const U8 *uc; + const U8 *e; int foldlen = 0; U32 wordlen = 0; /* required init */ STRLEN minchars = 0; @@ -2505,17 +2560,19 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - trie->minlen= STR_LEN(noper); - } else { - trie->minlen= 0; - continue; - } + if (noper_next < tail) + noper= noper_next; + } + + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + uc= (U8*)STRING(noper); + e= uc + STR_LEN(noper); + } else { + trie->minlen= 0; + continue; } + if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */ TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte regardless of encoding */ @@ -2658,9 +2715,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } } /* end first pass */ DEBUG_TRIE_COMPILE_r( - PerlIO_printf( Perl_debug_log, - "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ + "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n", + depth+1, ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count, (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount, (int)trie->minlen, (int)trie->maxlen ) @@ -2708,9 +2765,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, STRLEN transcount = 1; - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, - "%*sCompiling trie using list compiler\n", - (int)depth * 2 + 2, "")); + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n", + depth+1)); trie->states = (reg_trie_state *) PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2, @@ -2721,22 +2777,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ U16 charid = 0; /* sanity init */ U32 wordlen = 0; /* required init */ if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } + if (noper_next < tail) + noper= noper_next; } - if (OP(noper) != NOTHING) { + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + for ( ; uc < e ; uc += len ) { TRIE_READ_CHAR; @@ -2818,7 +2872,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, /* DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp) + Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp) ); */ @@ -2880,7 +2934,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, " base: %d\n",base); + Perl_re_printf( aTHX_ " base: %d\n",base); ); */ trie->states[ state ].trans.base=base; @@ -2923,9 +2977,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, we have to use TRIE_NODENUM() to convert. */ - DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log, - "%*sCompiling trie using table compiler\n", - (int)depth * 2 + 2, "")); + DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n", + depth+1)); trie->trans = (reg_trie_trans *) PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 ) @@ -2940,8 +2993,6 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, for ( cur = first ; cur < last ; cur = regnext( cur ) ) { regnode *noper = NEXTOPER( cur ); - const U8 *uc = (U8*)STRING( noper ); - const U8 *e = uc + STR_LEN( noper ); U32 state = 1; /* required init */ @@ -2952,14 +3003,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if (OP(noper) == NOTHING) { regnode *noper_next= regnext(noper); - if (noper_next != tail && OP(noper_next) == flags) { - noper = noper_next; - uc= (U8*)STRING(noper); - e= uc + STR_LEN(noper); - } + if (noper_next < tail) + noper= noper_next; } - if ( OP(noper) != NOTHING ) { + if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + const U8 *uc= (U8*)STRING(noper); + const U8 *e= uc + STR_LEN(noper); + for ( ; uc < e ; uc += len ) { TRIE_READ_CHAR; @@ -3117,9 +3168,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf( Perl_debug_log, - "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", - (int)depth * 2 + 2,"", + Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + depth+1, (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), (IV)next_alloc, @@ -3130,9 +3180,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - PerlIO_printf(Perl_debug_log, - "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n", + depth+1, (UV)trie->statecount, (UV)trie->lasttrans) ); @@ -3182,9 +3231,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, }); } DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, - "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n", + depth+1, (UV)mjd_offset, (UV)mjd_nodelen) ); #endif @@ -3214,9 +3262,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( count == 2 ) { Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, - "%*sNew Start State=%"UVuf" Class: [", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [", + depth+1, (UV)state)); if (idx >= 0) { SV ** const tmp = av_fetch( revcharmap, idx, 0); @@ -3226,14 +3273,14 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( folder ) TRIE_BITMAP_SET(trie, folder[ *ch ]); DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, "%s", (char*)ch) + Perl_re_printf( aTHX_ "%s", (char*)ch) ); } } TRIE_BITMAP_SET(trie,*ch); if ( folder ) TRIE_BITMAP_SET(trie,folder[ *ch ]); - DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch)); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); } idx = ofs; } @@ -3244,9 +3291,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); - PerlIO_printf( Perl_debug_log, - "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", - (int)depth * 2 + 2, "", + Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + depth+1, (UV)state, (UV)idx, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], @@ -3266,7 +3312,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } else { #ifdef DEBUGGING if (state>1) - DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n")); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n")); #endif break; } @@ -3538,14 +3584,13 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour */ fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ - PerlIO_printf(Perl_debug_log, - "%*sStclass Failtable (%"UVuf" states): 0", - (int)(depth * 2), "", (UV)numstates + Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0", + depth, (UV)numstates ); for( q_read=1; q_read%3d: %s (%d)", \ - (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\ - Next ? (REG_NODE_NUM(Next)) : 0 ); \ +#define DEBUG_PEEP(str,scan,depth) \ + DEBUG_OPTIMISE_r({if (scan){ \ + regnode *Next = regnext(scan); \ + regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);\ + Perl_re_indentf( aTHX_ "" str ">%3d: %s (%d)", \ + depth, REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\ + Next ? (REG_NODE_NUM(Next)) : 0 );\ DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\ - PerlIO_printf(Perl_debug_log, "\n"); \ + Perl_re_printf( aTHX_ "\n"); \ }}); /* The below joins as many adjacent EXACTish nodes as possible into a single @@ -4061,6 +4106,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_STUDY_CHUNK; + RExC_study_started= 1; if ( depth == 0 ) { @@ -4075,9 +4121,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, ); DEBUG_OPTIMISE_MORE_r( { - PerlIO_printf(Perl_debug_log, - "%*sstudy_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", - (int)(depth*2), "", (long)stopparen, + Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p", + depth, (long)stopparen, (unsigned long)RExC_study_chunk_recursed_count, (unsigned long)depth, (unsigned long)recursed_depth, scan, @@ -4096,16 +4141,16 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, (( j - 1 ) * RExC_study_chunk_recursed_bytes), i) ) ) { - PerlIO_printf(Perl_debug_log," %d",(int)i); + Perl_re_printf( aTHX_ " %d",(int)i); break; } } if ( j + 1 < recursed_depth ) { - PerlIO_printf(Perl_debug_log, ","); + Perl_re_printf( aTHX_ ","); } } } - PerlIO_printf(Perl_debug_log,"\n"); + Perl_re_printf( aTHX_ "\n"); } ); while ( scan && OP(scan) != END && scan < last ){ @@ -4118,9 +4163,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_PEEP("Peep", scan, depth); - /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/ - * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled - * by a different invocation of reg() -- Yves + /* The reason we do this here is that we need to deal with things like + * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT + * parsing code, as each (?:..) is handled by a different invocation of + * reg() -- Yves */ JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0); @@ -4381,9 +4427,10 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log, "%*s%s%s\n", - (int)depth * 2 + 2, "", - "Looking for TRIE'able sequences. Tail node is: ", + Perl_re_indentf( aTHX_ "%s %"UVuf":%s\n", + depth+1, + "Looking for TRIE'able sequences. Tail node is ", + (UV)(tail - RExC_emit_start), SvPV_nolen_const( RExC_mysv ) ); }); @@ -4466,25 +4513,26 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, U8 noper_trietype = TRIE_TYPE( noper_type ); #if defined(DEBUGGING) || defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0; - U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0; + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; + U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0; #endif DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log, "%*s- %s (%d)", - (int)depth * 2 + 2,"", SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); + Perl_re_indentf( aTHX_ "- %d:%s (%d)", + depth+1, + REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) ); regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log, " -> %s", - SvPV_nolen_const(RExC_mysv)); + Perl_re_printf( aTHX_ " -> %d:%s", + REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv)); if ( noper_next ) { regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log,"\t=> %s\t", - SvPV_nolen_const(RExC_mysv)); + Perl_re_printf( aTHX_ "\t=> %d:%s\t", + REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv)); } - PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n", + Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n", REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype] ); @@ -4495,12 +4543,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_trietype && ( - ( noper_trietype == NOTHING) + ( noper_trietype == NOTHING ) || ( trietype == NOTHING ) || ( trietype == noper_trietype ) ) #ifdef NOJUMPTRIE - && noper_next == tail + && noper_next >= tail #endif && count < U16_MAX) { @@ -4513,7 +4561,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if ( noper_trietype == NOTHING ) { #if !defined(DEBUGGING) && !defined(NOJUMPTRIE) regnode * const noper_next = regnext( noper ); - U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0; + U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0; U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0; #endif @@ -4560,7 +4608,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } if ( noper_trietype #ifdef NOJUMPTRIE - && noper_next == tail + && noper_next >= tail #endif ){ /* noper is triable, so we can start a new @@ -4580,10 +4628,12 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } /* loop over branches */ DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", - (int)depth * 2 + 2, - "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); + Perl_re_indentf( aTHX_ "- %s (%d) ", + depth+1, SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); + Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n", + REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur), + PL_reg_name[trietype] + ); }); if ( last && trietype ) { @@ -4601,7 +4651,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, depth==0 ) { flags |= SCF_TRIE_RESTUDY; if ( startbranch == first - && scan == tail ) + && scan >= tail ) { RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN; } @@ -4620,9 +4670,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, * turn it into a plain NOTHING op. */ DEBUG_TRIE_COMPILE_r({ regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state); - PerlIO_printf( Perl_debug_log, - "%*s- %s (%d) \n", (int)depth * 2 + 2, - "", SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); + Perl_re_indentf( aTHX_ "- %s (%d) \n", + depth+1, + SvPV_nolen_const( RExC_mysv ),REG_NODE_NUM(cur)); }); OP(startbranch)= NOTHING; @@ -4642,29 +4692,24 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } else /* single branch is optimized. */ scan = NEXTOPER(scan); continue; - } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) { + } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) { I32 paren = 0; regnode *start = NULL; regnode *end = NULL; U32 my_recursed_depth= recursed_depth; - - if (OP(scan) != SUSPEND) { /* GOSUB/GOSTART */ + if (OP(scan) != SUSPEND) { /* GOSUB */ /* Do setup, note this code has side effects beyond * the rest of this block. Specifically setting * RExC_recurse[] must happen at least once during * study_chunk(). */ - if (OP(scan) == GOSUB) { - paren = ARG(scan); - RExC_recurse[ARG2L(scan)] = scan; - start = RExC_open_parens[paren-1]; - end = RExC_close_parens[paren-1]; - } else { - start = RExC_rxi->program + 1; - end = RExC_opend; - } + paren = ARG(scan); + RExC_recurse[ARG2L(scan)] = scan; + start = RExC_open_parens[paren]; + end = RExC_close_parens[paren]; + /* NOTE we MUST always execute the above code, even - * if we do nothing with a GOSUB/GOSTART */ + * if we do nothing with a GOSUB */ if ( ( flags & SCF_IN_DEFINE ) || @@ -4713,11 +4758,11 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, RExC_study_chunk_recursed_bytes, U8); } /* we havent recursed into this paren yet, so recurse into it */ - DEBUG_STUDYDATA("set:", data,depth); + DEBUG_STUDYDATA("gosub-set:", data,depth); PAREN_SET(RExC_study_chunk_recursed + (recursed_depth * RExC_study_chunk_recursed_bytes), paren); my_recursed_depth= recursed_depth + 1; } else { - DEBUG_STUDYDATA("inf:", data,depth); + DEBUG_STUDYDATA("gosub-inf:", data,depth); /* some form of infinite recursion, assume infinite length * */ if (flags & SCF_DO_SUBSTR) { @@ -5060,8 +5105,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, if (OP(nxt) != CLOSE) goto nogo; if (RExC_open_parens) { - RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ - RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/ + RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)]=nxt+2; /*close->while*/ } /* Now we know that nxt2 is the only contents: */ oscan->flags = (U8)ARG(nxt); @@ -5107,8 +5152,8 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, oscan->flags = (U8)ARG(nxt); if (RExC_open_parens) { - RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/ - RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/ + RExC_open_parens[ARG(nxt1)]=oscan; /*open->CURLYM*/ + RExC_close_parens[ARG(nxt1)]=nxt2+1; /*close->NOTHING*/ } OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ @@ -5215,13 +5260,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf +Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf " SSize_t_MAX=%"UVuf" minnext=%"UVuf " maxcount=%"UVuf" mincount=%"UVuf"\n", (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, (UV)mincount); if (deltanext != SSize_t_MAX) -PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", +Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif @@ -5859,15 +5904,10 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", /* Else: zero-length, ignore. */ scan = regnext(scan); } - /* If we are exiting a recursion we can unset its recursed bit - * and allow ourselves to enter it again - no danger of an - * infinite loop there. - if (stopparen > -1 && recursed) { - DEBUG_STUDYDATA("unset:", data,depth); - PAREN_UNSET( recursed, stopparen); - } - */ + + finish: if (frame) { + /* we need to unwind recursion. */ depth = depth - 1; DEBUG_STUDYDATA("frame-end:",data,depth); @@ -5884,7 +5924,6 @@ PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n", goto fake_study_recurse; } - finish: assert(!frame); DEBUG_STUDYDATA("pre-fin:",data,depth); @@ -6035,7 +6074,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) /* Dispatch a request to compile a regexp to correct regexp engine. */ DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", + Perl_re_printf( aTHX_ "Using engine %"UVxf"\n", PTR2UV(eng)); }); return CALLREGCOMP_ENG(eng, pattern, flags); @@ -6080,7 +6119,7 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, bool do_end = 0; GET_RE_DEBUG_FLAGS_DECL; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); Newx(dst, *plen_p * 2 + 1, U8); @@ -6448,7 +6487,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, *p++ = 'x'; *p++ = '\0'; DEBUG_COMPILE_r({ - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "%sre-parsing pattern for runtime code:%s %s\n", PL_colors[4],PL_colors[5],newpat); }); @@ -6700,6 +6739,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Initialize these here instead of as-needed, as is quick and avoids * having to test them each time otherwise */ if (! PL_AboveLatin1) { +#ifdef DEBUGGING + char * dump_len_string; +#endif + PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist); PL_Latin1 = _new_invlist_C_array(Latin1_invlist); PL_UpperLatin1 = _new_invlist_C_array(UpperLatin1_invlist); @@ -6713,8 +6756,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, PL_InBitmap = _new_invlist(2); PL_InBitmap = _add_range_to_invlist(PL_InBitmap, 0, NUM_ANYOF_CODE_POINTS - 1); +#ifdef DEBUGGING + dump_len_string = PerlEnv_getenv("PERL_DUMP_RE_MAX_LEN"); + if ( ! dump_len_string + || ! grok_atoUV(dump_len_string, (UV *)&PL_dump_re_max_len, NULL)) + { + PL_dump_re_max_len = 0; + } +#endif } + pRExC_state->warn_text = NULL; pRExC_state->code_blocks = NULL; pRExC_state->num_code_blocks = 0; @@ -6771,7 +6823,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Assembling pattern from %d elements%s\n", pat_count, orig_rx_flags & RXf_SPLIT ? " for split" : "")); @@ -6800,7 +6852,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, *is_bare_re = TRUE; SvREFCNT_inc(re); Safefree(pRExC_state->code_blocks); - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Precompiled pattern%s\n", orig_rx_flags & RXf_SPLIT ? " for split" : "")); @@ -6831,6 +6883,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_contains_locale = 0; RExC_contains_i = 0; RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); + RExC_study_started = 0; pRExC_state->runtime_code_qr = NULL; RExC_frame_head= NULL; RExC_frame_last= NULL; @@ -6843,7 +6896,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, dsv, exp, plen, 60); - PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", + Perl_re_printf( aTHX_ "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); @@ -6938,7 +6991,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_whilem_seen = 0; RExC_open_parens = NULL; RExC_close_parens = NULL; - RExC_opend = NULL; + RExC_end_op = NULL; RExC_paren_names = NULL; #ifdef DEBUGGING RExC_paren_name_list = NULL; @@ -6959,7 +7012,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, assert(*RExC_end == '\0'); DEBUG_PARSE_r( - PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"); + Perl_re_printf( aTHX_ "Starting first pass (sizing)\n"); RExC_lastnum=0; RExC_lastparse=NULL; ); @@ -6989,7 +7042,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->num_code_blocks); } else { - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Need to redo pass 1\n")); } @@ -7001,7 +7054,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "Required size %"IVdf" nodes\n" "Starting second pass (creation)\n", (IV)RExC_size); @@ -7136,28 +7189,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ - /* setup various meta data about recursion, this all requires - * RExC_npar to be correctly set, and a bit later on we clear it */ - if (RExC_seen & REG_RECURSE_SEEN) { - Newxz(RExC_open_parens, RExC_npar,regnode *); - SAVEFREEPV(RExC_open_parens); - Newxz(RExC_close_parens,RExC_npar,regnode *); - SAVEFREEPV(RExC_close_parens); - } - if (RExC_seen & (REG_RECURSE_SEEN | REG_GOSTART_SEEN)) { - /* Note, RExC_npar is 1 + the number of parens in a pattern. - * So its 1 if there are no parens. */ - RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + - ((RExC_npar & 0x07) != 0); - Newx(RExC_study_chunk_recursed, - RExC_study_chunk_recursed_bytes * RExC_npar, U8); - SAVEFREEPV(RExC_study_chunk_recursed); - } - /* Useful during FAIL. */ #ifdef RE_TRACK_PATTERN_OFFSETS Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, + DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ "%s %"UVuf" bytes for offset annotations.\n", ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); @@ -7173,17 +7208,51 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_parse = exp; RExC_end = exp + plen; RExC_naughty = 0; - RExC_npar = 1; RExC_emit_start = ri->program; RExC_emit = ri->program; RExC_emit_bound = ri->program + RExC_size + 1; pRExC_state->code_index = 0; *((char*) RExC_emit++) = (char) REG_MAGIC; + /* setup various meta data about recursion, this all requires + * RExC_npar to be correctly set, and a bit later on we clear it */ + if (RExC_seen & REG_RECURSE_SEEN) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting up open/close parens\n", + 22, "| |", (int)(0 * 2 + 1), "")); + + /* setup RExC_open_parens, which holds the address of each + * OPEN tag, and to make things simpler for the 0 index + * the start of the program - this is used later for offsets */ + Newxz(RExC_open_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_open_parens); + RExC_open_parens[0] = RExC_emit; + + /* setup RExC_close_parens, which holds the address of each + * CLOSE tag, and to make things simpler for the 0 index + * the end of the program - this is used later for offsets */ + Newxz(RExC_close_parens, RExC_npar,regnode *); + SAVEFREEPV(RExC_close_parens); + /* we dont know where end op starts yet, so we dont + * need to set RExC_close_parens[0] like we do RExC_open_parens[0] above */ + + /* Note, RExC_npar is 1 + the number of parens in a pattern. + * So its 1 if there are no parens. */ + RExC_study_chunk_recursed_bytes= (RExC_npar >> 3) + + ((RExC_npar & 0x07) != 0); + Newx(RExC_study_chunk_recursed, + RExC_study_chunk_recursed_bytes * RExC_npar, U8); + SAVEFREEPV(RExC_study_chunk_recursed); + } + RExC_npar = 1; if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); } + DEBUG_OPTIMISE_r( + Perl_re_printf( aTHX_ "Starting post parse optimization\n"); + ); + /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -7210,7 +7279,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, copyRExC_state = RExC_state; } else { U32 seen=RExC_seen; - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n")); + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "Restudying\n")); RExC_state = copyRExC_state; if (seen & REG_TOP_LEVEL_BRANCHES_SEEN) @@ -7344,12 +7413,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #ifdef TRIE_STUDY_OPT DEBUG_PARSE_r( if (!restudied) - PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %"IVdf"\n", (IV)(first - scan + 1)) ); #else DEBUG_PARSE_r( - PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %"IVdf"\n", (IV)(first - scan + 1)) ); #endif @@ -7478,7 +7547,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV *sv = sv_newmortal(); regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); data.start_class = NULL; @@ -7523,7 +7592,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, regnode_ssc ch_class; SSize_t last_close = 0; - DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n")); + DEBUG_PARSE_r(Perl_re_printf( aTHX_ "\nMulti Top Level\n")); scan = ri->program + 1; ssc_init(pRExC_state, &ch_class); @@ -7558,7 +7627,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */ DEBUG_COMPILE_r({ SV* sv = sv_newmortal(); regprop(r, sv, (regnode*)data.start_class, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "synthetic stclass \"%s\".\n", SvPVX_const(sv));}); data.start_class = NULL; @@ -7576,13 +7645,17 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", + Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen); }); r->minlenret = minlen; if (r->minlen < minlen) r->minlen = minlen; + if (RExC_seen & REG_RECURSE_SEEN ) { + r->intflags |= PREGf_RECURSE_SEEN; + Newxz(r->recurse_locinput, r->nparens + 1, char *); + } if (RExC_seen & REG_GPOS_SEEN) r->intflags |= PREGf_GPOS_SEEN; if (RExC_seen & REG_LOOKBEHIND_SEEN) @@ -7656,23 +7729,22 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, = (void*)SvREFCNT_inc(RExC_paren_name_list); } else #endif - ri->name_list_idx = 0; + ri->name_list_idx = 0; - if (RExC_recurse_count) { - for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { - const regnode *scan = RExC_recurse[RExC_recurse_count-1]; - ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); - } + while ( RExC_recurse_count > 0 ) { + const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; + ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan ); } + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ DEBUG_TEST_r({ - PerlIO_printf(Perl_debug_log,"study_chunk_recursed_count: %lu\n", + Perl_re_printf( aTHX_ "study_chunk_recursed_count: %lu\n", (unsigned long)RExC_study_chunk_recursed_count); }); DEBUG_DUMP_r({ DEBUG_RExC_seen(); - PerlIO_printf(Perl_debug_log,"Final program:\n"); + Perl_re_printf( aTHX_ "Final program:\n"); regdump(r); }); #ifdef RE_TRACK_PATTERN_OFFSETS @@ -7680,14 +7752,14 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, const STRLEN len = ri->u.offsets[0]; STRLEN i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) - PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", + Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ", (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); }); #endif @@ -8205,7 +8277,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \ int num; \ if (RExC_lastparse!=RExC_parse) { \ - PerlIO_printf(Perl_debug_log, "%s", \ + Perl_re_printf( aTHX_ "%s", \ Perl_pv_pretty(aTHX_ RExC_mysv1, RExC_parse, \ RExC_end - RExC_parse, 16, \ "", "", \ @@ -8217,17 +8289,17 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) ) \ ); \ } else \ - PerlIO_printf(Perl_debug_log,"%16s",""); \ + Perl_re_printf( aTHX_ "%16s",""); \ \ if (SIZE_ONLY) \ num = RExC_size + 1; \ else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ - PerlIO_printf(Perl_debug_log,"|%4d",num); \ + Perl_re_printf( aTHX_ "|%4d",num); \ else \ - PerlIO_printf(Perl_debug_log,"|%4s",""); \ - PerlIO_printf(Perl_debug_log,"|%*s%-4s", \ + Perl_re_printf( aTHX_ "|%4s",""); \ + Perl_re_printf( aTHX_ "|%*s%-4s", \ (int)((depth*2)), "", \ (funcname) \ ); \ @@ -8239,11 +8311,11 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) #define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \ DEBUG_PARSE_MSG((funcname)); \ - PerlIO_printf(Perl_debug_log,"%4s","\n"); \ + Perl_re_printf( aTHX_ "%4s","\n"); \ }) -#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \ +#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({\ DEBUG_PARSE_MSG((funcname)); \ - PerlIO_printf(Perl_debug_log,fmt "\n",args); \ + Perl_re_printf( aTHX_ fmt "\n",args); \ }) /* This section of code defines the inversion list object and its methods. The @@ -8252,33 +8324,47 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) * as an SVt_INVLIST scalar. * * An inversion list for Unicode is an array of code points, sorted by ordinal - * number. The zeroth element is the first code point in the list. The 1th - * element is the first element beyond that not in the list. In other words, - * the first range is - * invlist[0]..(invlist[1]-1) - * The other ranges follow. Thus every element whose index is divisible by two - * marks the beginning of a range that is in the list, and every element not - * divisible by two marks the beginning of a range not in the list. A single - * element inversion list that contains the single code point N generally - * consists of two elements - * invlist[0] == N - * invlist[1] == N+1 - * (The exception is when N is the highest representable value on the - * machine, in which case the list containing just it would be a single - * element, itself. By extension, if the last range in the list extends to - * infinity, then the first element of that range will be in the inversion list - * at a position that is divisible by two, and is the final element in the - * list.) + * number. Each element gives the code point that begins a range that extends + * up-to but not including the code point given by the next element. The final + * element gives the first code point of a range that extends to the platform's + * infinity. The even-numbered elements (invlist[0], invlist[2], invlist[4], + * ...) give ranges whose code points are all in the inversion list. We say + * that those ranges are in the set. The odd-numbered elements give ranges + * whose code points are not in the inversion list, and hence not in the set. + * Thus, element [0] is the first code point in the list. Element [1] + * is the first code point beyond that not in the list; and element [2] is the + * first code point beyond that that is in the list. In other words, the first + * range is invlist[0]..(invlist[1]-1), and all code points in that range are + * in the inversion list. The second range is invlist[1]..(invlist[2]-1), and + * all code points in that range are not in the inversion list. The third + * range invlist[2]..(invlist[3]-1) gives code points that are in the inversion + * list, and so forth. Thus every element whose index is divisible by two + * gives the beginning of a range that is in the list, and every element whose + * index is not divisible by two gives the beginning of a range not in the + * list. If the final element's index is divisible by two, the inversion list + * extends to the platform's infinity; otherwise the highest code point in the + * inversion list is the contents of that element minus 1. + * + * A range that contains just a single code point N will look like + * invlist[i] == N + * invlist[i+1] == N+1 + * + * If N is UV_MAX (the highest representable code point on the machine), N+1 is + * impossible to represent, so element [i+1] is omitted. The single element + * inversion list + * invlist[0] == UV_MAX + * contains just UV_MAX, but is interpreted as matching to infinity. + * * Taking the complement (inverting) an inversion list is quite simple, if the * first element is 0, remove it; otherwise add a 0 element at the beginning. * This implementation reserves an element at the beginning of each inversion * list to always contain 0; there is an additional flag in the header which * indicates if the list begins at the 0, or is offset to begin at the next - * element. + * element. This means that the inversion list can be inverted without any + * copying; just flip the flag. * * More about inversion lists can be found in "Unicode Demystified" * Chapter 13 by Richard Gillam, published by Addison-Wesley. - * More will be coming when functionality is added later. * * The inversion list data structure is currently implemented as an SV pointing * to an array of UVs that the SV thinks are bytes. This allows us to have an @@ -8335,6 +8421,52 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) #ifndef PERL_IN_XSUB_RE +STATIC void +S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) +{ + /* Replaces the inversion list in 'src' with the one in 'dest'. It steals + * the list from 'src', so 'src' is made to have a NULL list. This is + * similar to what SvSetMagicSV() would do, if it were implemented on + * inversion lists, though this routine avoids a copy */ + + const UV src_len = _invlist_len(src); + const bool src_offset = *get_invlist_offset_addr(src); + const STRLEN src_byte_len = SvLEN(src); + char * array = SvPVX(src); + + const int oldtainted = TAINT_get; + + PERL_ARGS_ASSERT_INVLIST_REPLACE_LIST_DESTROYS_SRC; + + assert(SvTYPE(src) == SVt_INVLIST); + assert(SvTYPE(dest) == SVt_INVLIST); + assert(! invlist_is_iterating(src)); + assert(SvCUR(src) == 0 || SvCUR(src) < SvLEN(src)); + + /* Make sure it ends in the right place with a NUL, as our inversion list + * manipulations aren't careful to keep this true, but sv_usepvn_flags() + * asserts it */ + array[src_byte_len - 1] = '\0'; + + TAINT_NOT; /* Otherwise it breaks */ + sv_usepvn_flags(dest, + (char *) array, + src_byte_len - 1, + + /* This flag is documented to cause a copy to be avoided */ + SV_HAS_TRAILING_NUL); + TAINT_set(oldtainted); + SvPV_set(src, 0); + SvLEN_set(src, 0); + SvCUR_set(src, 0); + + /* Finish up copying over the other fields in an inversion list */ + *get_invlist_offset_addr(dest) = src_offset; + invlist_set_len(dest, src_len, src_offset); + *get_invlist_previous_index_addr(dest) = 0; + invlist_iterfinish(dest); +} + PERL_STATIC_INLINE IV* S_get_invlist_previous_index_addr(SV* invlist) { @@ -8370,17 +8502,34 @@ S_invlist_set_previous_index(SV* const invlist, const IV index) } PERL_STATIC_INLINE void -S_invlist_trim(SV* const invlist) +S_invlist_trim(SV* invlist) { + /* Free the not currently-being-used space in an inversion list */ + + /* But don't free up the space needed for the 0 UV that is always at the + * beginning of the list, nor the trailing NUL */ + const UV min_size = TO_INTERNAL_SIZE(1) + 1; + PERL_ARGS_ASSERT_INVLIST_TRIM; assert(SvTYPE(invlist) == SVt_INVLIST); - /* Change the length of the inversion list to how many entries it currently - * has */ - SvPV_shrink_to_cur((SV *) invlist); + SvPV_renew(invlist, MAX(min_size, SvCUR(invlist) + 1)); +} + +PERL_STATIC_INLINE void +S_invlist_clear(pTHX_ SV* invlist) /* Empty the inversion list */ +{ + PERL_ARGS_ASSERT_INVLIST_CLEAR; + + assert(SvTYPE(invlist) == SVt_INVLIST); + + invlist_set_len(invlist, 0, 0); + invlist_trim(invlist); } +#endif /* ifndef PERL_IN_XSUB_RE */ + PERL_STATIC_INLINE bool S_invlist_is_iterating(SV* const invlist) { @@ -8389,8 +8538,6 @@ S_invlist_is_iterating(SV* const invlist) return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } -#endif /* ifndef PERL_IN_XSUB_RE */ - PERL_STATIC_INLINE UV S_invlist_max(SV* const invlist) { @@ -8494,7 +8641,6 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list) return invlist; } -#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -8538,7 +8684,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, UV final_element = len - 1; array = invlist_array(invlist); - if (array[final_element] > start + if ( array[final_element] > start || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", @@ -8546,10 +8692,10 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } - /* Here, it is a legal append. If the new range begins with the first - * value not in the set, it is extending the set, so the new first - * value not in the set is one greater than the newly extended range. - * */ + /* Here, it is a legal append. If the new range begins 1 above the end + * of the range below it, it is extending the range below it, so the + * new first value not in the set is one greater than the newly + * extended range. */ offset = *get_invlist_offset_addr(invlist); if (array[final_element] == start) { if (end != UV_MAX) { @@ -8557,7 +8703,8 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, } else { /* But if the end is the maximum representable on the machine, - * just let the range that this would extend to have no end */ + * assume that infinity was actually what was meant. Just let + * the range that this would extend to have no end */ invlist_set_len(invlist, len - 1, offset); } return; @@ -8595,9 +8742,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, } } -#ifndef PERL_IN_XSUB_RE - -IV +SSize_t Perl__invlist_search(SV* const invlist, const UV cp) { /* Searches the inversion list for the entry that contains the input code @@ -8791,23 +8936,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the union of two inversion lists and point to it. *output * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *output will be made correspondingly - * mortal. The first list, , may be NULL, in which case a copy of the - * second list is returned. If is TRUE, the union is taken - * of the complement (inversion) of instead of b itself. + * temporary (mortal); otherwise just its contents will be modified to be + * the union. The first list, , may be NULL, in which case a copy of + * the second list is returned. If is TRUE, the union is + * taken of the complement (inversion) of instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some * length there. The preface says to incorporate its examples into your * code at your own risk. * - * The algorithm is like a merge sort. - * - * XXX A potential performance improvement is to keep track as we go along - * if only one of the inputs contributes to the result, meaning the other - * is a subset of that one. In that case, we can skip the final copy and - * return the larger of the input lists, but then outside code might need - * to keep track of whether to free the input list or not */ + * The algorithm is like a merge sort. */ const UV* array_a; /* a's array */ const UV* array_b; @@ -8816,7 +8955,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, SV* u; /* the resulting union */ UV* array_u; - UV len_u; + UV len_u = 0; UV i_a = 0; /* current index into a's array */ UV i_b = 0; @@ -8824,65 +8963,113 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* running count, as explained in the algorithm source book; items are * stopped accumulating and are output when the count changes to/from 0. - * The count is incremented when we start a range that's in the set, and - * decremented when we start a range that's not in the set. So its range - * is 0 to 2. Only when the count is zero is something not in the set. - */ + * The count is incremented when we start a range that's in an input's set, + * and decremented when we start a range that's not in a set. So this + * variable can be 0, 1, or 2. When it is 0 neither input is in their set, + * and hence nothing goes into the union; 1, just one of the inputs is in + * its set (and its current range gets added to the union); and 2 when both + * inputs are in their sets. */ UV count = 0; PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); - /* If either one is empty, the union is the other one */ - if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { - bool make_temp = FALSE; /* Should we mortalize the result? */ + len_b = _invlist_len(b); + if (len_b == 0) { - if (*output == a) { - if (a != NULL) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* Here, 'b' is empty. If the output is the complement of 'b', the + * union is all possible code points, and we need not even look at 'a'. + * It's easiest to create a new inversion list that matches everything. + * */ + if (complement_b) { + SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); + + /* If the output didn't exist, just point it at the new list */ + if (*output == NULL) { + *output = everything; + return; } - } - if (*output != b) { - *output = invlist_clone(b); - if (complement_b) { - _invlist_invert(*output); + + /* Otherwise, replace its contents with the new list */ + invlist_replace_list_destroys_src(*output, everything); + SvREFCNT_dec_NN(everything); + return; + } + + /* Here, we don't want the complement of 'b', and since it is empty, + * the union will come entirely from 'a'. If 'a' is NULL or empty, the + * output will be empty */ + + if (a == NULL) { + *output = _new_invlist(0); + return; + } + + if (_invlist_len(a) == 0) { + invlist_clear(*output); + return; + } + + /* Here, 'a' is not empty, and entirely determines the union. If the + * output is not to overwrite 'b', we can just return 'a'. */ + if (*output != b) { + + /* If the output is to overwrite 'a', we have a no-op, as it's + * already in 'a' */ + if (*output == a) { + return; } - } /* else *output already = b; */ - if (make_temp) { - sv_2mortal(*output); + /* But otherwise we have to copy 'a' to the output */ + *output = invlist_clone(a); + return; } - return; + + /* Here, 'b' is to be overwritten by the output, which will be 'a' */ + u = invlist_clone(a); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + + return; } - else if ((len_b = _invlist_len(b)) == 0) { - bool make_temp = FALSE; - if (*output == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); + + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { + + /* Here, 'a' is empty (and b is not). That means the union will come + * entirely from 'b'. If the output is not to overwrite 'a', we can + * just return what's in 'b'. */ + if (*output != a) { + + /* If the output is to overwrite 'b', it's already in 'b', but + * otherwise we have to copy 'b' to the output */ + if (*output != b) { + *output = invlist_clone(b); } - } - /* The complement of an empty list is a list that has everything in it, - * so the union with includes everything too */ - if (complement_b) { - if (a == *output) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } + /* And if the output is to be the inversion of 'b', do that */ + if (complement_b) { + _invlist_invert(*output); } - *output = _new_invlist(1); - _append_range_to_invlist(*output, 0, UV_MAX); + + return; } - else if (*output != a) { - *output = invlist_clone(a); + + /* Here, 'a', which is empty or even NULL, is to be overwritten by the + * output, which will either be 'b' or the complement of 'b' */ + + if (a == NULL) { + *output = invlist_clone(b); } - /* else *output already = a; */ + else { + u = invlist_clone(b); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); + } - if (make_temp) { - sv_2mortal(*output); + if (complement_b) { + _invlist_invert(*output); } + return; } @@ -8914,10 +9101,10 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, u = _new_invlist(len_a + len_b); /* Will contain U+0000 if either component does */ - array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0) - || (len_b > 0 && array_b[0] == 0)); + array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) + || (len_b > 0 && array_b[0] == 0)); - /* Go through each list item by item, stopping when exhausted one of + /* Go through each input list item by item, stopping when exhausted one of * them */ while (i_a < len_a && i_b < len_b) { UV cp; /* The element to potentially add to the union's array */ @@ -8925,21 +9112,21 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* We need to take one or the other of the two inputs for the union. * Since we are merging two sorted lists, we take the smaller of the - * next items. In case of a tie, we take the one that is in its set - * first. If we took one not in the set first, it would decrement the - * count, possibly to 0 which would cause it to be output as ending the - * range, and the next time through we would take the same number, and - * output it again as beginning the next range. By doing it the - * opposite way, there is no possibility that the count will be - * momentarily decremented to 0, and thus the two adjoining ranges will - * be seamlessly merged. (In a tie and both are in the set or both not - * in the set, it doesn't matter which we take first.) */ - if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] + * next items. In case of a tie, we take first the one that is in its + * set. If we first took the one not in its set, it would decrement + * the count, possibly to 0 which would cause it to be output as ending + * the range, and the next time through we would take the same number, + * and output it again as beginning the next range. By doing it the + * opposite way, there is no possibility that the count will be + * momentarily decremented to 0, and thus the two adjoining ranges will + * be seamlessly merged. (In a tie and both are in the set or both not + * in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] && ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp= array_a[i_a++]; + cp = array_a[i_a++]; } else { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); @@ -8948,7 +9135,7 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Here, have chosen which of the two inputs to look at. Only output * if the running count changes to/from 0, which marks the - * beginning/end of a range in that's in the set */ + * beginning/end of a range that's in the set */ if (cp_in_set) { if (count == 0) { array_u[i_u++] = cp; @@ -8963,80 +9150,90 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* Here, we are finished going through at least one of the lists, which - * means there is something remaining in at most one. We check if the list - * that hasn't been exhausted is positioned such that we are in the middle - * of a range in its set or not. (i_a and i_b point to the element beyond - * the one we care about.) If in the set, we decrement 'count'; if 0, there - * is potentially more to output. - * There are four cases: - * 1) Both weren't in their sets, count is 0, and remains 0. What's left - * in the union is entirely from the non-exhausted set. - * 2) Both were in their sets, count is 2. Nothing further should - * be output, as everything that remains will be in the exhausted - * list's set, hence in the union; decrementing to 1 but not 0 insures - * that - * 3) the exhausted was in its set, non-exhausted isn't, count is 1. - * Nothing further should be output because the union includes - * everything from the exhausted set. Not decrementing ensures that. - * 4) the exhausted wasn't in its set, non-exhausted is, count is 1; - * decrementing to 0 insures that we look at the remainder of the - * non-exhausted set */ - if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We decrement 'count', as explained below, if + * that list is in its set. (i_a and i_b each currently index the element + * beyond the one we care about.) */ + if ( (i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count--; } - /* The final length is what we've output so far, plus what else is about to - * be output. (If 'count' is non-zero, then the input list we exhausted - * has everything remaining up to the machine's limit in its set, and hence - * in the union, so there will be no further output. */ - len_u = i_u; - if (count == 0) { - /* At most one of the subexpressions will be non-zero */ - len_u += (len_a - i_a) + (len_b - i_b); + /* Above we decremented 'count' if the list that had unexamined elements in + * it was in its set. This has made it so that 'count' being non-zero + * means there isn't anything left to output; and 'count' equal to 0 means + * that what is left to output is precisely that which is left in the + * non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to add to the union. If it was in its set at its end, that means + * the set extends from here to the platform's infinity, and hence so does + * the union and the non-exhausted set is irrelevant. The exhausted set + * also contributed 1 to 'count'. If 'count' was 2, it got decremented to + * 1, but if it was 1, the non-exhausted set wasn't in its set, and so + * 'count' remains at 1. This is consistent with the decremented 'count' + * != 0 meaning there's nothing left to add to the union. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the rest of the union will be whatever the other input is. + * If 'count' was 0, neither list was in its set, and 'count' remains 0; + * otherwise it gets decremented to 0. This is consistent with 'count' + * == 0 meaning the remainder of the union is whatever is left in the + * non-exhausted list. */ + if (count != 0) { + len_u = i_u; + } + else { + IV copy_count = len_a - i_a; + if (copy_count > 0) { /* The non-exhausted input is 'a' */ + Copy(array_a + i_a, array_u + i_u, copy_count, UV); + } + else { /* The non-exhausted input is b */ + copy_count = len_b - i_b; + Copy(array_b + i_b, array_u + i_u, copy_count, UV); + } + len_u = i_u + copy_count; } - /* Set result to final length, which can change the pointer to array_u, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_u, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_u != _invlist_len(u)) { invlist_set_len(u, len_u, *get_invlist_offset_addr(u)); invlist_trim(u); array_u = invlist_array(u); } - /* When 'count' is 0, the list that was exhausted (if one was shorter than - * the other) ended with everything above it not in its set. That means - * that the remaining part of the union is precisely the same as the - * non-exhausted list, so can just copy it unchanged. (If both list were - * exhausted at the same time, then the operations below will be both 0.) - */ - if (count == 0) { - IV copy_count; /* At most one will have a non-zero copy count */ - if ((copy_count = len_a - i_a) > 0) { - Copy(array_a + i_a, array_u + i_u, copy_count, UV); - } - else if ((copy_count = len_b - i_b) > 0) { - Copy(array_b + i_b, array_u + i_u, copy_count, UV); - } + /* If the output is not to overwrite either of the inputs, just return the + * calculated union */ + if (a != *output && b != *output) { + *output = u; } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the union's, and then free the union */ - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *output || b == *output) { assert(! invlist_is_iterating(*output)); - if ((SvTEMP(*output))) { - sv_2mortal(u); + + if (! SvTEMP(*output)) { + SvREFCNT_dec_NN(*output); + *output = u; } else { - SvREFCNT_dec_NN(*output); + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); } } - *output = u; - return; } @@ -9047,11 +9244,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Take the intersection of two inversion lists and point to it. *i * SHOULD BE DEFINED upon input, and if it points to one of the two lists, * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise *i will be made correspondingly mortal. - * The first list, , may be NULL, in which case an empty list is - * returned. If is TRUE, the result will be the - * intersection of and the complement (or inversion) of instead of - * directly. + * temporary (mortal); otherwise just its contents will be modified to be + * the intersection. The first list, , may be NULL, in which case an + * empty list is returned. If is TRUE, the result will be + * the intersection of and the complement (or inversion) of instead + * of directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -9069,18 +9266,18 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, SV* r; /* the resulting intersection */ UV* array_r; - UV len_r; + UV len_r = 0; UV i_a = 0; /* current index into a's array */ UV i_b = 0; UV i_r = 0; - /* running count, as explained in the algorithm source book; items are - * stopped accumulating and are output when the count changes to/from 2. - * The count is incremented when we start a range that's in the set, and - * decremented when we start a range that's not in the set. So its range - * is 0 to 2. Only when the count is 2 is something in the intersection. - */ + /* running count of how many of the two inputs are postitioned at ranges + * that are in their sets. As explained in the algorithm source book, + * items are stopped accumulating and are output when the count changes + * to/from 2. The count is incremented when we start a range that's in an + * input's set, and decremented when we start a range that's not in a set. + * Only when it is 2 are we in the intersection. */ UV count = 0; PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; @@ -9089,50 +9286,38 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) { - bool make_temp = FALSE; - if (len_a != 0 && complement_b) { - /* Here, 'a' is not empty, therefore from the above 'if', 'b' must - * be empty. Here, also we are using 'b's complement, which hence - * must be every possible code point. Thus the intersection is - * simply 'a'. */ - if (*i != a) { - if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } + /* Here, 'a' is not empty, therefore from the enclosing 'if', 'b' + * must be empty. Here, also we are using 'b's complement, which + * hence must be every possible code point. Thus the intersection + * is simply 'a'. */ - *i = invlist_clone(a); + if (*i == a) { /* No-op */ + return; } - /* else *i is already 'a' */ - if (make_temp) { - sv_2mortal(*i); + /* If not overwriting either input, just make a copy of 'a' */ + if (*i != b) { + *i = invlist_clone(a); + return; } + + /* Here we are overwriting 'b' with 'a's contents */ + r = invlist_clone(a); + invlist_replace_list_destroys_src(*i, r); + SvREFCNT_dec_NN(r); return; } /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The * intersection must be empty */ - if (*i == a) { - if (a != NULL) { - if (! (make_temp = cBOOL(SvTEMP(a)))) { - SvREFCNT_dec_NN(a); - } - } - } - else if (*i == b) { - if (! (make_temp = cBOOL(SvTEMP(b)))) { - SvREFCNT_dec_NN(b); - } - } - *i = _new_invlist(0); - if (make_temp) { - sv_2mortal(*i); + if (*i == NULL) { + *i = _new_invlist(0); + return; } + invlist_clear(*i); return; } @@ -9164,8 +9349,8 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, r= _new_invlist(len_a + len_b); /* Will contain U+0000 iff both components do */ - array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 - && len_b > 0 && array_b[0] == 0); + array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 + && len_b > 0 && array_b[0] == 0); /* Go through each list item by item, stopping when exhausted one of * them */ @@ -9176,21 +9361,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, /* We need to take one or the other of the two inputs for the * intersection. Since we are merging two sorted lists, we take the - * smaller of the next items. In case of a tie, we take the one that - * is not in its set first (a difference from the union algorithm). If - * we took one in the set first, it would increment the count, possibly - * to 2 which would cause it to be output as starting a range in the - * intersection, and the next time through we would take that same - * number, and output it again as ending the set. By doing it the - * opposite of this, there is no possibility that the count will be - * momentarily incremented to 2. (In a tie and both are in the set or - * both not in the set, it doesn't matter which we take first.) */ - if (array_a[i_a] < array_b[i_b] - || (array_a[i_a] == array_b[i_b] + * smaller of the next items. In case of a tie, we take first the one + * that is not in its set (a difference from the union algorithm). If + * we first took the one in its set, it would increment the count, + * possibly to 2 which would cause it to be output as starting a range + * in the intersection, and the next time through we would take that + * same number, and output it again as ending the set. By doing the + * opposite of this, there is no possibility that the count will be + * momentarily incremented to 2. (In a tie and both are in the set or + * both not in the set, it doesn't matter which we take first.) */ + if ( array_a[i_a] < array_b[i_b] + || ( array_a[i_a] == array_b[i_b] && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a))) { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a); - cp= array_a[i_a++]; + cp = array_a[i_a++]; } else { cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b); @@ -9212,40 +9397,60 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } count--; } + } - /* Here, we are finished going through at least one of the lists, which - * means there is something remaining in at most one. We check if the list - * that has been exhausted is positioned such that we are in the middle - * of a range in its set or not. (i_a and i_b point to elements 1 beyond - * the ones we care about.) There are four cases: - * 1) Both weren't in their sets, count is 0, and remains 0. There's - * nothing left in the intersection. - * 2) Both were in their sets, count is 2 and perhaps is incremented to - * above 2. What should be output is exactly that which is in the - * non-exhausted set, as everything it has is also in the intersection - * set, and everything it doesn't have can't be in the intersection - * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and - * gets incremented to 2. Like the previous case, the intersection is - * everything that remains in the non-exhausted set. - * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and - * remains 1. And the intersection has nothing more. */ - if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) - || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) + /* The loop above increments the index into exactly one of the input lists + * each iteration, and ends when either index gets to its list end. That + * means the other index is lower than its end, and so something is + * remaining in that one. We increment 'count', as explained below, if the + * exhausted list was in its set. (i_a and i_b each currently index the + * element beyond the one we care about.) */ + if ( (i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a)) + || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b))) { count++; } - /* The final length is what we've output so far plus what else is in the - * intersection. At most one of the subexpressions below will be non-zero - * */ - len_r = i_r; - if (count >= 2) { - len_r += (len_a - i_a) + (len_b - i_b); + /* Above we incremented 'count' if the exhausted list was in its set. This + * has made it so that 'count' being below 2 means there is nothing left to + * output; otheriwse what's left to add to the intersection is precisely + * that which is left in the non-exhausted input list. + * + * To see why, note first that the exhausted input obviously has nothing + * left to affect the intersection. If it was in its set at its end, that + * means the set extends from here to the platform's infinity, and hence + * anything in the non-exhausted's list will be in the intersection, and + * anything not in it won't be. Hence, the rest of the intersection is + * precisely what's in the non-exhausted list The exhausted set also + * contributed 1 to 'count', meaning 'count' was at least 1. Incrementing + * it means 'count' is now at least 2. This is consistent with the + * incremented 'count' being >= 2 means to add the non-exhausted list to + * the intersection. + * + * But if the exhausted input wasn't in its set, it contributed 0 to + * 'count', and the intersection can't include anything further; the + * non-exhausted set is irrelevant. 'count' was at most 1, and doesn't get + * incremented. This is consistent with 'count' being < 2 meaning nothing + * further to add to the intersection. */ + if (count < 2) { /* Nothing left to put in the intersection. */ + len_r = i_r; + } + else { /* copy the non-exhausted list, unchanged. */ + IV copy_count = len_a - i_a; + if (copy_count > 0) { /* a is the one with stuff left */ + Copy(array_a + i_a, array_r + i_r, copy_count, UV); + } + else { /* b is the one with stuff left */ + copy_count = len_b - i_b; + Copy(array_b + i_b, array_r + i_r, copy_count, UV); + } + len_r = i_r + copy_count; } - /* Set result to final length, which can change the pointer to array_r, so - * re-find it */ + /* Set the result to the final length, which can change the pointer to + * array_r, so re-find it. (Note that it is unlikely that this will + * change, as we are shrinking the space, not enlarging it) */ if (len_r != _invlist_len(r)) { invlist_set_len(r, len_r, *get_invlist_offset_addr(r)); invlist_trim(r); @@ -9263,100 +9468,328 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, } } - /* We may be removing a reference to one of the inputs. If so, the output - * is made mortal if the input was. (Mortal SVs shouldn't have their ref - * count decremented) */ - if (a == *i || b == *i) { + /* If the output is not to overwrite either of the inputs, just return the + * calculated intersection */ + if (a != *i && b != *i) { + *i = r; + } + else { + /* Here, the output is to be the same as one of the input scalars, + * hence replacing it. The simple thing to do is to free the input + * scalar, making it instead be the output one. But experience has + * shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. So for that case, replace just the input's interior with + * the output's, and then free the output. A short-cut in this case + * is if the output is empty, we can just set the input to be empty */ + assert(! invlist_is_iterating(*i)); - if (SvTEMP(*i)) { - sv_2mortal(r); + + if (! SvTEMP(*i)) { + SvREFCNT_dec_NN(*i); + *i = r; } else { - SvREFCNT_dec_NN(*i); + if (len_r) { + invlist_replace_list_destroys_src(*i, r); + } + else { + invlist_clear(*i); + } + SvREFCNT_dec_NN(r); } } - *i = r; - return; } SV* -Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end) +Perl__add_range_to_invlist(pTHX_ SV* invlist, UV start, UV end) { /* Add the range from 'start' to 'end' inclusive to the inversion list's * set. A pointer to the inversion list is returned. This may actually be * a new list, in which case the passed in one has been destroyed. The * passed-in inversion list can be NULL, in which case a new one is created - * with just the one range in it */ - - SV* range_invlist; - UV len; - + * with just the one range in it. The new list is not necessarily + * NUL-terminated. Space is not freed if the inversion list shrinks as a + * result of this function. The gain would not be large, and in many + * cases, this is called multiple times on a single inversion list, so + * anything freed may almost immediately be needed again. + * + * This used to mostly call the 'union' routine, but that is much more + * heavyweight than really needed for a single range addition */ + + UV* array; /* The array implementing the inversion list */ + UV len; /* How many elements in 'array' */ + SSize_t i_s; /* index into the invlist array where 'start' + should go */ + SSize_t i_e = 0; /* And the index where 'end' should go */ + UV cur_highest; /* The highest code point in the inversion list + upon entry to this function */ + + /* This range becomes the whole inversion list if none already existed */ if (invlist == NULL) { invlist = _new_invlist(2); - len = 0; - } - else { - len = _invlist_len(invlist); + _append_range_to_invlist(invlist, start, end); + return invlist; } - /* If comes after the final entry actually in the list, can just append it - * to the end, */ - if (len == 0 - || (! ELEMENT_RANGE_MATCHES_INVLIST(len - 1) - && start >= invlist_array(invlist)[len - 1])) - { - _append_range_to_invlist(invlist, start, end); - return invlist; + /* Likewise, if the inversion list is currently empty */ + len = _invlist_len(invlist); + if (len == 0) { + _append_range_to_invlist(invlist, start, end); + return invlist; } - /* Here, can't just append things, create and return a new inversion list - * which is the union of this range and the existing inversion list. (If - * the new range is well-behaved wrt to the old one, we could just insert - * it, doing a Move() down on the tail of the old one (potentially growing - * it first). But to determine that means we would have the extra - * (possibly throw-away) work of first finding where the new one goes and - * whether it disrupts (splits) an existing range, so it doesn't appear to - * me (khw) that it's worth it) */ - range_invlist = _new_invlist(2); - _append_range_to_invlist(range_invlist, start, end); + /* Starting here, we have to know the internals of the list */ + array = invlist_array(invlist); - _invlist_union(invlist, range_invlist, &invlist); + /* If the new range ends higher than the current highest ... */ + cur_highest = invlist_highest(invlist); + if (end > cur_highest) { - /* The temporary can be freed */ - SvREFCNT_dec_NN(range_invlist); + /* If the whole range is higher, we can just append it */ + if (start > cur_highest) { + _append_range_to_invlist(invlist, start, end); + return invlist; + } - return invlist; -} + /* Otherwise, add the portion that is higher ... */ + _append_range_to_invlist(invlist, cur_highest + 1, end); + + /* ... and continue on below to handle the rest. As a result of the + * above append, we know that the index of the end of the range is the + * final even numbered one of the array. Recall that the final element + * always starts a range that extends to infinity. If that range is in + * the set (meaning the set goes from here to infinity), it will be an + * even index, but if it isn't in the set, it's odd, and the final + * range in the set is one less, which is even. */ + if (end == UV_MAX) { + i_e = len; + } + else { + i_e = len - 2; + } + } -SV* -Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, - UV** other_elements_ptr) -{ - /* Create and return an inversion list whose contents are to be populated - * by the caller. The caller gives the number of elements (in 'size') and - * the very first element ('element0'). This function will set - * '*other_elements_ptr' to an array of UVs, where the remaining elements - * are to be placed. - * - * Obviously there is some trust involved that the caller will properly - * fill in the other elements of the array. - * - * (The first element needs to be passed in, as the underlying code does - * things differently depending on whether it is zero or non-zero) */ + /* We have dealt with appending, now see about prepending. If the new + * range starts lower than the current lowest ... */ + if (start < array[0]) { - SV* invlist = _new_invlist(size); - bool offset; + /* Adding something which has 0 in it is somewhat tricky, and uncommon. + * Let the union code handle it, rather than having to know the + * trickiness in two code places. */ + if (UNLIKELY(start == 0)) { + SV* range_invlist; - PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + range_invlist = _new_invlist(2); + _append_range_to_invlist(range_invlist, start, end); - _append_range_to_invlist(invlist, element0, element0); - offset = *get_invlist_offset_addr(invlist); + _invlist_union(invlist, range_invlist, &invlist); - invlist_set_len(invlist, size, offset); - *other_elements_ptr = invlist_array(invlist) + 1; - return invlist; + SvREFCNT_dec_NN(range_invlist); + + return invlist; + } + + /* If the whole new range comes before the first entry, and doesn't + * extend it, we have to insert it as an additional range */ + if (end < array[0] - 1) { + i_s = i_e = -1; + goto splice_in_new_range; + } + + /* Here the new range adjoins the existing first range, extending it + * downwards. */ + array[0] = start; + + /* And continue on below to handle the rest. We know that the index of + * the beginning of the range is the first one of the array */ + i_s = 0; + } + else { /* Not prepending any part of the new range to the existing list. + * Find where in the list it should go. This finds i_s, such that: + * invlist[i_s] <= start < array[i_s+1] + */ + i_s = _invlist_search(invlist, start); + } + + /* At this point, any extending before the beginning of the inversion list + * and/or after the end has been done. This has made it so that, in the + * code below, each endpoint of the new range is either in a range that is + * in the set, or is in a gap between two ranges that are. This means we + * don't have to worry about exceeding the array bounds. + * + * Find where in the list the new range ends (but we can skip this if we + * have already determined what it is, or if it will be the same as i_s, + * which we already have computed) */ + if (i_e == 0) { + i_e = (start == end) + ? i_s + : _invlist_search(invlist, end); + } + + /* Here generally invlist[i_e] <= end < array[i_e+1]. But if invlist[i_e] + * is a range that goes to infinity there is no element at invlist[i_e+1], + * so only the first relation holds. */ + + if ( ! ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { + + /* Here, the ranges on either side of the beginning of the new range + * are in the set, and this range starts in the gap between them. + * + * The new range extends the range above it downwards if the new range + * ends at or above that range's start */ + const bool extends_the_range_above = ( end == UV_MAX + || end + 1 >= array[i_s+1]); + + /* The new range extends the range below it upwards if it begins just + * after where that range ends */ + if (start == array[i_s]) { + + /* If the new range fills the entire gap between the other ranges, + * they will get merged together. Other ranges may also get + * merged, depending on how many of them the new range spans. In + * the general case, we do the merge later, just once, after we + * figure out how many to merge. But in the case where the new + * range exactly spans just this one gap (possibly extending into + * the one above), we do the merge here, and an early exit. This + * is done here to avoid having to special case later. */ + if (i_e - i_s <= 1) { + + /* If i_e - i_s == 1, it means that the new range terminates + * within the range above, and hence 'extends_the_range_above' + * must be true. (If the range above it extends to infinity, + * 'i_s+2' will be above the array's limit, but 'len-i_s-2' + * will be 0, so no harm done.) */ + if (extends_the_range_above) { + Move(array + i_s + 2, array + i_s, len - i_s - 2, UV); + invlist_set_len(invlist, + len - 2, + *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* Here, i_e must == i_s. We keep them in sync, as they apply + * to the same range, and below we are about to decrement i_s + * */ + i_e--; + } + + /* Here, the new range is adjacent to the one below. (It may also + * span beyond the range above, but that will get resolved later.) + * Extend the range below to include this one. */ + array[i_s] = (end == UV_MAX) ? UV_MAX : end + 1; + i_s--; + start = array[i_s]; + } + else if (extends_the_range_above) { + + /* Here the new range only extends the range above it, but not the + * one below. It merges with the one above. Again, we keep i_e + * and i_s in sync if they point to the same range */ + if (i_e == i_s) { + i_e++; + } + i_s++; + array[i_s] = start; + } + } + + /* Here, we've dealt with the new range start extending any adjoining + * existing ranges. + * + * If the new range extends to infinity, it is now the final one, + * regardless of what was there before */ + if (UNLIKELY(end == UV_MAX)) { + invlist_set_len(invlist, i_s + 1, *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* If i_e started as == i_s, it has also been dealt with, + * and been updated to the new i_s, which will fail the following if */ + if (! ELEMENT_RANGE_MATCHES_INVLIST(i_e)) { + + /* Here, the ranges on either side of the end of the new range are in + * the set, and this range ends in the gap between them. + * + * If this range is adjacent to (hence extends) the range above it, it + * becomes part of that range; likewise if it extends the range below, + * it becomes part of that range */ + if (end + 1 == array[i_e+1]) { + i_e++; + array[i_e] = start; + } + else if (start <= array[i_e]) { + array[i_e] = end + 1; + i_e--; + } + } + + if (i_s == i_e) { + + /* If the range fits entirely in an existing range (as possibly already + * extended above), it doesn't add anything new */ + if (ELEMENT_RANGE_MATCHES_INVLIST(i_s)) { + return invlist; + } + + /* Here, no part of the range is in the list. Must add it. It will + * occupy 2 more slots */ + splice_in_new_range: + + invlist_extend(invlist, len + 2); + array = invlist_array(invlist); + /* Move the rest of the array down two slots. Don't include any + * trailing NUL */ + Move(array + i_e + 1, array + i_e + 3, len - i_e - 1, UV); + + /* Do the actual splice */ + array[i_e+1] = start; + array[i_e+2] = end + 1; + invlist_set_len(invlist, len + 2, *(get_invlist_offset_addr(invlist))); + return invlist; + } + + /* Here the new range crossed the boundaries of a pre-existing range. The + * code above has adjusted things so that both ends are in ranges that are + * in the set. This means everything in between must also be in the set. + * Just squash things together */ + Move(array + i_e + 1, array + i_s + 1, len - i_e - 1, UV); + invlist_set_len(invlist, + len - i_e + i_s, + *(get_invlist_offset_addr(invlist))); + + return invlist; +} + +SV* +Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, + UV** other_elements_ptr) +{ + /* Create and return an inversion list whose contents are to be populated + * by the caller. The caller gives the number of elements (in 'size') and + * the very first element ('element0'). This function will set + * '*other_elements_ptr' to an array of UVs, where the remaining elements + * are to be placed. + * + * Obviously there is some trust involved that the caller will properly + * fill in the other elements of the array. + * + * (The first element needs to be passed in, as the underlying code does + * things differently depending on whether it is zero or non-zero) */ + + SV* invlist = _new_invlist(size); + bool offset; + + PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; + + invlist = add_cp_to_invlist(invlist, element0); + offset = *get_invlist_offset_addr(invlist); + + invlist_set_len(invlist, size, offset); + *other_elements_ptr = invlist_array(invlist) + 1; + return invlist; } #endif @@ -9513,38 +9946,56 @@ S_invlist_highest(SV* const invlist) : array[len - 1] - 1; } -#ifndef PERL_IN_XSUB_RE -SV * -Perl__invlist_contents(pTHX_ SV* const invlist) +STATIC SV * +S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) { /* Get the contents of an inversion list into a string SV so that they can - * be printed out. It uses the format traditionally done for debug tracing - */ + * be printed out. If 'traditional_style' is TRUE, it uses the format + * traditionally done for debug tracing; otherwise it uses a format + * suitable for just copying to the output, with blanks between ranges and + * a dash between range components */ UV start, end; - SV* output = newSVpvs("\n"); + SV* output; + const char intra_range_delimiter = (traditional_style ? '\t' : '-'); + const char inter_range_delimiter = (traditional_style ? '\n' : ' '); + + if (traditional_style) { + output = newSVpvs("\n"); + } + else { + output = newSVpvs(""); + } - PERL_ARGS_ASSERT__INVLIST_CONTENTS; + PERL_ARGS_ASSERT_INVLIST_CONTENTS; assert(! invlist_is_iterating(invlist)); invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c", + start, intra_range_delimiter, + inter_range_delimiter); } else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n", - start, end); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c", + start, + intra_range_delimiter, + end, inter_range_delimiter); } else { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start); + Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c", + start, inter_range_delimiter); } } + if (SvCUR(output) && ! traditional_style) {/* Get rid of trailing blank */ + SvCUR_set(output, SvCUR(output) - 1); + } + return output; } -#endif #ifndef PERL_IN_XSUB_RE void @@ -9800,7 +10251,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_tindex(list); k++) { + for (k = 0; k <= av_tindex_nomg(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; assert(c_p); @@ -10018,8 +10469,9 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) if (RExC_flags & RXf_PMf_FOLD) { RExC_contains_i = 1; } - if (PASS2) { - STD_PMMOD_FLAGS_PARSE_X_WARN(x_mod_count); + + if (UNLIKELY((x_mod_count) > 1)) { + vFAIL("Only one /x regex modifier is allowed"); } return; /*NOTREACHED*/ @@ -10154,7 +10606,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) * indivisible */ bool has_intervening_patws = paren == 2 && *(RExC_parse - 1) != '('; - assert(RExC_parse < RExC_end); + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ("); + } if ( *RExC_parse == '*') { /* (*VERB:ARG) */ char *start_verb = RExC_parse + 1; @@ -10402,7 +10856,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; - assert(RExC_parse < RExC_end); + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?... not terminated"); + } + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; @@ -10433,13 +10890,12 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case '0' : /* (?0) */ case 'R' : /* (?R) */ - if (*RExC_parse != ')') + if (RExC_parse == RExC_end || *RExC_parse != ')') FAIL("Sequence (?R) not terminated"); - ret = reg_node(pRExC_state, GOSTART); - RExC_seen |= REG_GOSTART_SEEN; + num = 0; + RExC_seen |= REG_RECURSE_SEEN; *flagp |= POSTPONED; - nextchar(pRExC_state); - return ret; + goto gen_recurse_regop; /*notreached*/ /* named and numeric backreferences */ case '&': /* (?&NAME) */ @@ -10515,6 +10971,14 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if ( paren == '+' ) { num = RExC_npar + num - 1; } + /* We keep track how many GOSUB items we have produced. + To start off the ARG2L() of the GOSUB holds its "id", + which is used later in conjunction with RExC_recurse + to calculate the offset we need to jump for the GOSUB, + which it will store in the final representation. + We have to defer the actual calculation until much later + as the regop may move. + */ ret = reg2Lanode(pRExC_state, GOSUB, num, RExC_recurse_count); if (!SIZE_ONLY) { @@ -10523,16 +10987,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Reference to nonexistent group"); } RExC_recurse_count++; - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Recurse #%"UVuf" to %"IVdf"\n", 22, "| |", (int)(depth * 2 + 1), "", (UV)ARG(ret), (IV)ARG2L(ret))); } RExC_seen |= REG_RECURSE_SEEN; + Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ *flagp |= POSTPONED; + assert(*RExC_parse == ')'); nextchar(pRExC_state); return ret; @@ -10675,13 +11141,21 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (RExC_parse[0] == 'R') { RExC_parse++; + /* parno == 0 => /(?(R)YES|NO)/ "in any form of recursion OR eval" + * parno == 1 => /(?(R0)YES|NO)/ "in GOSUB (?0) / (?R)" + * parno == 2 => /(?(R1)YES|NO)/ "in GOSUB (?1) (parno-1)" + */ parno = 0; - if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + if (RExC_parse[0] == '0') { + parno = 1; + RExC_parse++; + } + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { UV uv; if (grok_atoUV(RExC_parse, &uv, &endptr) && uv <= I32_MAX ) { - parno = (I32)uv; + parno = (I32)uv + 1; RExC_parse = (char*)endptr; } /* else "Switch condition not recognized" below */ @@ -10692,7 +11166,19 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA); - parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0; + + /* we should only have a false sv_dat when + * SIZE_ONLY is true, and we always have false + * sv_dat when SIZE_ONLY is true. + * reg_scan_name() will VFAIL() if the name is + * unknown when SIZE_ONLY is false, and otherwise + * will return something, and when SIZE_ONLY is + * true, reg_scan_name() just parses the string, + * and doesnt return anything. (in theory) */ + assert(SIZE_ONLY ? !sv_dat : !!sv_dat); + + if (sv_dat) + parno = 1 + *((I32 *)SvPVX(sv_dat)); } ret = reganode(pRExC_state,INSUBP,parno); goto insert_if_check_paren; @@ -10812,14 +11298,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_RECURSE_SEEN - && !RExC_open_parens[parno-1]) + if (RExC_open_parens && !RExC_open_parens[parno]) { - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Setting open paren #%"IVdf" to %d\n", 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ret))); - RExC_open_parens[parno-1]= ret; + RExC_open_parens[parno]= ret; } } Set_Node_Length(ret, 1); /* MJD */ @@ -10908,11 +11393,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) break; case 1: case 2: ender = reganode(pRExC_state, CLOSE, parno); - if (!SIZE_ONLY && RExC_seen & REG_RECURSE_SEEN) { - DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, + if ( RExC_close_parens ) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ "%*s%*s Setting close paren #%"IVdf" to %d\n", 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender))); - RExC_close_parens[parno-1]= ender; + RExC_close_parens[parno]= ender; if (RExC_nestroot == parno) RExC_nestroot = 0; } @@ -10931,8 +11416,15 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case 0: ender = reg_node(pRExC_state, END); if (!SIZE_ONLY) { - assert(!RExC_opend); /* there can only be one! */ - RExC_opend = ender; + assert(!RExC_end_op); /* there can only be one! */ + RExC_end_op = ender; + if (RExC_close_parens) { + DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ + "%*s%*s Setting close paren #0 (END) to %d\n", + 22, "| |", (int)(depth * 2 + 1), "", REG_NODE_NUM(ender))); + + RExC_close_parens[0]= ender; + } } break; } @@ -10940,7 +11432,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) DEBUG_PARSE_MSG("lsbr"); regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(lastbr), SvPV_nolen_const(RExC_mysv2), @@ -10979,7 +11471,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) DEBUG_PARSE_MSG("NADA"); regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(ret), SvPV_nolen_const(RExC_mysv2), @@ -11103,7 +11595,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); } else if (ret == NULL) - ret = latest; + ret = latest; *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; @@ -11713,39 +12205,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, } -/* - * reg_recode - * - * It returns the code point in utf8 for the value in *encp. - * value: a code value in the source encoding - * encp: a pointer to an Encode object - * - * If the result from Encode is not a single character, - * it returns U+FFFD (Replacement character) and sets *encp to NULL. - */ -STATIC UV -S_reg_recode(pTHX_ const U8 value, SV **encp) -{ - STRLEN numlen = 1; - SV * const sv = newSVpvn_flags((const char *) &value, numlen, SVs_TEMP); - const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); - const STRLEN newlen = SvCUR(sv); - UV uv = UNICODE_REPLACEMENT; - - PERL_ARGS_ASSERT_REG_RECODE; - - if (newlen) - uv = SvUTF8(sv) - ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT) - : *(U8*)s; - - if (!newlen || numlen != newlen) { - uv = UNICODE_REPLACEMENT; - *encp = NULL; - } - return uv; -} - PERL_STATIC_INLINE U8 S_compute_EXACTish(RExC_state_t *pRExC_state) { @@ -11959,13 +12418,15 @@ S_backref_value(char *p) /* - regatom - the lowest level - Try to identify anything special at the start of the pattern. If there - is, then handle it as required. This may involve generating a single regop, - such as for an assertion; or it may involve recursing, such as to - handle a () structure. + Try to identify anything special at the start of the current parse position. + If there is, then handle it as required. This may involve generating a + single regop, such as for an assertion; or it may involve recursing, such as + to handle a () structure. If the string doesn't start with something special then we gobble up - as much literal text as we can. + as much literal text as we can. If we encounter a quantifier, we have to + back off the final literal character, as that quantifier applies to just it + and not to the whole string of literals. Once we have been able to handle whatever type of thing started the sequence, we return. @@ -12680,6 +13141,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) || UTF8_IS_INVARIANT(UCHARAT(RExC_parse)) || UTF8_IS_START(UCHARAT(RExC_parse))); + /* Here, we have a literal character. Find the maximal string of + * them in the input that we can fit into a single EXACTish node. + * We quit at the first non-literal or when the node gets full */ for (p = RExC_parse; len < upper_parse && p < RExC_end; len++) @@ -12810,9 +13274,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) vFAIL(error_msg); } ender = result; - if (IN_ENCODING && ender < 0x100) { - goto recode_encoding; - } if (ender > 0xff) { REQUIRE_UTF8(flagp); } @@ -12845,11 +13306,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (RExC_recode_x_to_native) { ender = LATIN1_TO_NATIVE(ender); } - else #endif - if (IN_ENCODING) { - goto recode_encoding; - } } else { REQUIRE_UTF8(flagp); @@ -12909,17 +13366,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) form_short_octal_warning(p, numlen)); } } - if (IN_ENCODING && ender < 0x100) - goto recode_encoding; - break; - recode_encoding: - if (! RExC_override_recoding) { - SV* enc = _get_encoding(); - ender = reg_recode((U8)ender, &enc); - if (!enc && PASS2) - ckWARNreg(p, "Invalid escape in the specified encoding"); - REQUIRE_UTF8(flagp); - } break; case '\0': if (p >= RExC_end) @@ -12937,14 +13383,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) } /* End of switch on '\' */ break; case '{': - /* Currently we don't warn when the lbrace is at the start + /* Currently we don't care if the lbrace is at the start * of a construct. This catches it in the middle of a * literal string, or when it's the first thing after * something like "\b" */ - if (! SIZE_ONLY - && (len || (p > RExC_start && isALPHA_A(*(p -1))))) - { - ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through"); + if (len || (p > RExC_start && isALPHA_A(*(p -1)))) { + RExC_parse = p + 1; + vFAIL("Unescaped left brace in regex is illegal here"); } /*FALLTHROUGH*/ default: /* A literal character */ @@ -12976,6 +13421,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) * the node, close the node with just them, and set up to do * this character again next time through, when it will be the * only thing in its new node */ + if ((next_is_quantifier = ( LIKELY(p < RExC_end) && UNLIKELY(ISMULT2(p)))) && LIKELY(len)) @@ -13349,8 +13795,6 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = p - 1; Set_Node_Cur_Length(ret, parse_start); RExC_parse = p; - skip_to_be_ignored_text(pRExC_state, &RExC_parse, - FALSE /* Don't force to /x */ ); { /* len is STRLEN which is unsigned, need to copy to signed */ IV iv = len; @@ -13362,6 +13806,13 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) break; } /* End of giant switch on input character */ + /* Position parse to next real character */ + skip_to_be_ignored_text(pRExC_state, &RExC_parse, + FALSE /* Don't force to /x */ ); + if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) { + ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through"); + } + return(ret); } @@ -13451,11 +13902,9 @@ S_populate_ANYOF_from_invlist(pTHX_ regnode *node, SV** invlist_ptr) /* 'posix_warnings' and 'warn_text' are names of variables in the following * routine. q.v. */ #define ADD_POSIX_WARNING(p, text) STMT_START { \ - if (posix_warnings && ( posix_warnings != (AV **) -1 \ - || (PASS2 && ckWARN(WARN_REGEXP)))) \ - { \ - if (! warn_text) warn_text = newAV(); \ - av_push(warn_text, Perl_newSVpvf(aTHX_ \ + if (posix_warnings) { \ + if (! RExC_warn_text ) RExC_warn_text = (AV *) sv_2mortal((SV *) newAV()); \ + av_push(RExC_warn_text, Perl_newSVpvf(aTHX_ \ WARNING_PREFIX \ text \ REPORT_LOCATION, \ @@ -13472,8 +13921,9 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, besides RExC_parse. */ char ** updated_parse_ptr, /* Where to set the updated parse pointer, or NULL */ - AV ** posix_warnings /* Where to place any generated warnings, or -1 - if to output them, or NULL */ + AV ** posix_warnings, /* Where to place any generated warnings, or + NULL */ + const bool check_only /* Don't die if error */ ) { /* This parses what the caller thinks may be one of the three POSIX @@ -13499,19 +13949,13 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, * 'updated_parse_ptr' is not changed. No warnings nor errors are * raised. * - * In b) there may be warnings and even errors generated. What to do about - * these is determined by the 'posix_warnings' parameter. If it is NULL, - * this call is treated as a check-only, scouting-out-the-territory call, - * and no warnings nor errors are generated at all. Otherwise, any errors - * are raised if found. If 'posix_warnings' is -1 (appropriately cast), - * warnings are generated and displayed (in pass 2), just as they would be - * for any other message of the same type from this file. If it isn't NULL - * and not -1, warnings aren't displayed, but instead an AV is generated - * with all the warning messages (that aren't to be ignored) stored into - * it, so that the caller can output them if it wants. This is done in all + * In b) there may be errors or warnings generated. If 'check_only' is + * TRUE, then any errors are discarded. Warnings are returned to the + * caller via an AV* created into '*posix_warnings' if it is not NULL. If + * instead it is NULL, warnings are suppressed. This is done in all * passes. The reason for this is that the rest of the parsing is heavily * dependent on whether this routine found a valid posix class or not. If - * it did, the closing ']' is absorbed as part of the class. If no class + * it did, the closing ']' is absorbed as part of the class. If no class, * or an invalid one is found, any ']' will be considered the terminator of * the outer bracketed character class, leading to very different results. * In particular, a '(?[ ])' construct will likely have a syntax error if @@ -13591,7 +14035,6 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, bool has_opening_colon = FALSE; int class_number = OOB_NAMEDCLASS; /* Out-of-bounds until find valid class */ - AV* warn_text = NULL; /* any warning messages */ const char * possible_end = NULL; /* used for a 2nd parse pass */ const char* name_start; /* ptr to class name first char */ @@ -13607,6 +14050,9 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, PERL_ARGS_ASSERT_HANDLE_POSSIBLE_POSIX; + if (posix_warnings && RExC_warn_text) + av_clear(RExC_warn_text); + if (p >= e) { return NOT_MEANT_TO_BE_A_POSIX_CLASS; } @@ -13633,85 +14079,78 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, /* For [. .] and [= =]. These are quite different internally from [: :], * so they are handled separately. */ - if (POSIXCC_NOTYET(*p)) { + if (POSIXCC_NOTYET(*p) && p < e - 3) /* 1 for the close, and 1 for the ']' + and 1 for at least one char in it + */ + { const char open_char = *p; const char * temp_ptr = p + 1; - unsigned int len = 0; /* These two constructs are not handled by perl, and if we find a - * syntactically valid one, we croak. It looks like just about any - * byte can be in them, but they are likely very short, like [.ch.] to - * denote a ligature 'ch' single character. If we find something that - * started out to look like one of these constructs, but isn't, we - * break so that it can be checked for being a class name with a typo - * of '.' or '=' instead of a colon */ - while (temp_ptr < e) { - len++; - - /* qr/[[.].]]/, for example, is valid. But otherwise we quit on an - * unexpected ']'. It is possible, it appears, for such a ']' to - * be not in the final position, but that's so unlikely that that - * case is not handled. */ - if (*temp_ptr == ']' && temp_ptr[1] != open_char) { - break; - } - - /* XXX this could be cut down, but this value is certainly large - * enough */ - if (len > 10) { - break; - } + * syntactically valid one, we croak. khw, who wrote this code, finds + * this explanation of them very unclear: + * http://pubs.opengroup.org/onlinepubs/009696899/basedefs/xbd_chap09.html + * And searching the rest of the internet wasn't very helpful either. + * It looks like just about any byte can be in these constructs, + * depending on the locale. But unless the pattern is being compiled + * under /l, which is very rare, Perl runs under the C or POSIX locale. + * In that case, it looks like [= =] isn't allowed at all, and that + * [. .] could be any single code point, but for longer strings the + * constituent characters would have to be the ASCII alphabetics plus + * the minus-hyphen. Any sensible locale definition would limit itself + * to these. And any portable one definitely should. Trying to parse + * the general case is a nightmare (see [perl #127604]). So, this code + * looks only for interiors of these constructs that match: + * qr/.|[-\w]{2,}/ + * Using \w relaxes the apparent rules a little, without adding much + * danger of mistaking something else for one of these constructs. + * + * [. .] in some implementations described on the internet is usable to + * escape a character that otherwise is special in bracketed character + * classes. For example [.].] means a literal right bracket instead of + * the ending of the class + * + * [= =] can legitimately contain a [. .] construct, but we don't + * handle this case, as that [. .] construct will later get parsed + * itself and croak then. And [= =] is checked for even when not under + * /l, as Perl has long done so. + * + * The code below relies on there being a trailing NUL, so it doesn't + * have to keep checking if the parse ptr < e. + */ + if (temp_ptr[1] == open_char) { + temp_ptr++; + } + else while ( temp_ptr < e + && (isWORDCHAR(*temp_ptr) || *temp_ptr == '-')) + { + temp_ptr++; + } - if (*temp_ptr == open_char) { + if (*temp_ptr == open_char) { + temp_ptr++; + if (*temp_ptr == ']') { temp_ptr++; - if (*temp_ptr == ']') { - temp_ptr++; - if (! found_problem && posix_warnings) { - RExC_parse = (char *) temp_ptr; - vFAIL3("POSIX syntax [%c %c] is reserved for future " - "extensions", open_char, open_char); - } - - /* Here, the syntax wasn't completely valid, or else the - * call is to check-only */ - if (updated_parse_ptr) { - *updated_parse_ptr = (char *) temp_ptr; - } - - return OOB_NAMEDCLASS; + if (! found_problem && ! check_only) { + RExC_parse = (char *) temp_ptr; + vFAIL3("POSIX syntax [%c %c] is reserved for future " + "extensions", open_char, open_char); } - } - else if (*temp_ptr == '\\') { - - /* A backslash is treate as like any other character, unless it - * precedes a comment starter. XXX multiple backslashes in a - * row are not handled specially here, nor would they ever - * likely to be handled specially in one of these constructs */ - if (temp_ptr[1] == '#' && (RExC_flags & RXf_PMf_EXTENDED)) { - temp_ptr++; + + /* Here, the syntax wasn't completely valid, or else the call + * is to check-only */ + if (updated_parse_ptr) { + *updated_parse_ptr = (char *) temp_ptr; } - temp_ptr++; - } - else if (*temp_ptr == '#' && (RExC_flags & RXf_PMf_EXTENDED)) { - break; /* Under no circumstances can we look at the interior - of a comment */ - } - else if (*temp_ptr == '\n') { /* And we don't allow newlines - either as it's extremely - unlikely that one could be in an - intended class */ - break; - } - else if (UTF && ! UTF8_IS_INVARIANT(*temp_ptr)) { - /* XXX Since perl will never handle multi-byte locales, except - * for UTF-8, we could break if we found a byte above latin1, - * but perhaps the person intended to use one. */ - temp_ptr += UTF8SKIP(temp_ptr); - } - else { - temp_ptr++; + + return OOB_NAMEDCLASS; } } + + /* If we find something that started out to look like one of these + * constructs, but isn't, we continue below so that it can be checked + * for being a class name with a typo of '.' or '=' instead of a colon. + * */ } /* Here, we think there is a possibility that a [: :] class was meant, and @@ -14231,19 +14670,8 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, ADD_POSIX_WARNING(p, "there is no terminating ']'"); } - if (warn_text) { - if (posix_warnings != (AV **) -1) { - *posix_warnings = warn_text; - } - else { - SV * msg; - while ((msg = av_shift(warn_text)) != &PL_sv_undef) { - Perl_warner(aTHX_ packWARN(WARN_REGEXP), - "%s", SvPVX(msg)); - SvREFCNT_dec_NN(msg); - } - SvREFCNT_dec_NN(warn_text); - } + if (posix_warnings && RExC_warn_text && av_top_index(RExC_warn_text) > -1) { + *posix_warnings = RExC_warn_text; } } else if (class_number != OOB_NAMEDCLASS) { @@ -14252,7 +14680,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, * one */ return class_number + complement; } - else if (posix_warnings) { + else if (! check_only) { /* Here, it is an unrecognized class. This is an error (unless the * call is to check only, which we've already handled above) */ @@ -14315,7 +14743,9 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, 'stack' of where the undealt-with left parens would be if they were actually put there */ - IV fence = 0; /* Position of where most recent undealt- + /* The 'VOL' (expanding to 'volatile') is a workaround for an optimiser bug + * in Solaris Studio 12.3. See RT #127455 */ + VOL IV fence = 0; /* Position of where most recent undealt- with left paren in stack is; -1 if none. */ STRLEN len; /* Temporary */ @@ -14375,10 +14805,11 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, { /* See if this is a [:posix:] class. */ bool is_posix_class = (OOB_NAMEDCLASS - < handle_possible_posix(pRExC_state, - RExC_parse + 1, - NULL, - NULL)); + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); /* If it is a posix class, leave the parse pointer at the * '[' to fool regclass() into thinking it is part of a * '[[:posix:]]'. */ @@ -14436,13 +14867,8 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, no_close: /* We output the messages even if warnings are off, because we'll fail * the very next thing, and these give a likely diagnosis for that */ - if (posix_warnings) { - SV * msg; - while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); - SvREFCNT_dec_NN(msg); - } - SvREFCNT_dec_NN(posix_warnings); + if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); } FAIL("Syntax error in (?[...])"); @@ -14550,7 +14976,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, redo_curchar: - top_index = av_tindex(stack); + top_index = av_tindex_nomg(stack); switch (curchar) { SV** stacked_ptr; /* Ptr to something already on 'stack' */ @@ -14680,10 +15106,11 @@ redo_curchar: { /* See if this is a [:posix:] class. */ bool is_posix_class = (OOB_NAMEDCLASS - < handle_possible_posix(pRExC_state, - RExC_parse + 1, - NULL, - NULL)); + < handle_possible_posix(pRExC_state, + RExC_parse + 1, + NULL, + NULL, + TRUE /* checking only */)); /* If it is a posix class, leave the parse pointer at the '[' * to fool regclass() into thinking it is part of a * '[[:posix:]]'. */ @@ -14727,7 +15154,7 @@ redo_curchar: goto done; case ')': - if (av_tindex(fence_stack) < 0) { + if (av_tindex_nomg(fence_stack) < 0) { RExC_parse++; vFAIL("Unexpected ')'"); } @@ -14919,10 +15346,11 @@ redo_curchar: handle_operand: /* Here 'current' is the operand. If something is already on the - * stack, we have to check if it is a !. */ - top_index = av_tindex(stack); /* Code above may have altered the - * stack in the time since we - * earlier set 'top_index'. */ + * stack, we have to check if it is a !. But first, the code above + * may have altered the stack in the time since we earlier set + * 'top_index'. */ + + top_index = av_tindex_nomg(stack); if (top_index - fence >= 0) { /* If the top entry on the stack is an operator, it had better * be a '!', otherwise the entry below the top operand should @@ -14944,7 +15372,6 @@ redo_curchar: only_to_avoid_leaks = av_pop(stack); SvREFCNT_dec(only_to_avoid_leaks); - top_index = av_tindex(stack); /* And we redo with the inverted operand. This allows * handling multiple ! in a row */ @@ -14974,15 +15401,15 @@ redo_curchar: } /* End of loop parsing through the construct */ done: - if (av_tindex(fence_stack) >= 0) { + if (av_tindex_nomg(fence_stack) >= 0) { vFAIL("Unmatched ("); } - if (av_tindex(stack) < 0 /* Was empty */ + if (av_tindex_nomg(stack) < 0 /* Was empty */ || ((final = av_pop(stack)) == NULL) || ! IS_OPERAND(final) || SvTYPE(final) != SVt_INVLIST - || av_tindex(stack) >= 0) /* More left on stack */ + || av_tindex_nomg(stack) >= 0) /* More left on stack */ { bad_syntax: SvREFCNT_dec(final); @@ -15158,6 +15585,43 @@ S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invl } } +STATIC void +S_output_or_return_posix_warnings(pTHX_ RExC_state_t *pRExC_state, AV* posix_warnings, AV** return_posix_warnings) +{ + /* If the final parameter is NULL, output the elements of the array given + * by '*posix_warnings' as REGEXP warnings. Otherwise, the elements are + * pushed onto it, (creating if necessary) */ + + SV * msg; + const bool first_is_fatal = ! return_posix_warnings + && ckDEAD(packWARN(WARN_REGEXP)); + + PERL_ARGS_ASSERT_OUTPUT_OR_RETURN_POSIX_WARNINGS; + + while ((msg = av_shift(posix_warnings)) != &PL_sv_undef) { + if (return_posix_warnings) { + if (! *return_posix_warnings) { /* mortalize to not leak if + warnings are fatal */ + *return_posix_warnings = (AV *) sv_2mortal((SV *) newAV()); + } + av_push(*return_posix_warnings, msg); + } + else { + if (first_is_fatal) { /* Avoid leaking this */ + av_undef(posix_warnings); /* This isn't necessary if the + array is mortal, but is a + fail-safe */ + (void) sv_2mortal(msg); + if (PASS2) { + SAVEFREESV(RExC_rx_sv); + } + } + Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s", SvPVX(msg)); + SvREFCNT_dec_NN(msg); + } + } +} + STATIC AV * S_add_multi_match(pTHX_ AV* multi_char_matches, SV* multi_string, const STRLEN cp_count) { @@ -15239,7 +15703,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, bool optimizable, /* ? Allow a non-ANYOF return node */ SV** ret_invlist, /* Return an inversion list, not a node */ - AV** posix_warnings + AV** return_posix_warnings ) { /* parse a bracketed class specification. Most of these will produce an @@ -15353,12 +15817,16 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, const SSize_t orig_size = RExC_size; bool posixl_matches_all = FALSE; /* Does /l class have both e.g. \W,\w ? */ - /* This variable is used to mark where in the input something that looks - * like a POSIX construct ends. During the parse, when something looks - * like it could be such a construct is encountered, it is checked for - * being one, but not if we've already checked this area of the input. - * Only after this position is reached do we check again */ - char *dont_check_for_posix_end = RExC_parse - 1; + /* This variable is used to mark where the end in the input is of something + * that looks like a POSIX construct but isn't. During the parse, when + * something looks like it could be such a construct is encountered, it is + * checked for being one, but not if we've already checked this area of the + * input. Only after this position is reached do we check again */ + char *not_posix_region_end = RExC_parse - 1; + + AV* posix_warnings = NULL; + const bool do_posix_warnings = return_posix_warnings + || (PASS2 && ckWARN(WARN_REGEXP)); GET_RE_DEBUG_FLAGS_DECL; @@ -15375,10 +15843,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, allow_multi_folds = FALSE; #endif - if (posix_warnings == NULL) { - posix_warnings = (AV **) -1; - } - /* Assume we are going to generate an ANYOF node. */ ret = reganode(pRExC_state, (LOC) @@ -15413,25 +15877,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, /* Check that they didn't say [:posix:] instead of [[:posix:]] */ if (! ret_invlist && MAYBE_POSIXCC(UCHARAT(RExC_parse))) { - char *class_end; - int maybe_class = handle_possible_posix(pRExC_state, RExC_parse, - &class_end, NULL); - if (maybe_class >= OOB_NAMEDCLASS) { - dont_check_for_posix_end = class_end; - if (PASS2 && posix_warnings == (AV **) -1) { - SAVEFREESV(RExC_rx_sv); - ckWARN4reg(class_end, - "POSIX syntax [%c %c] belongs inside character classes%s", - *RExC_parse, *RExC_parse, - (maybe_class == OOB_NAMEDCLASS) - ? ((POSIXCC_NOTYET(*RExC_parse)) - ? " (but this one isn't implemented)" - : " (but this one isn't fully valid)") - : "" - ); - (void)ReREFCNT_inc(RExC_rx_sv); - } - } + int maybe_class = handle_possible_posix(pRExC_state, + RExC_parse, + ¬_posix_region_end, + NULL, + TRUE /* checking only */); + if (PASS2 && maybe_class >= OOB_NAMEDCLASS && do_posix_warnings) { + SAVEFREESV(RExC_rx_sv); + ckWARN4reg(not_posix_region_end, + "POSIX syntax [%c %c] belongs inside character classes%s", + *RExC_parse, *RExC_parse, + (maybe_class == OOB_NAMEDCLASS) + ? ((POSIXCC_NOTYET(*RExC_parse)) + ? " (but this one isn't implemented)" + : " (but this one isn't fully valid)") + : "" + ); + (void)ReREFCNT_inc(RExC_rx_sv); + } } /* If the caller wants us to just parse a single element, accomplish this @@ -15445,6 +15908,21 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, goto charclassloop; while (1) { + + if ( posix_warnings + && av_tindex_nomg(posix_warnings) >= 0 + && RExC_parse > not_posix_region_end) + { + /* Warnings about posix class issues are considered tentative until + * we are far enough along in the parse that we can no longer + * change our mind, at which point we either output them or add + * them, if it has so specified, to what gets returned to the + * caller. This is done each time through the loop so that a later + * class won't zap them before they have been dealt with. */ + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + if (RExC_parse >= stop_ptr) { break; } @@ -15476,22 +15954,52 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, value = UCHARAT(RExC_parse++); if (value == '[') { - namedclass = handle_possible_posix(pRExC_state, RExC_parse, &dont_check_for_posix_end, posix_warnings); + char * posix_class_end; + namedclass = handle_possible_posix(pRExC_state, + RExC_parse, + &posix_class_end, + do_posix_warnings ? &posix_warnings : NULL, + FALSE /* die if error */); if (namedclass > OOB_NAMEDCLASS) { - RExC_parse = dont_check_for_posix_end; + + /* If there was an earlier attempt to parse this particular + * posix class, and it failed, it was a false alarm, as this + * successful one proves */ + if ( posix_warnings + && av_tindex_nomg(posix_warnings) >= 0 + && not_posix_region_end >= RExC_parse + && not_posix_region_end <= posix_class_end) + { + av_undef(posix_warnings); + } + + RExC_parse = posix_class_end; + } + else if (namedclass == OOB_NAMEDCLASS) { + not_posix_region_end = posix_class_end; } else { namedclass = OOB_NAMEDCLASS; } } - else if ( RExC_parse - 1 > dont_check_for_posix_end + else if ( RExC_parse - 1 > not_posix_region_end && MAYBE_POSIXCC(value)) { - (void) handle_possible_posix(pRExC_state, RExC_parse - 1, /* -1 because parse has already been advanced */ - &dont_check_for_posix_end, posix_warnings); + (void) handle_possible_posix( + pRExC_state, + RExC_parse - 1, /* -1 because parse has already been + advanced */ + ¬_posix_region_end, + do_posix_warnings ? &posix_warnings : NULL, + TRUE /* checking only */); } else if (value == '\\') { /* Is a backslash; get the code point of the char after it */ + + if (RExC_parse >= RExC_end) { + vFAIL("Unmatched ["); + } + if (UTF && ! UTF8_IS_INVARIANT(UCHARAT(RExC_parse))) { value = utf8n_to_uvchr((U8*)RExC_parse, RExC_end - RExC_parse, @@ -15650,6 +16158,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SV* invlist; char* name; char* base_name; /* name after any packages are stripped */ + char* lookup_name = NULL; const char * const colon_colon = "::"; /* Try to get the definition of the property into @@ -15657,18 +16166,24 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * will have its name be <__NAME_i>. The design is * discussed in commit * 2f833f5208e26b208886e51e09e2c072b5eabb46 */ - name = savepv(Perl_form(aTHX_ - "%s%.*s%s\n", - (FOLD) ? "__" : "", - (int)n, - RExC_parse, - (FOLD) ? "_i" : "" - )); + name = savepv(Perl_form(aTHX_ "%.*s", (int)n, RExC_parse)); + SAVEFREEPV(name); + if (FOLD) { + lookup_name = savepv(Perl_form(aTHX_ "__%s_i", name)); + + /* The function call just below that uses this can fail + * to return, leaking memory if we don't do this */ + SAVEFREEPV(lookup_name); + } /* Look up the property name, and get its swash and * inversion list, if the property is found */ SvREFCNT_dec(swash); /* Free any left-overs */ - swash = _core_swash_init("utf8", name, &PL_sv_undef, + swash = _core_swash_init("utf8", + (lookup_name) + ? lookup_name + : name, + &PL_sv_undef, 1, /* binary */ 0, /* not tr/// */ NULL, /* No inversion list */ @@ -15736,13 +16251,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, pkgname, name); n = strlen(full_name); - Safefree(name); name = savepvn(full_name, n); + SAVEFREEPV(name); } } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n", (value == 'p' ? '+' : '!'), - UTF8fARG(UTF, n, name)); + (FOLD) ? "__" : "", + UTF8fARG(UTF, n, name), + (FOLD) ? "_i" : ""); has_user_defined_property = TRUE; optimizable = FALSE; /* Will have to leave this an ANYOF node */ @@ -15790,7 +16307,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, _invlist_union(properties, invlist, &properties); } } - Safefree(name); } RExC_parse = e + 1; namedclass = ANYOF_UNIPROP; /* no official name, but it's @@ -15824,9 +16340,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) { - goto recode_encoding; - } break; case 'x': RExC_parse--; /* function expects to be pointed at the 'x' */ @@ -15844,8 +16357,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) - goto recode_encoding; break; case 'c': value = grok_bslash_c(*RExC_parse++, PASS2); @@ -15878,23 +16389,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } } non_portable_endpoint++; - if (IN_ENCODING && value < 0x100) - goto recode_encoding; - break; - } - recode_encoding: - if (! RExC_override_recoding) { - SV* enc = _get_encoding(); - value = reg_recode((U8)value, &enc); - if (!enc) { - if (strict) { - vFAIL("Invalid escape in the specified encoding"); - } - else if (PASS2) { - ckWARNreg(RExC_parse, - "Invalid escape in the specified encoding"); - } - } break; } default: @@ -16035,9 +16529,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, else if (! SIZE_ONLY) { /* Here, not in pass1 (in that pass we skip calculating the - * contents of this class), and is /l, or is a POSIX class for - * which /l doesn't matter (or is a Unicode property, which is - * skipped here). */ + * contents of this class), and is not /l, or is a POSIX class + * for which /l doesn't matter (or is a Unicode property, which + * is skipped here). */ if (namedclass >= ANYOF_POSIXL_MAX) { /* If a special class */ if (namedclass != ANYOF_UNIPROP) { /* UNIPROP = \p and \P */ @@ -16062,9 +16556,9 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &cp_list); } } - else if (UNI_SEMANTICS + else if ( UNI_SEMANTICS || classnum == _CC_ASCII - || (DEPENDS_SEMANTICS && (classnum == _CC_DIGIT + || (DEPENDS_SEMANTICS && ( classnum == _CC_DIGIT || classnum == _CC_XDIGIT))) { /* We usually have to worry about /d and /a affecting what @@ -16396,6 +16890,12 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, range = 0; /* this range (if it was one) is done now */ } /* End of loop through all the text within the brackets */ + + if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + output_or_return_posix_warnings(pRExC_state, posix_warnings, + return_posix_warnings); + } + /* If anything in the class expands to more than one character, we have to * deal with them by building up a substitute parse string, and recursively * calling reg() on it, instead of proceeding */ @@ -16424,7 +16924,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif /* Look at the longest folds first */ - for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) { + for (cp_count = av_tindex_nomg(multi_char_matches); + cp_count > 0; + cp_count--) + { if (av_exists(multi_char_matches, cp_count)) { AV** this_array_ptr; @@ -16805,7 +17308,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_tindex(list); k++) { + for (k = 0; k <= av_tindex_nomg(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; assert(c_p); @@ -16852,83 +17355,163 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SvREFCNT_dec_NN(cp_foldable_list); } - /* And combine the result (if any) with any inversion list from posix + /* And combine the result (if any) with any inversion lists from posix * classes. The lists are kept separate up to now because we don't want to * fold the classes (folding of those is automatically handled by the swash * fetching code) */ - if (simple_posixes) { - _invlist_union(cp_list, simple_posixes, &cp_list); - SvREFCNT_dec_NN(simple_posixes); + if (simple_posixes) { /* These are the classes known to be unaffected by + /a, /aa, and /d */ + if (cp_list) { + _invlist_union(cp_list, simple_posixes, &cp_list); + SvREFCNT_dec_NN(simple_posixes); + } + else { + cp_list = simple_posixes; + } } if (posixes || nposixes) { - if (posixes && AT_LEAST_ASCII_RESTRICTED) { + + /* We have to adjust /a and /aa */ + if (AT_LEAST_ASCII_RESTRICTED) { + /* Under /a and /aa, nothing above ASCII matches these */ - _invlist_intersection(posixes, - PL_XPosix_ptrs[_CC_ASCII], - &posixes); - } - if (nposixes) { - if (DEPENDS_SEMANTICS) { - /* Under /d, everything in the upper half of the Latin1 range - * matches these complements */ - ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + if (posixes) { + _invlist_intersection(posixes, + PL_XPosix_ptrs[_CC_ASCII], + &posixes); } - else if (AT_LEAST_ASCII_RESTRICTED) { - /* Under /a and /aa, everything above ASCII matches these - * complements */ + + /* Under /a and /aa, everything above ASCII matches these + * complements */ + if (nposixes) { _invlist_union_complement_2nd(nposixes, PL_XPosix_ptrs[_CC_ASCII], &nposixes); } - if (posixes) { - _invlist_union(posixes, nposixes, &posixes); - SvREFCNT_dec_NN(nposixes); - } - else { - posixes = nposixes; - } } + if (! DEPENDS_SEMANTICS) { - if (cp_list) { - _invlist_union(cp_list, posixes, &cp_list); - SvREFCNT_dec_NN(posixes); + + /* For everything but /d, we can just add the current 'posixes' and + * 'nposixes' to the main list */ + if (posixes) { + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + } + else { + cp_list = posixes; + } } - else { - cp_list = posixes; + if (nposixes) { + if (cp_list) { + _invlist_union(cp_list, nposixes, &cp_list); + SvREFCNT_dec_NN(nposixes); + } + else { + cp_list = nposixes; + } } } else { - /* Under /d, we put into a separate list the Latin1 things that - * match only when the target string is utf8 */ - SV* nonascii_but_latin1_properties = NULL; - _invlist_intersection(posixes, PL_UpperLatin1, - &nonascii_but_latin1_properties); - _invlist_subtract(posixes, nonascii_but_latin1_properties, - &posixes); - if (cp_list) { - _invlist_union(cp_list, posixes, &cp_list); - SvREFCNT_dec_NN(posixes); - } - else { - cp_list = posixes; - } + /* Under /d, things like \w match upper Latin1 characters only if + * the target string is in UTF-8. But things like \W match all the + * upper Latin1 characters if the target string is not in UTF-8. + * + * Handle the case where there something like \W separately */ + if (nposixes) { + SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1); + + /* A complemented posix class matches all upper Latin1 + * characters if not in UTF-8. And it matches just certain + * ones when in UTF-8. That means those certain ones are + * matched regardless, so can just be added to the + * unconditional list */ + if (cp_list) { + _invlist_union(cp_list, nposixes, &cp_list); + SvREFCNT_dec_NN(nposixes); + nposixes = NULL; + } + else { + cp_list = nposixes; + } + + /* Likewise for 'posixes' */ + _invlist_union(posixes, cp_list, &cp_list); + + /* Likewise for anything else in the range that matched only + * under UTF-8 */ + if (has_upper_latin1_only_utf8_matches) { + _invlist_union(cp_list, + has_upper_latin1_only_utf8_matches, + &cp_list); + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } - if (has_upper_latin1_only_utf8_matches) { - _invlist_union(has_upper_latin1_only_utf8_matches, - nonascii_but_latin1_properties, - &has_upper_latin1_only_utf8_matches); - SvREFCNT_dec_NN(nonascii_but_latin1_properties); + /* If we don't match all the upper Latin1 characters regardless + * of UTF-8ness, we have to set a flag to match the rest when + * not in UTF-8 */ + _invlist_subtract(only_non_utf8_list, cp_list, + &only_non_utf8_list); + if (_invlist_len(only_non_utf8_list) != 0) { + ANYOF_FLAGS(ret) |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; + } } else { - has_upper_latin1_only_utf8_matches - = nonascii_but_latin1_properties; - } - } - } + /* Here there were no complemented posix classes. That means + * the upper Latin1 characters in 'posixes' match only when the + * target string is in UTF-8. So we have to add them to the + * list of those types of code points, while adding the + * remainder to the unconditional list. + * + * First calculate what they are */ + SV* nonascii_but_latin1_properties = NULL; + _invlist_intersection(posixes, PL_UpperLatin1, + &nonascii_but_latin1_properties); + + /* And add them to the final list of such characters. */ + if (has_upper_latin1_only_utf8_matches) { + _invlist_union(has_upper_latin1_only_utf8_matches, + nonascii_but_latin1_properties, + &has_upper_latin1_only_utf8_matches); + SvREFCNT_dec_NN(nonascii_but_latin1_properties); + } + else { + has_upper_latin1_only_utf8_matches + = nonascii_but_latin1_properties; + } - /* And combine the result (if any) with any inversion list from properties. - * The lists are kept separate up to now so that we can distinguish the two - * in regards to matching above-Unicode. A run-time warning is generated + /* Remove them from what now becomes the unconditional list */ + _invlist_subtract(posixes, nonascii_but_latin1_properties, + &posixes); + + /* And the remainder are the unconditional ones */ + if (cp_list) { + _invlist_union(cp_list, posixes, &cp_list); + SvREFCNT_dec_NN(posixes); + posixes = NULL; + } + else { + cp_list = posixes; + } + + /* Get rid of any characters that we now know are matched + * unconditionally from the conditional list */ + _invlist_subtract(has_upper_latin1_only_utf8_matches, + cp_list, + &has_upper_latin1_only_utf8_matches); + if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { + SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); + has_upper_latin1_only_utf8_matches = NULL; + } + } + } + } + + /* And combine the result (if any) with any inversion list from properties. + * The lists are kept separate up to now so that we can distinguish the two + * in regards to matching above-Unicode. A run-time warning is generated * if a Unicode property is matched against a non-Unicode code point. But, * we allow user-defined properties to match anything, without any warning, * and we also suppress the warning if there is a portion of the character @@ -17011,73 +17594,14 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, invlist_iterfinish(cp_list); } } - -#define MATCHES_ALL_NON_UTF8_NON_ASCII(ret) \ - ( DEPENDS_SEMANTICS \ - && (ANYOF_FLAGS(ret) \ - & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) - - /* See if we can simplify things under /d */ - if ( has_upper_latin1_only_utf8_matches - || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) + else if ( DEPENDS_SEMANTICS + && ( has_upper_latin1_only_utf8_matches + || (ANYOF_FLAGS(ret) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))) { - if (has_upper_latin1_only_utf8_matches) { - if (MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { - - /* Here, we have both the flag and inversion list. Any character in - * 'has_upper_latin1_only_utf8_matches' matches when UTF-8 is - * in effect, but it also matches when UTF-8 is not in effect - * because of MATCHES_ALL_NON_UTF8_NON_ASCII. Therefore it - * matches unconditionally, so can be added to the regular - * list, and 'has_upper_latin1_only_utf8_matches' cleared */ - _invlist_union(cp_list, - has_upper_latin1_only_utf8_matches, - &cp_list); - SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); - has_upper_latin1_only_utf8_matches = NULL; - } - else if (cp_list) { - - /* Here, 'cp_list' gives chars that always match, and - * 'has_upper_latin1_only_utf8_matches' gives chars that were - * specified to match only if the target string is in UTF-8. - * It may be that these overlap, so we can subtract the - * unconditionally matching from the conditional ones, to make - * the conditional list as small as possible, perhaps even - * clearing it, in which case more optimizations are possible - * later */ - _invlist_subtract(has_upper_latin1_only_utf8_matches, - cp_list, - &has_upper_latin1_only_utf8_matches); - if (_invlist_len(has_upper_latin1_only_utf8_matches) == 0) { - SvREFCNT_dec_NN(has_upper_latin1_only_utf8_matches); - has_upper_latin1_only_utf8_matches = NULL; - } - } - } - - /* Similarly, if the unconditional matches include every upper latin1 - * character, we can clear that flag to permit later optimizations */ - if (cp_list && MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) { - SV* only_non_utf8_list = invlist_clone(PL_UpperLatin1); - _invlist_subtract(only_non_utf8_list, cp_list, &only_non_utf8_list); - if (_invlist_len(only_non_utf8_list) == 0) { - ANYOF_FLAGS(ret) &= ~ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER; - } - SvREFCNT_dec_NN(only_non_utf8_list); - only_non_utf8_list = NULL;; - } - - /* If we haven't gotten rid of all conditional matching, we change the - * regnode type to indicate that */ - if ( has_upper_latin1_only_utf8_matches - || MATCHES_ALL_NON_UTF8_NON_ASCII(ret)) - { - OP(ret) = ANYOFD; - optimizable = FALSE; - } + OP(ret) = ANYOFD; + optimizable = FALSE; } -#undef MATCHES_ALL_NON_UTF8_NON_ASCII + /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known * at compile time. Besides not inverting folded locale now, we can't @@ -17429,7 +17953,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, bool doinit, SV** listsvp, SV** only_utf8_locale_ptr, - SV* exclude_list) + SV** output_invlist) { /* For internal core use only. @@ -17441,20 +17965,32 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, * swash exists, by calling this function with 'doinit' set to false, in * which case the components that will be used to eventually create the * swash are returned (in a printable form). - * If is not NULL, it is an inversion list of things to - * exclude from what's returned in . + * If is not NULL, it is where this routine is to + * store an inversion list of code points that should match only if the + * execution-time locale is a UTF-8 one. + * If is not NULL, it is where this routine is to store an + * inversion list of the code points that would be instead returned in + * if this were NULL. Thus, what gets output in + * when this parameter is used, is just the non-code point data that + * will go into creating the swash. This currently should be just + * user-defined properties whose definitions were not known at compile + * time. Using this parameter allows for easier manipulation of the + * swash's data by the caller. It is illegal to call this function with + * this parameter set, but not + * * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note * that, in spite of this function's name, the swash it returns may include * the bitmap data as well */ SV *sw = NULL; SV *si = NULL; /* Input swash initialization string */ - SV* invlist = NULL; + SV* invlist = NULL; RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA; + assert(! output_invlist || listsvp); if (data && data->count) { const U32 n = ARG(node); @@ -17467,7 +18003,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, si = *ary; /* ary[0] = the string to initialize the swash with */ - if (av_tindex(av) >= 2) { + if (av_tindex_nomg(av) >= 2) { if (only_utf8_locale_ptr && ary[2] && ary[2] != &PL_sv_undef) @@ -17483,7 +18019,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, * is any inversion list generated at compile time; [4] * indicates if that inversion list has any user-defined * properties in it. */ - if (av_tindex(av) >= 3) { + if (av_tindex_nomg(av) >= 3) { invlist = ary[3]; if (SvUV(ary[4])) { swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; @@ -17516,7 +18052,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, /* If requested, return a printable version of what this swash matches */ if (listsvp) { - SV* matches_string = newSVpvs(""); + SV* matches_string = NULL; /* The swash should be used, if possible, to get the data, as it * contains the resolved data. But this function can be called at @@ -17526,22 +18062,124 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - sv_catsv(matches_string, si); + /* Here, we only have 'si' (and possibly some passed-in data in + * 'invlist', which is handled below) If the caller only wants + * 'si', use that. */ + if (! output_invlist) { + matches_string = newSVsv(si); + } + else { + /* But if the caller wants an inversion list of the node, we + * need to parse 'si' and place as much as possible in the + * desired output inversion list, making 'matches_string' only + * contain the currently unresolvable things */ + const char *si_string = SvPVX(si); + STRLEN remaining = SvCUR(si); + UV prev_cp = 0; + U8 count = 0; + + /* Ignore everything before the first new-line */ + while (*si_string != '\n' && remaining > 0) { + si_string++; + remaining--; + } + assert(remaining > 0); + + si_string++; + remaining--; + + while (remaining > 0) { + + /* The data consists of just strings defining user-defined + * property names, but in prior incarnations, and perhaps + * somehow from pluggable regex engines, it could still + * hold hex code point definitions. Each component of a + * range would be separated by a tab, and each range by a + * new-line. If these are found, instead add them to the + * inversion list */ + I32 grok_flags = PERL_SCAN_SILENT_ILLDIGIT + |PERL_SCAN_SILENT_NON_PORTABLE; + STRLEN len = remaining; + UV cp = grok_hex(si_string, &len, &grok_flags, NULL); + + /* If the hex decode routine found something, it should go + * up to the next \n */ + if ( *(si_string + len) == '\n') { + if (count) { /* 2nd code point on line */ + *output_invlist = _add_range_to_invlist(*output_invlist, prev_cp, cp); + } + else { + *output_invlist = add_cp_to_invlist(*output_invlist, cp); + } + count = 0; + goto prepare_for_next_iteration; + } + + /* If the hex decode was instead for the lower range limit, + * save it, and go parse the upper range limit */ + if (*(si_string + len) == '\t') { + assert(count == 0); + + prev_cp = cp; + count = 1; + prepare_for_next_iteration: + si_string += len + 1; + remaining -= len + 1; + continue; + } + + /* Here, didn't find a legal hex number. Just add it from + * here to the next \n */ + + remaining -= len; + while (*(si_string + len) != '\n' && remaining > 0) { + remaining--; + len++; + } + if (*(si_string + len) == '\n') { + len++; + remaining--; + } + if (matches_string) { + sv_catpvn(matches_string, si_string, len - 1); + } + else { + matches_string = newSVpvn(si_string, len - 1); + } + si_string += len; + sv_catpvs(matches_string, " "); + } /* end of loop through the text */ + + assert(matches_string); + if (SvCUR(matches_string)) { /* Get rid of trailing blank */ + SvCUR_set(matches_string, SvCUR(matches_string) - 1); + } + } /* end of has an 'si' but no swash */ } - /* Add the inversion list to whatever we have. This may have come from - * the swash, or from an input parameter */ - if (invlist) { - if (exclude_list) { - SV* clone = invlist_clone(invlist); - _invlist_subtract(clone, exclude_list, &clone); - sv_catsv(matches_string, _invlist_contents(clone)); - SvREFCNT_dec_NN(clone); + /* If we have a swash in place, its equivalent inversion list was above + * placed into 'invlist'. If not, this variable may contain a stored + * inversion list which is information beyond what is in 'si' */ + if (invlist) { + + /* Again, if the caller doesn't want the output inversion list, put + * everything in 'matches-string' */ + if (! output_invlist) { + if ( ! matches_string) { + matches_string = newSVpvs("\n"); + } + sv_catsv(matches_string, invlist_contents(invlist, + TRUE /* traditional style */ + )); + } + else if (! *output_invlist) { + *output_invlist = invlist_clone(invlist); } else { - sv_catsv(matches_string, _invlist_contents(invlist)); + _invlist_union(*output_invlist, invlist, output_invlist); } - } + } + *listsvp = matches_string; } @@ -17661,7 +18299,7 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1; skip_to_be_ignored_text(pRExC_state, &RExC_parse, - FALSE /* Don't assume /x */ ); + FALSE /* Don't force /x */ ); } } @@ -17791,15 +18429,23 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) RExC_size += size; return; } - + assert(!RExC_study_started); /* I believe we should never use reginsert once we have started + studying. If this is wrong then we need to adjust RExC_recurse + below like we do with RExC_open_parens/RExC_close_parens. */ src = RExC_emit; RExC_emit += size; dst = RExC_emit; if (RExC_open_parens) { int paren; /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + /* remember that RExC_npar is rex->nparens + 1, + * iow it is 1 more than the number of parens seen in + * the pattern so far. */ for ( paren=0 ; paren < RExC_npar ; paren++ ) { - if ( RExC_open_parens[paren] >= opnd ) { + /* note, RExC_open_parens[0] is the start of the + * regex, it can't move. RExC_close_parens[0] is the end + * of the regex, it *can* move. */ + if ( paren && RExC_open_parens[paren] >= opnd ) { /*DEBUG_PARSE_FMT("open"," - %d",size);*/ RExC_open_parens[paren] += size; } else { @@ -17813,6 +18459,8 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) } } } + if (RExC_end_op) + RExC_end_op += size; while (src > opnd) { StructCopy(--src, --dst, regnode); @@ -17885,7 +18533,7 @@ S_regtail(pTHX_ RExC_state_t * pRExC_state, DEBUG_PARSE_r({ DEBUG_PARSE_MSG((scan==p ? "tail" : "")); regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", + Perl_re_printf( aTHX_ "~ %s (%d) %s %s\n", SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), (temp == NULL ? PL_reg_name[OP(val)] : "") @@ -17975,7 +18623,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ DEBUG_PARSE_MSG((scan==p ? "tsdy" : "")); regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", + Perl_re_printf( aTHX_ "~ %s (%d) -> %s\n", SvPV_nolen_const(RExC_mysv), REG_NODE_NUM(scan), PL_reg_name[exact]); @@ -17987,7 +18635,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_r({ DEBUG_PARSE_MSG(""); regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "~ attach to %s (%"IVdf") offset to %"IVdf"\n", SvPV_nolen_const(RExC_mysv), (IV)REG_NODE_NUM(val), @@ -18021,15 +18669,15 @@ S_regdump_intflags(pTHX_ const char *lead, const U32 flags) for (bit=0; bitanchored_substr) { RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "anchored %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "anchored utf8 %s%s at %"IVdf" ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); @@ -18115,55 +18763,55 @@ Perl_regdump(pTHX_ const regexp *r) if (r->float_substr) { RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "floating %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "floating utf8 %s%s at %"IVdf"..%"UVuf" ", s, RE_SV_TAIL(r->float_utf8), (IV)r->float_min_offset, (UV)r->float_max_offset); } if (r->check_substr || r->check_utf8) - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ (const char *) (r->check_substr == r->float_substr && r->check_utf8 == r->float_utf8 ? "(checking floating" : "(checking anchored")); if (r->intflags & PREGf_NOSCAN) - PerlIO_printf(Perl_debug_log, " noscan"); + Perl_re_printf( aTHX_ " noscan"); if (r->extflags & RXf_CHECK_ALL) - PerlIO_printf(Perl_debug_log, " isall"); + Perl_re_printf( aTHX_ " isall"); if (r->check_substr || r->check_utf8) - PerlIO_printf(Perl_debug_log, ") "); + Perl_re_printf( aTHX_ ") "); if (ri->regstclass) { regprop(r, sv, ri->regstclass, NULL, NULL); - PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv)); + Perl_re_printf( aTHX_ "stclass %s ", SvPVX_const(sv)); } if (r->intflags & PREGf_ANCH) { - PerlIO_printf(Perl_debug_log, "anchored"); + Perl_re_printf( aTHX_ "anchored"); if (r->intflags & PREGf_ANCH_MBOL) - PerlIO_printf(Perl_debug_log, "(MBOL)"); + Perl_re_printf( aTHX_ "(MBOL)"); if (r->intflags & PREGf_ANCH_SBOL) - PerlIO_printf(Perl_debug_log, "(SBOL)"); + Perl_re_printf( aTHX_ "(SBOL)"); if (r->intflags & PREGf_ANCH_GPOS) - PerlIO_printf(Perl_debug_log, "(GPOS)"); - (void)PerlIO_putc(Perl_debug_log, ' '); + Perl_re_printf( aTHX_ "(GPOS)"); + Perl_re_printf( aTHX_ " "); } if (r->intflags & PREGf_GPOS_SEEN) - PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs); + Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) - PerlIO_printf(Perl_debug_log, "plus "); + Perl_re_printf( aTHX_ "plus "); if (r->intflags & PREGf_IMPLICIT) - PerlIO_printf(Perl_debug_log, "implicit "); - PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen); + Perl_re_printf( aTHX_ "implicit "); + Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen); if (r->extflags & RXf_EVAL_SEEN) - PerlIO_printf(Perl_debug_log, "with eval "); - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "with eval "); + Perl_re_printf( aTHX_ "\n"); DEBUG_FLAGS_r({ regdump_extflags("r->extflags: ",r->extflags); regdump_intflags("r->intflags: ",r->intflags); @@ -18175,6 +18823,53 @@ Perl_regdump(pTHX_ const regexp *r) #endif /* DEBUGGING */ } +/* Should be synchronized with ANYOF_ #defines in regcomp.h */ +#ifdef DEBUGGING + +# if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 \ + || _CC_LOWER != 3 || _CC_UPPER != 4 || _CC_PUNCT != 5 \ + || _CC_PRINT != 6 || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 \ + || _CC_CASED != 9 || _CC_SPACE != 10 || _CC_BLANK != 11 \ + || _CC_XDIGIT != 12 || _CC_CNTRL != 13 || _CC_ASCII != 14 \ + || _CC_VERTSPACE != 15 +# error Need to adjust order of anyofs[] +# endif +static const char * const anyofs[] = { + "\\w", + "\\W", + "\\d", + "\\D", + "[:alpha:]", + "[:^alpha:]", + "[:lower:]", + "[:^lower:]", + "[:upper:]", + "[:^upper:]", + "[:punct:]", + "[:^punct:]", + "[:print:]", + "[:^print:]", + "[:alnum:]", + "[:^alnum:]", + "[:graph:]", + "[:^graph:]", + "[:cased:]", + "[:^cased:]", + "\\s", + "\\S", + "[:blank:]", + "[:^blank:]", + "[:xdigit:]", + "[:^xdigit:]", + "[:cntrl:]", + "[:^cntrl:]", + "[:ascii:]", + "[:^ascii:]", + "\\v", + "\\V" +}; +#endif + /* - regprop - printable representation of opcode, with run time support */ @@ -18184,49 +18879,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ { #ifdef DEBUGGING int k; - - /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */ - static const char * const anyofs[] = { -#if _CC_WORDCHAR != 0 || _CC_DIGIT != 1 || _CC_ALPHA != 2 || _CC_LOWER != 3 \ - || _CC_UPPER != 4 || _CC_PUNCT != 5 || _CC_PRINT != 6 \ - || _CC_ALPHANUMERIC != 7 || _CC_GRAPH != 8 || _CC_CASED != 9 \ - || _CC_SPACE != 10 || _CC_BLANK != 11 || _CC_XDIGIT != 12 \ - || _CC_CNTRL != 13 || _CC_ASCII != 14 || _CC_VERTSPACE != 15 - #error Need to adjust order of anyofs[] -#endif - "\\w", - "\\W", - "\\d", - "\\D", - "[:alpha:]", - "[:^alpha:]", - "[:lower:]", - "[:^lower:]", - "[:upper:]", - "[:^upper:]", - "[:punct:]", - "[:^punct:]", - "[:print:]", - "[:^print:]", - "[:alnum:]", - "[:^alnum:]", - "[:graph:]", - "[:^graph:]", - "[:cased:]", - "[:^cased:]", - "\\s", - "\\S", - "[:blank:]", - "[:^blank:]", - "[:xdigit:]", - "[:^xdigit:]", - "[:cntrl:]", - "[:^cntrl:]", - "[:ascii:]", - "[:^ascii:]", - "\\v", - "\\V" - }; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -18283,10 +18935,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); (void) put_charclass_bitmap_innards(sv, - (IS_ANYOF_TRIE(op)) + ((IS_ANYOF_TRIE(op)) ? ANYOF_BITMAP(o) - : TRIE_BITMAP(trie), - NULL); + : TRIE_BITMAP(trie)), + NULL, + NULL, + NULL, + FALSE + ); sv_catpvs(sv, "]"); } @@ -18357,7 +19013,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ } /* Paren and offset */ - Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); + Perl_sv_catpvf(aTHX_ sv, "%d[%+d:%d]", (int)ARG(o),(int)ARG2L(o), + (int)((o + (int)ARG2L(o)) - progi->program) ); if (name_list) { SV **name= av_fetch(name_list, ARG(o), 0 ); if (name) @@ -18369,161 +19026,156 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); else if (k == ANYOF) { const U8 flags = ANYOF_FLAGS(o); - int do_sep = 0; - SV* bitmap_invlist = NULL; /* Will hold what the bit map contains */ + bool do_sep = FALSE; /* Do we need to separate various components of + the output? */ + /* Set if there is still an unresolved user-defined property */ + SV *unresolved = NULL; + + /* Things that are ignored except when the runtime locale is UTF-8 */ + SV *only_utf8_locale_invlist = NULL; + + /* Code points that don't fit in the bitmap */ + SV *nonbitmap_invlist = NULL; + /* And things that aren't in the bitmap, but are small enough to be */ + SV* bitmap_range_not_in_bitmap = NULL; + + const bool inverted = flags & ANYOF_INVERT; if (OP(o) == ANYOFL) { if (ANYOFL_UTF8_LOCALE_REQD(flags)) { - sv_catpvs(sv, "{utf8-loc}"); + sv_catpvs(sv, "{utf8-locale-reqd}"); } - else { - sv_catpvs(sv, "{loc}"); + if (flags & ANYOFL_FOLD) { + sv_catpvs(sv, "{i}"); } } - if (flags & ANYOFL_FOLD) - sv_catpvs(sv, "{i}"); - Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (flags & ANYOF_INVERT) - sv_catpvs(sv, "^"); - /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches - * */ - do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o), - &bitmap_invlist); + /* If there is stuff outside the bitmap, get it */ + if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { + (void) _get_regclass_nonbitmap_data(prog, o, FALSE, + &unresolved, + &only_utf8_locale_invlist, + &nonbitmap_invlist); + /* The non-bitmap data may contain stuff that could fit in the + * bitmap. This could come from a user-defined property being + * finally resolved when this call was done; or much more likely + * because there are matches that require UTF-8 to be valid, and so + * aren't in the bitmap. This is teased apart later */ + _invlist_intersection(nonbitmap_invlist, + PL_InBitmap, + &bitmap_range_not_in_bitmap); + /* Leave just the things that don't fit into the bitmap */ + _invlist_subtract(nonbitmap_invlist, + PL_InBitmap, + &nonbitmap_invlist); + } - /* output any special charclass tests (used entirely under use - * locale) * */ - if (ANYOF_POSIXL_TEST_ANY_SET(o)) { - int i; - for (i = 0; i < ANYOF_POSIXL_MAX; i++) { - if (ANYOF_POSIXL_TEST(o,i)) { - sv_catpv(sv, anyofs[i]); - do_sep = 1; - } - } + /* Obey this flag to add all above-the-bitmap code points */ + if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + nonbitmap_invlist = _add_range_to_invlist(nonbitmap_invlist, + NUM_ANYOF_CODE_POINTS, + UV_MAX); } - if ( ARG(o) != ANYOF_ONLY_HAS_BITMAP - || (flags - & ( ANYOF_MATCHES_ALL_ABOVE_BITMAP - |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP - |ANYOFL_FOLD))) - { - if (do_sep) { - Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); - if (flags & ANYOF_INVERT) - /*make sure the invert info is in each */ + /* Ready to start outputting. First, the initial left bracket */ + Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); + + /* Then all the things that could fit in the bitmap */ + do_sep = put_charclass_bitmap_innards(sv, + ANYOF_BITMAP(o), + bitmap_range_not_in_bitmap, + only_utf8_locale_invlist, + o, + + /* Can't try inverting for a + * better display if there are + * things that haven't been + * resolved */ + unresolved != NULL); + SvREFCNT_dec(bitmap_range_not_in_bitmap); + + /* If there are user-defined properties which haven't been defined yet, + * output them. If the result is not to be inverted, it is clearest to + * output them in a separate [] from the bitmap range stuff. If the + * result is to be complemented, we have to show everything in one [], + * as the inversion applies to the whole thing. Use {braces} to + * separate them from anything in the bitmap and anything above the + * bitmap. */ + if (unresolved) { + if (inverted) { + if (! do_sep) { /* If didn't output anything in the bitmap */ sv_catpvs(sv, "^"); + } + sv_catpvs(sv, "{"); } - - if (OP(o) == ANYOFD - && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)) - { - sv_catpvs(sv, "{non-utf8-latin1-all}"); + else if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); } + sv_catsv(sv, unresolved); + if (inverted) { + sv_catpvs(sv, "}"); + } + do_sep = ! inverted; + } - if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) - sv_catpvs(sv, "{above_bitmap_all}"); - - if (ARG(o) != ANYOF_ONLY_HAS_BITMAP) { - SV *lv; /* Set if there is something outside the bit map. */ - bool byte_output = FALSE; /* If something has been output */ - SV *only_utf8_locale; - - /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist' - * is used to guarantee that nothing in the bitmap gets - * returned */ - (void) _get_regclass_nonbitmap_data(prog, o, FALSE, - &lv, &only_utf8_locale, - bitmap_invlist); - if (lv && lv != &PL_sv_undef) { - char *s = savesvpv(lv); - const char * const orig_s = s; /* Save the beginning of - 's', so can be freed */ - - /* Ignore anything before the first \n */ - while (*s && *s != '\n') - s++; - - /* The data are one range per line. A range is a single - * entity; or two, separated by \t. So can just convert \n - * to space and \t to '-' */ - if (*s == '\n') { - const char * const t = ++s; - - if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) { - if (OP(o) == ANYOFD) { - sv_catpvs(sv, "{utf8}"); - } - else { - sv_catpvs(sv, "{outside bitmap}"); - } - } - - if (byte_output) { - sv_catpvs(sv, " "); - } + /* And, finally, add the above-the-bitmap stuff */ + if (nonbitmap_invlist && _invlist_len(nonbitmap_invlist)) { + SV* contents; - while (*s) { - if (*s == '\n') { + /* See if truncation size is overridden */ + const STRLEN dump_len = (PL_dump_re_max_len) + ? PL_dump_re_max_len + : 256; - /* Truncate very long output */ - if ((UV) (s - t) > 256) { - Perl_sv_catpvf(aTHX_ sv, - "%.*s...", - (int) (s - t), - t); - goto out_dump; - } - *s = ' '; - } - else if (*s == '\t') { - *s = '-'; - } - s++; - } + /* This is output in a separate [] */ + if (do_sep) { + Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); + } - /* Here, it fits in the allocated space. Replace a - * final blank with a NUL */ - if (s[-1] == ' ') - s[-1] = '\0'; + /* And, for easy of understanding, it is shown in the + * uncomplemented form if possible. The one exception being if + * there are unresolved items, where the inversion has to be + * delayed until runtime */ + if (inverted && ! unresolved) { + _invlist_invert(nonbitmap_invlist); + _invlist_subtract(nonbitmap_invlist, PL_InBitmap, &nonbitmap_invlist); + } - sv_catpv(sv, t); - } + contents = invlist_contents(nonbitmap_invlist, + FALSE /* output suitable for catsv */ + ); - out_dump: + /* If the output is shorter than the permissible maximum, just do it. */ + if (SvCUR(contents) <= dump_len) { + sv_catsv(sv, contents); + } + else { + const char * contents_string = SvPVX(contents); + STRLEN i = dump_len; - Safefree(orig_s); - SvREFCNT_dec_NN(lv); + /* Otherwise, start at the permissible max and work back to the + * first break possibility */ + while (i > 0 && contents_string[i] != ' ') { + i--; } - - if ((flags & ANYOFL_FOLD) - && only_utf8_locale - && only_utf8_locale != &PL_sv_undef) - { - UV start, end; - int max_entries = 256; - - sv_catpvs(sv, "{utf8 locale}"); - invlist_iterinit(only_utf8_locale); - while (invlist_iternext(only_utf8_locale, - &start, &end)) { - put_range(sv, start, end, FALSE); - max_entries --; - if (max_entries < 0) { - sv_catpvs(sv, "..."); - break; - } - } - invlist_iterfinish(only_utf8_locale); + if (i == 0) { /* Fail-safe. Use the max if we couldn't + find a legal break */ + i = dump_len; } + + sv_catpvn(sv, contents_string, i); + sv_catpvs(sv, "..."); } - } - SvREFCNT_dec(bitmap_invlist); + SvREFCNT_dec_NN(contents); + SvREFCNT_dec_NN(nonbitmap_invlist); + } + /* And finally the matching, closing ']' */ Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); + + SvREFCNT_dec(unresolved); } else if (k == POSIXD || k == NPOSIXD) { U8 index = FLAGS(o) * 2; @@ -18589,7 +19241,7 @@ Perl_re_intuit_string(pTHX_ REGEXP * const r) ? prog->check_utf8 : prog->check_substr); if (!PL_colorset) reginitcolors(); - PerlIO_printf(Perl_debug_log, + Perl_re_printf( aTHX_ "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n", PL_colors[4], RX_UTF8(r) ? "utf8 " : "", @@ -18650,6 +19302,8 @@ Perl_pregfree2(pTHX_ REGEXP *rx) #endif Safefree(r->offs); SvREFCNT_dec(r->qr_anoncv); + if (r->recurse_locinput) + Safefree(r->recurse_locinput); rx->sv_u.svu_rx = 0; } @@ -18733,6 +19387,8 @@ Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx) #endif ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx); SvREFCNT_inc_void(ret->qr_anoncv); + if (r->recurse_locinput) + Newxz(ret->recurse_locinput,r->nparens + 1,char *); return ret_x; } @@ -18766,7 +19422,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RX_UTF8(rx), dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60); - PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n", + Perl_re_printf( aTHX_ "%sFreeing REx:%s %s\n", PL_colors[4],PL_colors[5],s); } }); @@ -18871,7 +19527,7 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - re_dup - duplicate a regexp. + re_dup_guts - duplicate a regexp. This routine is expected to clone a given regexp structure. It is only compiled under USE_ITHREADS. @@ -18939,6 +19595,8 @@ Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param) RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param); ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param)); + if (r->recurse_locinput) + Newxz(ret->recurse_locinput,r->nparens + 1,char *); if (ret->pprivate) RXi_SET(ret,CALLREGDUPE_PVT(dstr,param)); @@ -18993,6 +19651,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); + reti->num_code_blocks = ri->num_code_blocks; if (ri->code_blocks) { int n; @@ -19053,7 +19712,7 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) d->data[i] = ri->data->data[i]; break; default: - Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", + Perl_croak(aTHX_ "panic: re_dup_guts unknown data code '%c'", ri->data->what[i]); } } @@ -19189,7 +19848,10 @@ S_put_code_point(pTHX_ SV *sv, UV c) } else if (isPRINT(c)) { const char string = (char) c; - if (isBACKSLASHED_PUNCT(c)) + + /* We use {phrase} as metanotation in the class, so also escape literal + * braces */ + if (isBACKSLASHED_PUNCT(c) || c == '{' || c == '}') sv_catpvs(sv, "\\"); sv_catpvn(sv, &string, 1); } @@ -19207,8 +19869,15 @@ STATIC void S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) { /* Appends to 'sv' a displayable version of the range of code points from - * 'start' to 'end'. It assumes that only ASCII printables are displayable - * as-is (though some of these will be escaped by put_code_point()). */ + * 'start' to 'end'. Mnemonics (like '\r') are used for the few controls + * that have them, when they occur at the beginning or end of the range. + * It uses hex to output the remaining code points, unless 'allow_literals' + * is true, in which case the printable ASCII ones are output as-is (though + * some of these will be escaped by put_code_point()). + * + * NOTE: This is designed only for printing ranges of code points that fit + * inside an ANYOF bitmap. Higher code points are simply suppressed + */ const unsigned int min_range_count = 3; @@ -19222,7 +19891,7 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) if (end - start < min_range_count) { - /* Individual chars in short ranges */ + /* Output chars individually when they occur in short ranges */ for (; start <= end; start++) { put_code_point(sv, start); } @@ -19231,11 +19900,11 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) /* If permitted by the input options, and there is a possibility that * this range contains a printable literal, look to see if there is - * one. */ + * one. */ if (allow_literals && start <= MAX_PRINT_A) { - /* If the range begin isn't an ASCII printable, effectively split - * the range into two parts: + /* If the character at the beginning of the range isn't an ASCII + * printable, effectively split the range into two parts: * 1) the portion before the first such printable, * 2) the rest * and output them separately. */ @@ -19257,18 +19926,18 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) temp_end = end + 1; } - /* Output the first part of the split range, the part that - * doesn't have printables, with no looking for literals - * (otherwise we would infinitely recurse) */ + /* Output the first part of the split range: the part that + * doesn't have printables, with the parameter set to not look + * for literals (otherwise we would infinitely recurse) */ put_range(sv, start, temp_end - 1, FALSE); /* The 2nd part of the range (if any) starts here. */ start = temp_end; - /* We continue instead of dropping down because even if the 2nd - * part is non-empty, it could be so short that we want to - * output it specially, as tested for at the top of this loop. - * */ + /* We do a continue, instead of dropping down, because even if + * the 2nd part is non-empty, it could be so short that we want + * to output it as individual characters, as tested for at the + * top of this loop. */ continue; } @@ -19320,29 +19989,36 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) * mnemonic names. Split off any of those at the beginning and end of * the range to print mnemonically. It isn't possible for many of * these to be in a row, so this won't overwhelm with output */ - while (isMNEMONIC_CNTRL(start) && start <= end) { - put_code_point(sv, start); - start++; - } - if (start < end && isMNEMONIC_CNTRL(end)) { - - /* Here, the final character in the range has a mnemonic name. - * Work backwards from the end to find the final non-mnemonic */ - UV temp_end = end - 1; - while (isMNEMONIC_CNTRL(temp_end)) { - temp_end--; + if ( start <= end + && (isMNEMONIC_CNTRL(start) || isMNEMONIC_CNTRL(end))) + { + while (isMNEMONIC_CNTRL(start) && start <= end) { + put_code_point(sv, start); + start++; } - /* And separately output the range that doesn't have mnemonics */ - put_range(sv, start, temp_end, FALSE); + /* If this didn't take care of the whole range ... */ + if (start <= end) { - /* Then output the mnemonic trailing controls */ - start = temp_end + 1; - while (start <= end) { - put_code_point(sv, start); - start++; + /* Look backwards from the end to find the final non-mnemonic + * */ + UV temp_end = end; + while (isMNEMONIC_CNTRL(temp_end)) { + temp_end--; + } + + /* And separately output the interior range that doesn't start + * or end with mnemonics */ + put_range(sv, start, temp_end, FALSE); + + /* Then output the mnemonic trailing controls */ + start = temp_end + 1; + while (start <= end) { + put_code_point(sv, start); + start++; + } + break; } - break; } /* As a final resort, output the range or subrange as hex. */ @@ -19364,55 +20040,26 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) } } -STATIC bool -S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) +STATIC void +S_put_charclass_bitmap_innards_invlist(pTHX_ SV *sv, SV* invlist) { - /* Appends to 'sv' a displayable version of the innards of the bracketed - * character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually - * output anything, and bitmap_invlist, if not NULL, will point to an - * inversion list of what is in the bit map */ + /* Concatenate onto the PV in 'sv' a displayable form of the inversion list + * 'invlist' */ - int i; UV start, end; - unsigned int punct_count = 0; - SV* invlist; bool allow_literals = TRUE; - bool inverted_for_output = FALSE; - PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; - - /* Worst case is exactly every-other code point is in the list */ - invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); - - /* Convert the bit map to an inversion list, keeping track of how many - * ASCII puncts are set, including an extra amount for the backslashed - * ones. */ - for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { - if (BITMAP_TEST(bitmap, i)) { - invlist = add_cp_to_invlist(invlist, i); - if (isPUNCT_A(i)) { - punct_count++; - if isBACKSLASHED_PUNCT(i) { - punct_count++; - } - } - } - } - - /* Nothing to output */ - if (_invlist_len(invlist) == 0) { - SvREFCNT_dec_NN(invlist); - return FALSE; - } + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_INVLIST; /* Generally, it is more readable if printable characters are output as * literals, but if a range (nearly) spans all of them, it's best to output * it as a single range. This code will use a single range if all but 2 - * printables are in it */ + * ASCII printables are in it */ invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { - /* If range starts beyond final printable, it doesn't have any in it */ + /* If the range starts beyond the final printable, it doesn't have any + * in it */ if (start > MAX_PRINT_A) { break; } @@ -19435,24 +20082,6 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) } invlist_iterfinish(invlist); - /* The legibility of the output depends mostly on how many punctuation - * characters are output. There are 32 possible ASCII ones, and some have - * an additional backslash, bringing it to currently 36, so if any more - * than 18 are to be output, we can instead output it as its complement, - * yielding fewer puncts, and making it more legible. But give some weight - * to the fact that outputting it as a complement is less legible than a - * straight output, so don't complement unless we are somewhat over the 18 - * mark */ - if (allow_literals && punct_count > 22) { - sv_catpvs(sv, "^"); - - /* Add everything remaining to the list, so when we invert it just - * below, it will be excluded */ - _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist); - _invlist_invert(invlist); - inverted_for_output = TRUE; - } - /* Here we have figured things out. Output each range */ invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { @@ -19463,29 +20092,349 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist) } invlist_iterfinish(invlist); - if (bitmap_invlist) { + return; +} - /* Here, wants the inversion list returned. If we inverted it, we have - * to restore it to the original */ - if (inverted_for_output) { - _invlist_invert(invlist); - _invlist_intersection(invlist, PL_InBitmap, &invlist); - } +STATIC SV* +S_put_charclass_bitmap_innards_common(pTHX_ + SV* invlist, /* The bitmap */ + SV* posixes, /* Under /l, things like [:word:], \S */ + SV* only_utf8, /* Under /d, matches iff the target is UTF-8 */ + SV* not_utf8, /* /d, matches iff the target isn't UTF-8 */ + SV* only_utf8_locale, /* Under /l, matches if the locale is UTF-8 */ + const bool invert /* Is the result to be inverted? */ +) +{ + /* Create and return an SV containing a displayable version of the bitmap + * and associated information determined by the input parameters. If the + * output would have been only the inversion indicator '^', NULL is instead + * returned. */ + + SV * output; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS_COMMON; - *bitmap_invlist = invlist; + if (invert) { + output = newSVpvs("^"); } else { - SvREFCNT_dec_NN(invlist); + output = newSVpvs(""); } - return TRUE; + /* First, the code points in the bitmap that are unconditionally there */ + put_charclass_bitmap_innards_invlist(output, invlist); + + /* Traditionally, these have been placed after the main code points */ + if (posixes) { + sv_catsv(output, posixes); + } + + if (only_utf8 && _invlist_len(only_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8); + } + + if (not_utf8 && _invlist_len(not_utf8)) { + Perl_sv_catpvf(aTHX_ output, "%s{not utf8}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, not_utf8); + } + + if (only_utf8_locale && _invlist_len(only_utf8_locale)) { + Perl_sv_catpvf(aTHX_ output, "%s{utf8 locale}%s", PL_colors[1], PL_colors[0]); + put_charclass_bitmap_innards_invlist(output, only_utf8_locale); + + /* This is the only list in this routine that can legally contain code + * points outside the bitmap range. The call just above to + * 'put_charclass_bitmap_innards_invlist' will simply suppress them, so + * output them here. There's about a half-dozen possible, and none in + * contiguous ranges longer than 2 */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + UV start, end; + SV* above_bitmap = NULL; + + _invlist_subtract(only_utf8_locale, PL_InBitmap, &above_bitmap); + + invlist_iterinit(above_bitmap); + while (invlist_iternext(above_bitmap, &start, &end)) { + UV i; + + for (i = start; i <= end; i++) { + put_code_point(output, i); + } + } + invlist_iterfinish(above_bitmap); + SvREFCNT_dec_NN(above_bitmap); + } + } + + if (invert && SvCUR(output) == 1) { + return NULL; + } + + return output; +} + +STATIC bool +S_put_charclass_bitmap_innards(pTHX_ SV *sv, + char *bitmap, + SV *nonbitmap_invlist, + SV *only_utf8_locale_invlist, + const regnode * const node, + const bool force_as_is_display) +{ + /* Appends to 'sv' a displayable version of the innards of the bracketed + * character class defined by the other arguments: + * 'bitmap' points to the bitmap. + * 'nonbitmap_invlist' is an inversion list of the code points that are in + * the bitmap range, but for some reason aren't in the bitmap; NULL if + * none. The reasons for this could be that they require some + * condition such as the target string being or not being in UTF-8 + * (under /d), or because they came from a user-defined property that + * was not resolved at the time of the regex compilation (under /u) + * 'only_utf8_locale_invlist' is an inversion list of the code points that + * are valid only if the runtime locale is a UTF-8 one; NULL if none + * 'node' is the regex pattern node. It is needed only when the above two + * parameters are not null, and is passed so that this routine can + * tease apart the various reasons for them. + * 'force_as_is_display' is TRUE if this routine should definitely NOT try + * to invert things to see if that leads to a cleaner display. If + * FALSE, this routine is free to use its judgment about doing this. + * + * It returns TRUE if there was actually something output. (It may be that + * the bitmap, etc is empty.) + * + * When called for outputting the bitmap of a non-ANYOF node, just pass the + * bitmap, with the succeeding parameters set to NULL, and the final one to + * FALSE. + */ + + /* In general, it tries to display the 'cleanest' representation of the + * innards, choosing whether to display them inverted or not, regardless of + * whether the class itself is to be inverted. However, there are some + * cases where it can't try inverting, as what actually matches isn't known + * until runtime, and hence the inversion isn't either. */ + bool inverting_allowed = ! force_as_is_display; + + int i; + STRLEN orig_sv_cur = SvCUR(sv); + + SV* invlist; /* Inversion list we accumulate of code points that + are unconditionally matched */ + SV* only_utf8 = NULL; /* Under /d, list of matches iff the target is + UTF-8 */ + SV* not_utf8 = NULL; /* /d, list of matches iff the target isn't UTF-8 + */ + SV* posixes = NULL; /* Under /l, string of things like [:word:], \D */ + SV* only_utf8_locale = NULL; /* Under /l, list of matches if the locale + is UTF-8 */ + + SV* as_is_display; /* The output string when we take the inputs + literally */ + SV* inverted_display; /* The output string when we invert the inputs */ + + U8 flags = (node) ? ANYOF_FLAGS(node) : 0; + + bool invert = cBOOL(flags & ANYOF_INVERT); /* Is the input to be inverted + to match? */ + /* We are biased in favor of displaying things without them being inverted, + * as that is generally easier to understand */ + const int bias = 5; + + PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS; + + /* Start off with whatever code points are passed in. (We clone, so we + * don't change the caller's list) */ + if (nonbitmap_invlist) { + assert(invlist_highest(nonbitmap_invlist) < NUM_ANYOF_CODE_POINTS); + invlist = invlist_clone(nonbitmap_invlist); + } + else { /* Worst case size is every other code point is matched */ + invlist = _new_invlist(NUM_ANYOF_CODE_POINTS / 2); + } + + if (flags) { + if (OP(node) == ANYOFD) { + + /* This flag indicates that the code points below 0x100 in the + * nonbitmap list are precisely the ones that match only when the + * target is UTF-8 (they should all be non-ASCII). */ + if (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) + { + _invlist_intersection(invlist, PL_UpperLatin1, &only_utf8); + _invlist_subtract(invlist, only_utf8, &invlist); + } + + /* And this flag for matching all non-ASCII 0xFF and below */ + if (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER) + { + not_utf8 = invlist_clone(PL_UpperLatin1); + } + } + else if (OP(node) == ANYOFL) { + + /* If either of these flags are set, what matches isn't + * determinable except during execution, so don't know enough here + * to invert */ + if (flags & (ANYOFL_FOLD|ANYOF_MATCHES_POSIXL)) { + inverting_allowed = FALSE; + } + + /* What the posix classes match also varies at runtime, so these + * will be output symbolically. */ + if (ANYOF_POSIXL_TEST_ANY_SET(node)) { + int i; + + posixes = newSVpvs(""); + for (i = 0; i < ANYOF_POSIXL_MAX; i++) { + if (ANYOF_POSIXL_TEST(node,i)) { + sv_catpv(posixes, anyofs[i]); + } + } + } + } + } + + /* Accumulate the bit map into the unconditional match list */ + for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) { + if (BITMAP_TEST(bitmap, i)) { + int start = i++; + for (; i < NUM_ANYOF_CODE_POINTS && BITMAP_TEST(bitmap, i); i++) { + /* empty */ + } + invlist = _add_range_to_invlist(invlist, start, i-1); + } + } + + /* Make sure that the conditional match lists don't have anything in them + * that match unconditionally; otherwise the output is quite confusing. + * This could happen if the code that populates these misses some + * duplication. */ + if (only_utf8) { + _invlist_subtract(only_utf8, invlist, &only_utf8); + } + if (not_utf8) { + _invlist_subtract(not_utf8, invlist, ¬_utf8); + } + + if (only_utf8_locale_invlist) { + + /* Since this list is passed in, we have to make a copy before + * modifying it */ + only_utf8_locale = invlist_clone(only_utf8_locale_invlist); + + _invlist_subtract(only_utf8_locale, invlist, &only_utf8_locale); + + /* And, it can get really weird for us to try outputting an inverted + * form of this list when it has things above the bitmap, so don't even + * try */ + if (invlist_highest(only_utf8_locale) >= NUM_ANYOF_CODE_POINTS) { + inverting_allowed = FALSE; + } + } + + /* Calculate what the output would be if we take the input as-is */ + as_is_display = put_charclass_bitmap_innards_common(invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, + invert); + + /* If have to take the output as-is, just do that */ + if (! inverting_allowed) { + if (as_is_display) { + sv_catsv(sv, as_is_display); + SvREFCNT_dec_NN(as_is_display); + } + } + else { /* But otherwise, create the output again on the inverted input, and + use whichever version is shorter */ + + int inverted_bias, as_is_bias; + + /* We will apply our bias to whichever of the the results doesn't have + * the '^' */ + if (invert) { + invert = FALSE; + as_is_bias = bias; + inverted_bias = 0; + } + else { + invert = TRUE; + as_is_bias = 0; + inverted_bias = bias; + } + + /* Now invert each of the lists that contribute to the output, + * excluding from the result things outside the possible range */ + + /* For the unconditional inversion list, we have to add in all the + * conditional code points, so that when inverted, they will be gone + * from it */ + _invlist_union(only_utf8, invlist, &invlist); + _invlist_union(not_utf8, invlist, &invlist); + _invlist_union(only_utf8_locale, invlist, &invlist); + _invlist_invert(invlist); + _invlist_intersection(invlist, PL_InBitmap, &invlist); + + if (only_utf8) { + _invlist_invert(only_utf8); + _invlist_intersection(only_utf8, PL_UpperLatin1, &only_utf8); + } + else if (not_utf8) { + + /* If a code point matches iff the target string is not in UTF-8, + * then complementing the result has it not match iff not in UTF-8, + * which is the same thing as matching iff it is UTF-8. */ + only_utf8 = not_utf8; + not_utf8 = NULL; + } + + if (only_utf8_locale) { + _invlist_invert(only_utf8_locale); + _invlist_intersection(only_utf8_locale, + PL_InBitmap, + &only_utf8_locale); + } + + inverted_display = put_charclass_bitmap_innards_common( + invlist, + posixes, + only_utf8, + not_utf8, + only_utf8_locale, invert); + + /* Use the shortest representation, taking into account our bias + * against showing it inverted */ + if ( inverted_display + && ( ! as_is_display + || ( SvCUR(inverted_display) + inverted_bias + < SvCUR(as_is_display) + as_is_bias))) + { + sv_catsv(sv, inverted_display); + } + else if (as_is_display) { + sv_catsv(sv, as_is_display); + } + + SvREFCNT_dec(as_is_display); + SvREFCNT_dec(inverted_display); + } + + SvREFCNT_dec_NN(invlist); + SvREFCNT_dec(only_utf8); + SvREFCNT_dec(not_utf8); + SvREFCNT_dec(posixes); + SvREFCNT_dec(only_utf8_locale); + + return SvCUR(sv) > orig_sv_cur; } -#define CLEAR_OPTSTART \ +#define CLEAR_OPTSTART \ if (optstart) STMT_START { \ - DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, \ + DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ - optstart=NULL; \ + optstart=NULL; \ } STMT_END #define DUMPUNTIL(b,e) \ @@ -19507,7 +20456,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, PERL_ARGS_ASSERT_DUMPUNTIL; #ifdef DEBUG_DUMPUNTIL - PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start, + Perl_re_printf( aTHX_ "--- %d : %d - %d - %d\n",indent,node-start, last ? last-start : 0,plast ? plast-start : 0); #endif @@ -19533,18 +20482,18 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, CLEAR_OPTSTART; regprop(r, sv, node, NULL, NULL); - PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start), + Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { if (next == NULL) /* Next ptr. */ - PerlIO_printf(Perl_debug_log, " (0)"); + Perl_re_printf( aTHX_ " (0)"); else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH ) - PerlIO_printf(Perl_debug_log, " (FAIL)"); + Perl_re_printf( aTHX_ " (FAIL)"); else - PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start)); - (void)PerlIO_putc(Perl_debug_log, '\n'); + Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start)); + Perl_re_printf( aTHX_ "\n"); } after_print: @@ -19582,8 +20531,8 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); - PerlIO_printf(Perl_debug_log, "%*s%s ", - (int)(2*(indent+3)), "", + Perl_re_indentf( aTHX_ "%s ", + indent+3, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, @@ -19598,7 +20547,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; - PerlIO_printf(Perl_debug_log, "(%"UVuf")\n", + Perl_re_printf( aTHX_ "(%"UVuf")\n", (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch) @@ -19608,7 +20557,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH) nextbranch= regnext((regnode *)nextbranch); } else { - PerlIO_printf(Perl_debug_log, "\n"); + Perl_re_printf( aTHX_ "\n"); } } if (last && next > last) @@ -19648,7 +20597,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, } CLEAR_OPTSTART; #ifdef DEBUG_DUMPUNTIL - PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent); + Perl_re_printf( aTHX_ "--- %d\n", (int)indent); #endif return node; }