X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d1e2fe7b8496f259e898cf2bbdcd94103adf12c7..d43328d502ac91c4d98e218d0721cd5f3bcd3950:/regcomp.c?ds=sidebyside diff --git a/regcomp.c b/regcomp.c index da01f05..ec7fa3b 100644 --- a/regcomp.c +++ b/regcomp.c @@ -101,14 +101,6 @@ EXTERN_C const struct regexp_engine my_reg_engine; #define STATIC static #endif -#ifndef MIN -#define MIN(a,b) ((a) < (b) ? (a) : (b)) -#endif - -#ifndef MAX -#define MAX(a,b) ((a) > (b) ? (a) : (b)) -#endif - /* this is a chain of data about sub patterns we are processing that need to be handled separately/specially in study_chunk. Its so we can simulate recursion without losing state. */ @@ -185,15 +177,13 @@ struct RExC_state_t { U32 study_chunk_recursed_bytes; /* bytes in bitmap */ I32 in_lookbehind; I32 contains_locale; - I32 contains_i; I32 override_recoding; #ifdef EBCDIC I32 recode_x_to_native; #endif I32 in_multi_char_class; - struct reg_code_block *code_blocks; /* positions of literal (?{}) + struct reg_code_blocks *code_blocks;/* positions of literal (?{}) within pattern */ - int num_code_blocks; /* size of code_blocks[] */ int code_index; /* next code_blocks[] slot */ SSize_t maxlen; /* mininum possible number of chars in string to match */ scan_frame *frame_head; @@ -280,8 +270,6 @@ struct RExC_state_t { (pRExC_state->study_chunk_recursed_bytes) #define RExC_in_lookbehind (pRExC_state->in_lookbehind) #define RExC_contains_locale (pRExC_state->contains_locale) -#define RExC_contains_i (pRExC_state->contains_i) -#define RExC_override_recoding (pRExC_state->override_recoding) #ifdef EBCDIC # define RExC_recode_x_to_native (pRExC_state->recode_x_to_native) #endif @@ -564,7 +552,6 @@ static const scan_data_t zero_scan_data = #define OOB_UNICODE 0xDEADBEEF #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) -#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) /* length of regex to show in messages that don't mark a position within */ @@ -579,7 +566,7 @@ static const scan_data_t zero_scan_data = #define MARKER2 " <-- HERE " /* marker as it appears within the regex */ #define REPORT_LOCATION " in regex; marked by " MARKER1 \ - " in m/%"UTF8f MARKER2 "%"UTF8f"/" + " in m/%" UTF8f MARKER2 "%" UTF8f "/" /* The code in this file in places uses one level of recursion with parsing * rebased to an alternate string constructed by us in memory. This can take @@ -667,11 +654,11 @@ static const scan_data_t zero_scan_data = } STMT_END #define FAIL(msg) _FAIL( \ - Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \ + Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \ msg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) #define FAIL2(msg,arg) _FAIL( \ - Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \ + Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \ arg, UTF8fARG(UTF, len, RExC_precomp), ellipses)) /* @@ -1008,24 +995,25 @@ Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...) #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ - Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \ - " Flags: 0x%"UVXf, \ + Perl_re_indentf( aTHX_ "" str "Pos:%" IVdf "/%" IVdf \ + " Flags: 0x%" UVXf, \ depth, \ (IV)((data)->pos_min), \ (IV)((data)->pos_delta), \ (UV)((data)->flags) \ ); \ DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \ - Perl_re_printf( aTHX_ \ - " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \ + Perl_re_printf( aTHX_ \ + " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \ (IV)((data)->whilem_c), \ (IV)((data)->last_closep ? *((data)->last_closep) : -1), \ is_inf ? "INF " : "" \ ); \ if ((data)->last_found) \ - Perl_re_printf( aTHX_ \ - "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \ - " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \ + Perl_re_printf( aTHX_ \ + "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \ + " %sFixed:'%s' @ %" IVdf \ + " %sFloat: '%s' @ %" IVdf "/%" IVdf, \ SvPVX_const((data)->last_found), \ (IV)((data)->last_end), \ (IV)((data)->last_start_min), \ @@ -1283,8 +1271,8 @@ S_ssc_anything(pTHX_ regnode_ssc *ssc) assert(is_ANYOF_SYNTHETIC(ssc)); - ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */ - _append_range_to_invlist(ssc->invlist, 0, UV_MAX); + /* mortalize so won't leak */ + ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX)); ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */ } @@ -2020,7 +2008,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, for( state = 1 ; state < trie->statecount ; state++ ) { const U32 base = trie->states[ state ].trans.base; - Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state); + Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state); if ( trie->states[ state ].wordnum ) { Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum ); @@ -2028,7 +2016,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, Perl_re_printf( aTHX_ "%6s", "" ); } - Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base ); + Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base ); if ( base ) { U32 ofs = 0; @@ -2039,7 +2027,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, != state)) ofs++; - Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs); + Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs); for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) { if ( ( base + ofs >= trie->uniquecharcount ) @@ -2048,7 +2036,7 @@ S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap, && trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) { - Perl_re_printf( aTHX_ "%*"UVXf, colwidth, + Perl_re_printf( aTHX_ "%*" UVXf, colwidth, (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next ); } else { @@ -2097,7 +2085,7 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, for( state=1 ; state < next_alloc ; state ++ ) { U16 charid; - Perl_re_indentf( aTHX_ " %4"UVXf" :", + Perl_re_indentf( aTHX_ " %4" UVXf " :", depth+1, (UV)state ); if ( ! trie->states[ state ].wordnum ) { Perl_re_printf( aTHX_ "%5s| ",""); @@ -2110,7 +2098,7 @@ S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie, SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0); if ( tmp ) { - Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ", + Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ", colwidth, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth, @@ -2181,22 +2169,22 @@ S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie, for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) { - Perl_re_indentf( aTHX_ "%4"UVXf" : ", + Perl_re_indentf( aTHX_ "%4" UVXf " : ", depth+1, (UV)TRIE_NODENUM( state ) ); for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) { UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next ); if (v) - Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v ); + Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v ); else Perl_re_printf( aTHX_ "%*s", colwidth, "." ); } if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) { - Perl_re_printf( aTHX_ " (%4"UVXf")\n", + Perl_re_printf( aTHX_ " (%4" UVXf ")\n", (UV)trie->trans[ state ].check ); } else { - Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n", + Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n", (UV)trie->trans[ state ].check, trie->states[ TRIE_NODENUM( state ) ].wordnum ); } @@ -2434,6 +2422,21 @@ is the recommended Unicode-aware way of saying : ( state==1 ? special : 0 ) \ ) +#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \ +STMT_START { \ + TRIE_BITMAP_SET(trie, uvc); \ + /* store the folded codepoint */ \ + if ( folder ) \ + TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \ + \ + if ( !UTF ) { \ + /* store first byte of utf8 representation of */ \ + /* variant codepoints */ \ + if (! UVCHR_IS_INVARIANT(uvc)) { \ + TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \ + } \ + } \ +} STMT_END #define MADE_TRIE 1 #define MADE_JUMP_TRIE 2 #define MADE_EXACT_TRIE 4 @@ -2559,12 +2562,23 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, bitmap?*/ if (OP(noper) == NOTHING) { + /* skip past a NOTHING at the start of an alternation + * eg, /(?:)a|(?:b)/ should be the same as /a|b/ + */ regnode *noper_next= regnext(noper); if (noper_next < tail) noper= noper_next; } - if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) { + if ( noper < tail && + ( + OP(noper) == flags || + ( + flags == EXACTFU && + OP(noper) == EXACTFU_SS + ) + ) + ) { uc= (U8*)STRING(noper); e= uc + STR_LEN(noper); } else { @@ -2581,6 +2595,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S); } } + for ( ; uc < e ; uc += len ) { /* Look at each char in the current branch */ TRIE_CHARCOUNT(trie)++; @@ -2664,18 +2679,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, if ( set_bit ) { /* store the codepoint in the bitmap, and its folded * equivalent. */ - TRIE_BITMAP_SET(trie, uvc); - - /* store the folded codepoint */ - if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); - - if ( !UTF ) { - /* store first byte of utf8 representation of - variant codepoints */ - if (! UVCHR_IS_INVARIANT(uvc)) { - TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); - } - } + TRIE_BITMAP_SET_FOLDED(trie, uvc, folder); set_bit = 0; /* We've done our bit :-) */ } } else { @@ -2694,7 +2698,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 ); if ( !svpp ) - Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc ); + Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc ); if ( !SvTRUE( *svpp ) ) { sv_setiv( *svpp, ++trie->uniquecharcount ); @@ -2838,7 +2842,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } state = newstate; } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); } } } @@ -3035,7 +3039,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } state = trie->trans[ state + charid ].next; } else { - Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc ); + Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc ); } /* charid is now 0 if we dont know the char read, or * nonzero if we do */ @@ -3168,7 +3172,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, PerlMemShared_realloc( trie->states, laststate * sizeof(reg_trie_state) ); DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n", + Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n", depth+1, (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ), @@ -3180,7 +3184,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, } /* end table compress */ } DEBUG_TRIE_COMPILE_MORE_r( - Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n", + Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n", depth+1, (UV)trie->statecount, (UV)trie->lasttrans) @@ -3231,7 +3235,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, }); } DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n", + Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n", depth+1, (UV)mjd_offset, (UV)mjd_nodelen) ); @@ -3240,13 +3244,19 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; if ( trie->bitmap && !widecharmap && !trie->jump ) { + /* we want to find the first state that has more than + * one transition, if that state is not the first state + * then we have a common prefix which we can remove. + */ U32 state; for ( state = 1 ; state < trie->statecount-1 ; state++ ) { U32 ofs = 0; - I32 idx = -1; + I32 first_ofs = -1; /* keeps track of the ofs of the first + transition, -1 means none */ U32 count = 0; const U32 base = trie->states[ state ].trans.base; + /* does this state terminate an alternation? */ if ( trie->states[state].wordnum ) count = 1; @@ -3256,44 +3266,54 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, trie->trans[ base + ofs - trie->uniquecharcount ].check == state ) { if ( ++count > 1 ) { - SV **tmp = av_fetch( revcharmap, ofs, 0); - const U8 *ch = (U8*)SvPV_nolen_const( *tmp ); + /* we have more than one transition */ + SV **tmp; + U8 *ch; + /* if this is the first state there is no common prefix + * to extract, so we can exit */ if ( state == 1 ) break; + tmp = av_fetch( revcharmap, ofs, 0); + ch = (U8*)SvPV_nolen_const( *tmp ); + + /* if we are on count 2 then we need to initialize the + * bitmap, and store the previous char if there was one + * in it*/ if ( count == 2 ) { + /* clear the bitmap */ Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char); DEBUG_OPTIMISE_r( - Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [", + Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [", depth+1, (UV)state)); - if (idx >= 0) { - SV ** const tmp = av_fetch( revcharmap, idx, 0); + if (first_ofs >= 0) { + SV ** const tmp = av_fetch( revcharmap, first_ofs, 0); const U8 * const ch = (U8*)SvPV_nolen_const( *tmp ); - TRIE_BITMAP_SET(trie,*ch); - if ( folder ) - TRIE_BITMAP_SET(trie, folder[ *ch ]); + TRIE_BITMAP_SET_FOLDED(trie,*ch,folder); DEBUG_OPTIMISE_r( Perl_re_printf( aTHX_ "%s", (char*)ch) ); } } - TRIE_BITMAP_SET(trie,*ch); - if ( folder ) - TRIE_BITMAP_SET(trie,folder[ *ch ]); + /* store the current firstchar in the bitmap */ + TRIE_BITMAP_SET_FOLDED(trie,*ch,folder); DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch)); } - idx = ofs; + first_ofs = ofs; } } if ( count == 1 ) { - SV **tmp = av_fetch( revcharmap, idx, 0); + /* This state has only one transition, its transition is part + * of a common prefix - we need to concatenate the char it + * represents to what we have so far. */ + SV **tmp = av_fetch( revcharmap, first_ofs, 0); STRLEN len; char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); - Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n", + Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n", depth+1, - (UV)state, (UV)idx, + (UV)state, (UV)first_ofs, pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) | @@ -3584,11 +3604,11 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour */ fail[ 0 ] = fail[ 1 ] = 0; DEBUG_TRIE_COMPILE_r({ - Perl_re_indentf( aTHX_ "Stclass Failtable (%"UVuf" states): 0", + Perl_re_indentf( aTHX_ "Stclass Failtable (%" UVuf " states): 0", depth, (UV)numstates ); for( q_read=1; q_readwhilem_c < 16) { + && data) { /* This stays as CURLYX, we can put the count/of pair. */ /* Find WHILEM (as in regexec.c) */ regnode *nxt = oscan + NEXT_OFF(oscan); if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ nxt += ARG(nxt); - PREVOPER(nxt)->flags = (U8)(data->whilem_c - | (RExC_whilem_seen << 4)); /* On WHILEM */ + nxt = PREVOPER(nxt); + if (nxt->flags & 0xf) { + /* we've already set whilem count on this node */ + } else if (++data->whilem_c < 16) { + assert(data->whilem_c <= RExC_whilem_seen); + nxt->flags = (U8)(data->whilem_c + | (RExC_whilem_seen << 4)); /* On WHILEM */ + } } if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -5260,13 +5286,13 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, /* It is counted once already... */ data->pos_min += minnext * (mincount - counted); #if 0 -Perl_re_printf( aTHX_ "counted=%"UVuf" deltanext=%"UVuf - " SSize_t_MAX=%"UVuf" minnext=%"UVuf - " maxcount=%"UVuf" mincount=%"UVuf"\n", +Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf + " SSize_t_MAX=%" UVuf " minnext=%" UVuf + " maxcount=%" UVuf " mincount=%" UVuf "\n", (UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount, (UV)mincount); if (deltanext != SSize_t_MAX) -Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", +Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n", (UV)(-counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta)); #endif @@ -5571,7 +5597,7 @@ Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", FAIL("Variable length lookbehind not implemented"); } else if (minnext > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", + FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } scan->flags = (U8)minnext; @@ -5660,7 +5686,7 @@ Perl_re_printf( aTHX_ "LHS=%"UVuf" RHS=%"UVuf"\n", FAIL("Variable length lookbehind not implemented"); } else if (*minnextp > (I32)U8_MAX) { - FAIL2("Lookbehind longer than %"UVuf" not implemented", + FAIL2("Lookbehind longer than %" UVuf " not implemented", (UV)U8_MAX); } scan->flags = (U8)*minnextp; @@ -6074,7 +6100,7 @@ Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags) /* Dispatch a request to compile a regexp to correct regexp engine. */ DEBUG_COMPILE_r({ - Perl_re_printf( aTHX_ "Using engine %"UVxf"\n", + Perl_re_printf( aTHX_ "Using engine %" UVxf "\n", PTR2UV(eng)); }); return CALLREGCOMP_ENG(eng, pattern, flags); @@ -6101,6 +6127,39 @@ Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags) } +static void +S_free_codeblocks(pTHX_ struct reg_code_blocks *cbs) +{ + int n; + + if (--cbs->refcnt > 0) + return; + for (n = 0; n < cbs->count; n++) { + REGEXP *rx = cbs->cb[n].src_regex; + cbs->cb[n].src_regex = NULL; + SvREFCNT_dec(rx); + } + Safefree(cbs->cb); + Safefree(cbs); +} + + +static struct reg_code_blocks * +S_alloc_code_blocks(pTHX_ int ncode) +{ + struct reg_code_blocks *cbs; + Newx(cbs, 1, struct reg_code_blocks); + cbs->count = ncode; + cbs->refcnt = 1; + SAVEDESTRUCTOR_X(S_free_codeblocks, cbs); + if (ncode) + Newx(cbs->cb, ncode, struct reg_code_block); + else + cbs->cb = NULL; + return cbs; +} + + /* upgrade pattern pat_p of length plen_p to UTF8, and if there are code * blocks, recalculate the indices. Update pat_p and plen_p in-place to * point to the realloced string and length. @@ -6127,14 +6186,16 @@ S_pat_upgrade_to_utf8(pTHX_ RExC_state_t * const pRExC_state, while (s < *plen_p) { append_utf8_from_native_byte(src[s], &d); + if (n < num_code_blocks) { - if (!do_end && pRExC_state->code_blocks[n].start == s) { - pRExC_state->code_blocks[n].start = d - dst - 1; + assert(pRExC_state->code_blocks); + if (!do_end && pRExC_state->code_blocks->cb[n].start == s) { + pRExC_state->code_blocks->cb[n].start = d - dst - 1; assert(*(d - 1) == '('); do_end = 1; } - else if (do_end && pRExC_state->code_blocks[n].end == s) { - pRExC_state->code_blocks[n].end = d - dst - 1; + else if (do_end && pRExC_state->code_blocks->cb[n].end == s) { + pRExC_state->code_blocks->cb[n].end = d - dst - 1; assert(*(d - 1) == ')'); do_end = 0; n++; @@ -6254,10 +6315,10 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, if (oplist->op_type == OP_NULL && (oplist->op_flags & OPf_SPECIAL)) { - assert(n < pRExC_state->num_code_blocks); - pRExC_state->code_blocks[n].start = pat ? SvCUR(pat) : 0; - pRExC_state->code_blocks[n].block = oplist; - pRExC_state->code_blocks[n].src_regex = NULL; + assert(n < pRExC_state->code_blocks->count); + pRExC_state->code_blocks->cb[n].start = pat ? SvCUR(pat) : 0; + pRExC_state->code_blocks->cb[n].block = oplist; + pRExC_state->code_blocks->cb[n].src_regex = NULL; n++; code = 1; oplist = OpSIBLING(oplist); /* skip CONST */ @@ -6287,7 +6348,8 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, sv_setsv(pat, sv); /* overloading involved: all bets are off over literal * code. Pretend we haven't seen it */ - pRExC_state->num_code_blocks -= n; + if (n) + pRExC_state->code_blocks->count -= n; n = 0; } else { @@ -6321,11 +6383,23 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, sv_catsv_nomg(pat, msv); rx = msv; } - else - pat = msv; + else { + /* We have only one SV to process, but we need to verify + * it is properly null terminated or we will fail asserts + * later. In theory we probably shouldn't get such SV's, + * but if we do we should handle it gracefully. */ + if ( SvTYPE(msv) != SVt_PV || (SvLEN(msv) > SvCUR(msv) && *(SvEND(msv)) == 0) ) { + /* not a string, or a string with a trailing null */ + pat = msv; + } else { + /* a string with no trailing null, we need to copy it + * so it we have a trailing null */ + pat = newSVsv(msv); + } + } if (code) - pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1; + pRExC_state->code_blocks->cb[n-1].end = SvCUR(pat)-1; } /* extract any code blocks within any embedded qr//'s */ @@ -6334,25 +6408,30 @@ S_concat_pat(pTHX_ RExC_state_t * const pRExC_state, { RXi_GET_DECL(ReANY((REGEXP *)rx), ri); - if (ri->num_code_blocks) { + if (ri->code_blocks && ri->code_blocks->count) { int i; /* the presence of an embedded qr// with code means * we should always recompile: the text of the * qr// may not have changed, but it may be a * different closure than last time */ *recompile_p = 1; - Renew(pRExC_state->code_blocks, - pRExC_state->num_code_blocks + ri->num_code_blocks, - struct reg_code_block); - pRExC_state->num_code_blocks += ri->num_code_blocks; + if (pRExC_state->code_blocks) { + pRExC_state->code_blocks->count += ri->code_blocks->count; + Renew(pRExC_state->code_blocks->cb, + pRExC_state->code_blocks->count, + struct reg_code_block); + } + else + pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ + ri->code_blocks->count); - for (i=0; i < ri->num_code_blocks; i++) { + for (i=0; i < ri->code_blocks->count; i++) { struct reg_code_block *src, *dst; STRLEN offset = orig_patlen + ReANY((REGEXP *)rx)->pre_prefix; - assert(n < pRExC_state->num_code_blocks); - src = &ri->code_blocks[i]; - dst = &pRExC_state->code_blocks[n]; + assert(n < pRExC_state->code_blocks->count); + src = &ri->code_blocks->cb[i]; + dst = &pRExC_state->code_blocks->cb[n]; dst->start = src->start + offset; dst->end = src->end + offset; dst->block = src->block; @@ -6387,10 +6466,11 @@ S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, PERL_UNUSED_CONTEXT; for (s = 0; s < plen; s++) { - if (n < pRExC_state->num_code_blocks - && s == pRExC_state->code_blocks[n].start) + if ( pRExC_state->code_blocks + && n < pRExC_state->code_blocks->count + && s == pRExC_state->code_blocks->cb[n].start) { - s = pRExC_state->code_blocks[n].end; + s = pRExC_state->code_blocks->cb[n].end; n++; continue; } @@ -6450,7 +6530,7 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, int n = 0; STRLEN s; char *p, *newpat; - int newlen = plen + 6; /* allow for "qr''x\0" extra chars */ + int newlen = plen + 7; /* allow for "qr''xx\0" extra chars */ SV *sv, *qr_ref; dSP; @@ -6465,12 +6545,13 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, *p++ = 'q'; *p++ = 'r'; *p++ = '\''; for (s = 0; s < plen; s++) { - if (n < pRExC_state->num_code_blocks - && s == pRExC_state->code_blocks[n].start) + if ( pRExC_state->code_blocks + && n < pRExC_state->code_blocks->count + && s == pRExC_state->code_blocks->cb[n].start) { /* blank out literal code block */ assert(pat[s] == '('); - while (s <= pRExC_state->code_blocks[n].end) { + while (s <= pRExC_state->code_blocks->cb[n].end) { *p++ = '_'; s++; } @@ -6483,8 +6564,12 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, *p++ = pat[s]; } *p++ = '\''; - if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED) { *p++ = 'x'; + if (pRExC_state->pm_flags & RXf_PMf_EXTENDED_MORE) { + *p++ = 'x'; + } + } *p++ = '\0'; DEBUG_COMPILE_r({ Perl_re_printf( aTHX_ @@ -6510,11 +6595,8 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, { SV * const errsv = ERRSV; if (SvTRUE_NN(errsv)) - { - Safefree(pRExC_state->code_blocks); /* use croak_sv ? */ - Perl_croak_nocontext("%"SVf, SVfARG(errsv)); - } + Perl_croak_nocontext("%" SVf, SVfARG(errsv)); } assert(SvROK(qr_ref)); qr = SvRV(qr_ref); @@ -6546,42 +6628,46 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, struct reg_code_block *new_block, *dst; RExC_state_t * const r1 = pRExC_state; /* convenient alias */ int i1 = 0, i2 = 0; + int r1c, r2c; - if (!r2->num_code_blocks) /* we guessed wrong */ + if (!r2->code_blocks || !r2->code_blocks->count) /* we guessed wrong */ { SvREFCNT_dec_NN(qr); return 1; } - Newx(new_block, - r1->num_code_blocks + r2->num_code_blocks, - struct reg_code_block); + if (!r1->code_blocks) + r1->code_blocks = S_alloc_code_blocks(aTHX_ 0); + + r1c = r1->code_blocks->count; + r2c = r2->code_blocks->count; + + Newx(new_block, r1c + r2c, struct reg_code_block); + dst = new_block; - while ( i1 < r1->num_code_blocks - || i2 < r2->num_code_blocks) - { + while (i1 < r1c || i2 < r2c) { struct reg_code_block *src; bool is_qr = 0; - if (i1 == r1->num_code_blocks) { - src = &r2->code_blocks[i2++]; + if (i1 == r1c) { + src = &r2->code_blocks->cb[i2++]; is_qr = 1; } - else if (i2 == r2->num_code_blocks) - src = &r1->code_blocks[i1++]; - else if ( r1->code_blocks[i1].start - < r2->code_blocks[i2].start) + else if (i2 == r2c) + src = &r1->code_blocks->cb[i1++]; + else if ( r1->code_blocks->cb[i1].start + < r2->code_blocks->cb[i2].start) { - src = &r1->code_blocks[i1++]; - assert(src->end < r2->code_blocks[i2].start); + src = &r1->code_blocks->cb[i1++]; + assert(src->end < r2->code_blocks->cb[i2].start); } else { - assert( r1->code_blocks[i1].start - > r2->code_blocks[i2].start); - src = &r2->code_blocks[i2++]; + assert( r1->code_blocks->cb[i1].start + > r2->code_blocks->cb[i2].start); + src = &r2->code_blocks->cb[i2++]; is_qr = 1; - assert(src->end < r1->code_blocks[i1].start); + assert(src->end < r1->code_blocks->cb[i1].start); } assert(pat[src->start] == '('); @@ -6593,9 +6679,9 @@ S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state, : src->src_regex; dst++; } - r1->num_code_blocks += r2->num_code_blocks; - Safefree(r1->code_blocks); - r1->code_blocks = new_block; + r1->code_blocks->count += r2c; + Safefree(r1->code_blocks->cb); + r1->code_blocks->cb = new_block; } SvREFCNT_dec_NN(qr); @@ -6641,7 +6727,11 @@ S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, calculate it.*/ ml = minlen ? *(minlen) : (SSize_t)longest_length; *rx_end_shift = ml - offset - - longest_length + (SvTAIL(sv_longest) != 0) + - longest_length + /* XXX SvTAIL is always false here - did you mean FBMcf_TAIL + * intead? - DAPM + + (SvTAIL(sv_longest) != 0) + */ + lookbehind; t = (eol/* Can't have SEOL and MULTI */ @@ -6710,7 +6800,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, SSize_t minlen = 0; U32 rx_flags; SV *pat; - SV *code_blocksv = NULL; SV** new_patternp = patternp; /* these are all flags - maybe they should be turned @@ -6768,7 +6857,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pRExC_state->warn_text = NULL; pRExC_state->code_blocks = NULL; - pRExC_state->num_code_blocks = 0; if (is_bare_re) *is_bare_re = FALSE; @@ -6782,10 +6870,9 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, for (o = cLISTOPx(expr)->op_first; o; o = OpSIBLING(o)) if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) ncode++; /* count of DO blocks */ - if (ncode) { - pRExC_state->num_code_blocks = ncode; - Newx(pRExC_state->code_blocks, ncode, struct reg_code_block); - } + + if (ncode) + pRExC_state->code_blocks = S_alloc_code_blocks(aTHX_ ncode); } if (!pat_count) { @@ -6829,7 +6916,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* set expr to the first arg op */ - if (pRExC_state->num_code_blocks + if (pRExC_state->code_blocks && pRExC_state->code_blocks->count && expr->op_type != OP_CONST) { expr = cLISTOPx(expr)->op_first; @@ -6851,7 +6938,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (is_bare_re) *is_bare_re = TRUE; SvREFCNT_inc(re); - Safefree(pRExC_state->code_blocks); DEBUG_PARSE_r(Perl_re_printf( aTHX_ "Precompiled pattern%s\n", orig_rx_flags & RXf_SPLIT ? " for split" : "")); @@ -6871,7 +6957,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, pat = newSVpvn_flags(exp, plen, SVs_TEMP | (IN_BYTES ? 0 : SvUTF8(pat))); } - Safefree(pRExC_state->code_blocks); return CALLREGCOMP_ENG(eng, pat, orig_rx_flags); } @@ -6881,7 +6966,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_uni_semantics = 0; RExC_seen_unfolded_sharp_s = 0; RExC_contains_locale = 0; - RExC_contains_i = 0; RExC_strict = cBOOL(pm_flags & RXf_PMf_STRICT); RExC_study_started = 0; pRExC_state->runtime_code_qr = NULL; @@ -6927,15 +7011,11 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, && memEQ(RX_PRECOMP(old_re), exp, plen) && !runtime_code /* with runtime code, always recompile */ ) { - Safefree(pRExC_state->code_blocks); return old_re; } rx_flags = orig_rx_flags; - if (rx_flags & PMf_FOLD) { - RExC_contains_i = 1; - } if ( initial_charset == REGEX_DEPENDS_CHARSET && (RExC_utf8 ||RExC_uni_semantics)) { @@ -6959,7 +7039,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* whoops, we have a non-utf8 pattern, whilst run-time code * got compiled as utf8. Try again with a utf8 pattern */ S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, - pRExC_state->num_code_blocks); + pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); goto redo_first_pass; } } @@ -6972,7 +7052,6 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_in_lookbehind = 0; RExC_seen_zerolen = *exp == '^' ? -1 : 0; RExC_extralen = 0; - RExC_override_recoding = 0; #ifdef EBCDIC RExC_recode_x_to_native = 0; #endif @@ -7016,17 +7095,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_lastnum=0; RExC_lastparse=NULL; ); - /* reg may croak on us, not giving us a chance to free - pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may - need it to survive as long as the regexp (qr/(?{})/). - We must check that code_blocksv is not already set, because we may - have jumped back to restart the sizing pass. */ - if (pRExC_state->code_blocks && !code_blocksv) { - code_blocksv = newSV_type(SVt_PV); - SAVEFREESV(code_blocksv); - SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks); - SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/ - } + if (reg(pRExC_state, 0, &flags,1) == NULL) { /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we @@ -7039,7 +7108,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (flags & RESTART_PASS1) { if (flags & NEED_UTF8) { S_pat_upgrade_to_utf8(aTHX_ pRExC_state, &exp, &plen, - pRExC_state->num_code_blocks); + pRExC_state->code_blocks ? pRExC_state->code_blocks->count : 0); } else { DEBUG_PARSE_r(Perl_re_printf( aTHX_ @@ -7048,14 +7117,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, goto redo_first_pass; } - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for sizing pass, flags=%#" UVxf, (UV) flags); } - if (code_blocksv) - SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */ DEBUG_PARSE_r({ Perl_re_printf( aTHX_ - "Required size %"IVdf" nodes\n" + "Required size %" IVdf " nodes\n" "Starting second pass (creation)\n", (IV)RExC_size); RExC_lastnum=0; @@ -7104,16 +7171,8 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (pm_flags & PMf_IS_QR) { ri->code_blocks = pRExC_state->code_blocks; - ri->num_code_blocks = pRExC_state->num_code_blocks; - } - else - { - int n; - for (n = 0; n < pRExC_state->num_code_blocks; n++) - if (pRExC_state->code_blocks[n].src_regex) - SAVEFREESV(pRExC_state->code_blocks[n].src_regex); - if(pRExC_state->code_blocks) - SAVEFREEPV(pRExC_state->code_blocks); /* often null */ + if (ri->code_blocks) + ri->code_blocks->refcnt++; } { @@ -7130,7 +7189,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, == REG_RUN_ON_COMMENT_SEEN); U8 reganch = (U8)((r->extflags & RXf_PMf_STD_PMMOD) >> RXf_PMf_STD_PMMOD_SHIFT); - const char *fptr = STD_PAT_MODS; /*"msixn"*/ + const char *fptr = STD_PAT_MODS; /*"msixxn"*/ char *p; /* We output all the necessary flags; we never output a minus, as all @@ -7193,7 +7252,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #ifdef RE_TRACK_PATTERN_OFFSETS Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(Perl_re_printf( aTHX_ - "%s %"UVuf" bytes for offset annotations.\n", + "%s %" UVuf " bytes for offset annotations.\n", ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); #endif @@ -7247,7 +7306,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, RExC_npar = 1; if (reg(pRExC_state, 0, &flags,1) == NULL) { ReREFCNT_dec(rx); - Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#"UVxf"", (UV) flags); + Perl_croak(aTHX_ "panic: reg returned NULL to re_op_compile for generation pass, flags=%#" UVxf, (UV) flags); } DEBUG_OPTIMISE_r( Perl_re_printf( aTHX_ "Starting post parse optimization\n"); @@ -7392,7 +7451,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, !sawlookahead && (OP(first) == STAR && PL_regkind[OP(NEXTOPER(first))] == REG_ANY) && - !(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks) + !(r->intflags & PREGf_ANCH) && !pRExC_state->code_blocks) { /* turn .* into ^.* with an implied $*=1 */ const int type = @@ -7405,7 +7464,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, } if (sawplus && !sawminmod && !sawlookahead && (!sawopen || !RExC_sawback) - && !pRExC_state->num_code_blocks) /* May examine pos and $& */ + && !pRExC_state->code_blocks) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->intflags |= PREGf_SKIP; @@ -7413,12 +7472,12 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, #ifdef TRIE_STUDY_OPT DEBUG_PARSE_r( if (!restudied) - Perl_re_printf( aTHX_ "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %" IVdf "\n", (IV)(first - scan + 1)) ); #else DEBUG_PARSE_r( - Perl_re_printf( aTHX_ "first at %"IVdf"\n", + Perl_re_printf( aTHX_ "first at %" IVdf "\n", (IV)(first - scan + 1)) ); #endif @@ -7645,7 +7704,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, /* Guard against an embedded (?=) or (?<=) with a longer minlen than the "real" pattern. */ DEBUG_OPTIMISE_r({ - Perl_re_printf( aTHX_ "minlen: %"IVdf" r->minlen:%"IVdf" maxlen:%"IVdf"\n", + Perl_re_printf( aTHX_ "minlen: %" IVdf " r->minlen:%" IVdf " maxlen:%" IVdf "\n", (IV)minlen, (IV)r->minlen, (IV)RExC_maxlen); }); r->minlenret = minlen; @@ -7661,7 +7720,7 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, if (RExC_seen & REG_LOOKBEHIND_SEEN) r->extflags |= RXf_NO_INPLACE_SUBST; /* inplace might break the lookbehind */ - if (pRExC_state->num_code_blocks) + if (pRExC_state->code_blocks) r->extflags |= RXf_EVAL_SEEN; if (RExC_seen & REG_VERBARG_SEEN) { @@ -7733,6 +7792,18 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, while ( RExC_recurse_count > 0 ) { const regnode *scan = RExC_recurse[ --RExC_recurse_count ]; + /* + * This data structure is set up in study_chunk() and is used + * to calculate the distance between a GOSUB regopcode and + * the OPEN/CURLYM (CURLYM's are special and can act like OPEN's) + * it refers to. + * + * If for some reason someone writes code that optimises + * away a GOSUB opcode then the assert should be changed to + * an if(scan) to guard the ARG2L_SET() - Yves + * + */ + assert(scan && OP(scan) == GOSUB); ARG2L_SET( scan, RExC_open_parens[ARG(scan)] - scan ); } @@ -7753,10 +7824,10 @@ Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count, STRLEN i; GET_RE_DEBUG_FLAGS_DECL; Perl_re_printf( aTHX_ - "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); + "Offsets: [%" UVuf "]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) - Perl_re_printf( aTHX_ "%"UVuf":%"UVuf"[%"UVuf"] ", + Perl_re_printf( aTHX_ "%" UVuf ":%" UVuf "[%" UVuf "] ", (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } Perl_re_printf( aTHX_ "\n"); @@ -8098,7 +8169,7 @@ Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren, } } else { ret_undef: - sv_setsv(sv,&PL_sv_undef); + sv_set_undef(sv); return; } } @@ -8230,17 +8301,18 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) assert (RExC_parse <= RExC_end); if (RExC_parse == RExC_end) NOOP; - else if (isIDFIRST_lazy_if(RExC_parse, UTF)) { + else if (isIDFIRST_lazy_if_safe(RExC_parse, RExC_end, UTF)) { /* Note that the code here assumes well-formed UTF-8. Skip IDFIRST by * using do...while */ if (UTF) do { RExC_parse += UTF8SKIP(RExC_parse); - } while (isWORDCHAR_utf8((U8*)RExC_parse)); + } while ( RExC_parse < RExC_end + && isWORDCHAR_utf8_safe((U8*)RExC_parse, (U8*) RExC_end)); else do { RExC_parse++; - } while (isWORDCHAR(*RExC_parse)); + } while (RExC_parse < RExC_end && isWORDCHAR(*RExC_parse)); } else { RExC_parse++; /* so the <- from the vFAIL is after the offending character */ @@ -8376,6 +8448,8 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) /* The header definitions are in F */ +#ifndef PERL_IN_XSUB_RE + PERL_STATIC_INLINE UV* S__invlist_array_init(SV* const invlist, const bool will_have_0) { @@ -8402,6 +8476,8 @@ S__invlist_array_init(SV* const invlist, const bool will_have_0) return zero_addr + *offset; } +#endif + PERL_STATIC_INLINE void S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) { @@ -8424,9 +8500,9 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len, const bool offset) STATIC void S_invlist_replace_list_destroys_src(pTHX_ SV * dest, SV * src) { - /* Replaces the inversion list in 'src' with the one in 'dest'. It steals - * the list from 'src', so 'src' is made to have a NULL list. This is - * similar to what SvSetMagicSV() would do, if it were implemented on + /* Replaces the inversion list in 'dest' with the one from 'src'. It + * steals the list from 'src', so 'src' is made to have a NULL list. This + * is similar to what SvSetMagicSV() would do, if it were implemented on * inversion lists, though this routine avoids a copy */ const UV src_len = _invlist_len(src); @@ -8538,6 +8614,8 @@ S_invlist_is_iterating(SV* const invlist) return *(get_invlist_iter_addr(invlist)) < (STRLEN) UV_MAX; } +#ifndef PERL_IN_XSUB_RE + PERL_STATIC_INLINE UV S_invlist_max(SV* const invlist) { @@ -8554,8 +8632,6 @@ S_invlist_max(SV* const invlist) ? FROM_INTERNAL_SIZE(SvCUR(invlist)) - 1 : FROM_INTERNAL_SIZE(SvLEN(invlist)) - 1; } - -#ifndef PERL_IN_XSUB_RE SV* Perl__new_invlist(pTHX_ IV initial_size) { @@ -8641,7 +8717,6 @@ Perl__new_invlist_C_array(pTHX_ const UV* const list) return invlist; } -#endif /* ifndef PERL_IN_XSUB_RE */ STATIC void S_invlist_extend(pTHX_ SV* const invlist, const UV new_max) @@ -8688,7 +8763,7 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, if ( array[final_element] > start || ELEMENT_RANGE_MATCHES_INVLIST(final_element)) { - Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c", + Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%" UVuf ", start=%" UVuf ", match=%c", array[final_element], start, ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f'); } @@ -8743,8 +8818,6 @@ S__append_range_to_invlist(pTHX_ SV* const invlist, } } -#ifndef PERL_IN_XSUB_RE - SSize_t Perl__invlist_search(SV* const invlist, const UV cp) { @@ -8936,13 +9009,13 @@ void Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** output) { - /* Take the union of two inversion lists and point to it. *output - * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise just its contents will be modified to be - * the union. The first list, , may be NULL, in which case a copy of - * the second list is returned. If is TRUE, the union is - * taken of the complement (inversion) of instead of b itself. + /* Take the union of two inversion lists and point '*output' to it. On + * input, '*output' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly + * even 'a' or 'b'). If to an inversion list, the contents of the original + * list will be replaced by the union. The first list, 'a', may be + * NULL, in which case a copy of the second list is placed in '*output'. + * If 'complement_b' is TRUE, the union is taken of the complement + * (inversion) of 'b' instead of b itself. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -8976,60 +9049,59 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND; assert(a != b); + assert(*output == NULL || SvTYPE(*output) == SVt_INVLIST); len_b = _invlist_len(b); if (len_b == 0) { - /* Here, 'b' is empty. If the output is the complement of 'b', the - * union is all possible code points, and we need not even look at 'a'. - * It's easiest to create a new inversion list that matches everything. - * */ + /* Here, 'b' is empty, hence it's complement is all possible code + * points. So if the union includes the complement of 'b', it includes + * everything, and we need not even look at 'a'. It's easiest to + * create a new inversion list that matches everything. */ if (complement_b) { - SV* everything = _new_invlist(1); - _append_range_to_invlist(everything, 0, UV_MAX); + SV* everything = _add_range_to_invlist(NULL, 0, UV_MAX); - /* If the output didn't exist, just point it at the new list */ - if (*output == NULL) { + if (*output == NULL) { /* If the output didn't exist, just point it + at the new list */ *output = everything; - return; + } + else { /* Otherwise, replace its contents with the new list */ + invlist_replace_list_destroys_src(*output, everything); + SvREFCNT_dec_NN(everything); } - /* Otherwise, replace its contents with the new list */ - invlist_replace_list_destroys_src(*output, everything); - SvREFCNT_dec_NN(everything); return; } - /* Here, we don't want the complement of 'b', and since it is empty, + /* Here, we don't want the complement of 'b', and since 'b' is empty, * the union will come entirely from 'a'. If 'a' is NULL or empty, the * output will be empty */ - if (a == NULL) { - *output = _new_invlist(0); + if (a == NULL || _invlist_len(a) == 0) { + if (*output == NULL) { + *output = _new_invlist(0); + } + else { + invlist_clear(*output); + } return; } - if (_invlist_len(a) == 0) { - invlist_clear(*output); + /* Here, 'a' is not empty, but 'b' is, so 'a' entirely determines the + * union. We can just return a copy of 'a' if '*output' doesn't point + * to an existing list */ + if (*output == NULL) { + *output = invlist_clone(a); return; } - /* Here, 'a' is not empty, and entirely determines the union. If the - * output is not to overwrite 'b', we can just return 'a'. */ - if (*output != b) { - - /* If the output is to overwrite 'a', we have a no-op, as it's - * already in 'a' */ - if (*output == a) { - return; - } - - /* But otherwise we have to copy 'a' to the output */ - *output = invlist_clone(a); + /* If the output is to overwrite 'a', we have a no-op, as it's + * already in 'a' */ + if (*output == a) { return; } - /* Here, 'b' is to be overwritten by the output, which will be 'a' */ + /* Here, '*output' is to be overwritten by 'a' */ u = invlist_clone(a); invlist_replace_list_destroys_src(*output, u); SvREFCNT_dec_NN(u); @@ -9037,41 +9109,24 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, return; } + /* Here 'b' is not empty. See about 'a' */ + if (a == NULL || ((len_a = _invlist_len(a)) == 0)) { /* Here, 'a' is empty (and b is not). That means the union will come - * entirely from 'b'. If the output is not to overwrite 'a', we can - * just return what's in 'b'. */ - if (*output != a) { - - /* If the output is to overwrite 'b', it's already in 'b', but - * otherwise we have to copy 'b' to the output */ - if (*output != b) { - *output = invlist_clone(b); - } - - /* And if the output is to be the inversion of 'b', do that */ - if (complement_b) { - _invlist_invert(*output); - } + * entirely from 'b'. If '*output' is NULL, we can directly return a + * clone of 'b'. Otherwise, we replace the contents of '*output' with + * the clone */ - return; + SV ** dest = (*output == NULL) ? output : &u; + *dest = invlist_clone(b); + if (complement_b) { + _invlist_invert(*dest); } - /* Here, 'a', which is empty or even NULL, is to be overwritten by the - * output, which will either be 'b' or the complement of 'b' */ - - if (a == NULL) { - *output = invlist_clone(b); - } - else { - u = invlist_clone(b); + if (dest == &u) { invlist_replace_list_destroys_src(*output, u); SvREFCNT_dec_NN(u); - } - - if (complement_b) { - _invlist_invert(*output); } return; @@ -9108,8 +9163,8 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, array_u = _invlist_array_init(u, ( len_a > 0 && array_a[0] == 0) || (len_b > 0 && array_b[0] == 0)); - /* Go through each input list item by item, stopping when exhausted one of - * them */ + /* Go through each input list item by item, stopping when have exhausted + * one of them */ while (i_a < len_a && i_b < len_b) { UV cp; /* The element to potentially add to the union's array */ bool cp_in_set; /* is it in the the input list's set or not */ @@ -9212,30 +9267,17 @@ Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, array_u = invlist_array(u); } - /* If the output is not to overwrite either of the inputs, just return the - * calculated union */ - if (a != *output && b != *output) { + if (*output == NULL) { /* Simply return the new inversion list */ *output = u; } else { - /* Here, the output is to be the same as one of the input scalars, - * hence replacing it. The simple thing to do is to free the input - * scalar, making it instead be the output one. But experience has - * shown [perl #127392] that if the input is a mortal, we can get a - * huge build-up of these during regex compilation before they get - * freed. So for that case, replace just the input's interior with - * the union's, and then free the union */ - - assert(! invlist_is_iterating(*output)); - - if (! SvTEMP(*output)) { - SvREFCNT_dec_NN(*output); - *output = u; - } - else { - invlist_replace_list_destroys_src(*output, u); - SvREFCNT_dec_NN(u); - } + /* Otherwise, overwrite the inversion list that was in '*output'. We + * could instead free '*output', and then set it to 'u', but experience + * has shown [perl #127392] that if the input is a mortal, we can get a + * huge build-up of these during regex compilation before they get + * freed. */ + invlist_replace_list_destroys_src(*output, u); + SvREFCNT_dec_NN(u); } return; @@ -9245,14 +9287,13 @@ void Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, const bool complement_b, SV** i) { - /* Take the intersection of two inversion lists and point to it. *i - * SHOULD BE DEFINED upon input, and if it points to one of the two lists, - * the reference count to that list will be decremented if not already a - * temporary (mortal); otherwise just its contents will be modified to be - * the intersection. The first list, , may be NULL, in which case an - * empty list is returned. If is TRUE, the result will be - * the intersection of and the complement (or inversion) of instead - * of directly. + /* Take the intersection of two inversion lists and point '*i' to it. On + * input, '*i' MUST POINT TO NULL OR TO AN SV* INVERSION LIST (possibly + * even 'a' or 'b'). If to an inversion list, the contents of the original + * list will be replaced by the intersection. The first list, 'a', may be + * NULL, in which case '*i' will be an empty list. If 'complement_b' is + * TRUE, the result will be the intersection of 'a' and the complement (or + * inversion) of 'b' instead of 'b' directly. * * The basis for this comes from "Unicode Demystified" Chapter 13 by * Richard Gillam, published by Addison-Wesley, and explained at some @@ -9286,6 +9327,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND; assert(a != b); + assert(*i == NULL || SvTYPE(*i) == SVt_INVLIST); /* Special case if either one is empty */ len_a = (a == NULL) ? 0 : _invlist_len(a); @@ -9301,13 +9343,11 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, return; } - /* If not overwriting either input, just make a copy of 'a' */ - if (*i != b) { + if (*i == NULL) { *i = invlist_clone(a); return; } - /* Here we are overwriting 'b' with 'a's contents */ r = invlist_clone(a); invlist_replace_list_destroys_src(*i, r); SvREFCNT_dec_NN(r); @@ -9356,7 +9396,7 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0 && len_b > 0 && array_b[0] == 0); - /* Go through each list item by item, stopping when exhausted one of + /* Go through each list item by item, stopping when have exhausted one of * them */ while (i_a < len_a && i_b < len_b) { UV cp; /* The element to potentially add to the intersection's @@ -9461,47 +9501,21 @@ Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, array_r = invlist_array(r); } - /* Finish outputting any remaining */ - if (count >= 2) { /* At most one will have a non-zero copy count */ - IV copy_count; - if ((copy_count = len_a - i_a) > 0) { - Copy(array_a + i_a, array_r + i_r, copy_count, UV); - } - else if ((copy_count = len_b - i_b) > 0) { - Copy(array_b + i_b, array_r + i_r, copy_count, UV); - } - } - - /* If the output is not to overwrite either of the inputs, just return the - * calculated intersection */ - if (a != *i && b != *i) { + if (*i == NULL) { /* Simply return the calculated intersection */ *i = r; } - else { - /* Here, the output is to be the same as one of the input scalars, - * hence replacing it. The simple thing to do is to free the input - * scalar, making it instead be the output one. But experience has - * shown [perl #127392] that if the input is a mortal, we can get a - * huge build-up of these during regex compilation before they get - * freed. So for that case, replace just the input's interior with - * the output's, and then free the output. A short-cut in this case - * is if the output is empty, we can just set the input to be empty */ - - assert(! invlist_is_iterating(*i)); - - if (! SvTEMP(*i)) { - SvREFCNT_dec_NN(*i); - *i = r; + else { /* Otherwise, replace the existing inversion list in '*i'. We could + instead free '*i', and then set it to 'r', but experience has + shown [perl #127392] that if the input is a mortal, we can get a + huge build-up of these during regex compilation before they get + freed. */ + if (len_r) { + invlist_replace_list_destroys_src(*i, r); } else { - if (len_r) { - invlist_replace_list_destroys_src(*i, r); - } - else { - invlist_clear(*i); - } - SvREFCNT_dec_NN(r); + invlist_clear(*i); } + SvREFCNT_dec_NN(r); } return; @@ -9788,7 +9802,7 @@ Perl__setup_canned_invlist(pTHX_ const STRLEN size, const UV element0, PERL_ARGS_ASSERT__SETUP_CANNED_INVLIST; - _append_range_to_invlist(invlist, element0, element0); + invlist = add_cp_to_invlist(invlist, element0); offset = *get_invlist_offset_addr(invlist); invlist_set_len(invlist, size, offset); @@ -9978,18 +9992,18 @@ S_invlist_contents(pTHX_ SV* const invlist, const bool traditional_style) invlist_iterinit(invlist); while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%cINFINITY%c", + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%cINFINITY%c", start, intra_range_delimiter, inter_range_delimiter); } else if (end != start) { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c%04"UVXf"%c", + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c%04" UVXf "%c", start, intra_range_delimiter, end, inter_range_delimiter); } else { - Perl_sv_catpvf(aTHX_ output, "%04"UVXf"%c", + Perl_sv_catpvf(aTHX_ output, "%04" UVXf "%c", start, inter_range_delimiter); } } @@ -10036,16 +10050,16 @@ Perl__invlist_dump(pTHX_ PerlIO *file, I32 level, while (invlist_iternext(invlist, &start, &end)) { if (end == UV_MAX) { Perl_dump_indent(aTHX_ level, file, - "%s[%"UVuf"] 0x%04"UVXf" .. INFINITY\n", + "%s[%" UVuf "] 0x%04" UVXf " .. INFINITY\n", indent, (UV)count, start); } else if (end != start) { Perl_dump_indent(aTHX_ level, file, - "%s[%"UVuf"] 0x%04"UVXf" .. 0x%04"UVXf"\n", + "%s[%" UVuf "] 0x%04" UVXf " .. 0x%04" UVXf "\n", indent, (UV)count, start, end); } else { - Perl_dump_indent(aTHX_ level, file, "%s[%"UVuf"] 0x%04"UVXf"\n", + Perl_dump_indent(aTHX_ level, file, "%s[%" UVuf "] 0x%04" UVXf "\n", indent, (UV)count, start); } count += 2; @@ -10061,9 +10075,10 @@ Perl__load_PL_utf8_foldclosures (pTHX) * to force that */ if (! PL_utf8_tofold) { U8 dummy[UTF8_MAXBYTES_CASE+1]; + const U8 hyphen[] = HYPHEN_UTF8; /* This string is just a short named one above \xff */ - to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + toFOLD_utf8_safe(hyphen, hyphen + sizeof(hyphen) - 1, dummy, NULL); assert(PL_utf8_tofold); /* Verify that worked */ } PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); @@ -10083,9 +10098,6 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) UV len_a = _invlist_len(a); UV len_b = _invlist_len(b); - UV i = 0; /* current index into the arrays */ - bool retval = TRUE; /* Assume are identical until proven otherwise */ - PERL_ARGS_ASSERT__INVLISTEQ; /* If are to compare 'a' with the complement of b, set it @@ -10115,20 +10127,9 @@ Perl__invlistEQ(pTHX_ SV* const a, SV* const b, const bool complement_b) } } - /* Make sure that the lengths are the same, as well as the final element - * before looping through the remainder. (Thus we test the length, final, - * and first elements right off the bat) */ - if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) { - retval = FALSE; - } - else for (i = 0; i < len_a - 1; i++) { - if (array_a[i] != array_b[i]) { - retval = FALSE; - break; - } - } + return len_a == len_b + && memEQ(array_a, array_b, len_a * sizeof(array_a[0])); - return retval; } #endif @@ -10214,7 +10215,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) } else { STRLEN len; - to_utf8_fold(s, d, &len); + toFOLD_utf8_safe(s, e, d, &len); d += len; s += UTF8SKIP(s); } @@ -10255,7 +10256,7 @@ S__make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node) { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_tindex_nomg(list); k++) { + for (k = 0; k <= av_tindex_skip_len_mg(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; assert(c_p); @@ -10464,26 +10465,28 @@ S_parse_lparen_question_flags(pTHX_ RExC_state_t *pRExC_state) } flagsp = &negflags; wastedflags = 0; /* reset so (?g-c) warns twice */ + x_mod_count = 0; break; case ':': case ')': + + if ((posflags & (RXf_PMf_EXTENDED|RXf_PMf_EXTENDED_MORE)) == RXf_PMf_EXTENDED) { + negflags |= RXf_PMf_EXTENDED_MORE; + } RExC_flags |= posflags; + + if (negflags & RXf_PMf_EXTENDED) { + negflags |= RXf_PMf_EXTENDED_MORE; + } RExC_flags &= ~negflags; set_regex_charset(&RExC_flags, cs); - if (RExC_flags & RXf_PMf_FOLD) { - RExC_contains_i = 1; - } - if (UNLIKELY((x_mod_count) > 1)) { - vFAIL("Only one /x regex modifier is allowed"); - } return; - /*NOTREACHED*/ default: fail_modifiers: RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ - vFAIL2utf8f("Sequence (%"UTF8f"...) not recognized", + vFAIL2utf8f("Sequence (%" UTF8f "...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); NOT_REACHED; /*NOTREACHED*/ } @@ -10695,7 +10698,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if ( ! op ) { RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1; vFAIL2utf8f( - "Unknown verb pattern '%"UTF8f"'", + "Unknown verb pattern '%" UTF8f "'", UTF8fARG(UTF, verb_len, start_verb)); } if ( arg_required && !start_arg ) { @@ -10860,7 +10863,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen |= REG_LOOKBEHIND_SEEN; RExC_in_lookbehind++; RExC_parse++; - assert(RExC_parse < RExC_end); + if (RExC_parse >= RExC_end) { + vFAIL("Sequence (?... not terminated"); + } + /* FALLTHROUGH */ case '=': /* (?=...) */ RExC_seen_zerolen++; @@ -10989,7 +10995,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } RExC_recurse_count++; DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Recurse #%"UVuf" to %"IVdf"\n", + "%*s%*s Recurse #%" UVuf " to %" IVdf "\n", 22, "| |", (int)(depth * 2 + 1), "", (UV)ARG(ret), (IV)ARG2L(ret))); } @@ -11011,7 +11017,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_parse += SKIP_IF_CHAR(RExC_parse); /* diag_listed_as: Sequence (?%s...) not recognized in regex; marked by <-- HERE in m/%s/ */ vFAIL2utf8f( - "Sequence (%"UTF8f"...) not recognized", + "Sequence (%" UTF8f "...) not recognized", UTF8fARG(UTF, RExC_parse-seqstart, seqstart)); NOT_REACHED; /*NOTREACHED*/ } @@ -11026,9 +11032,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) RExC_seen_zerolen++; - if ( !pRExC_state->num_code_blocks - || pRExC_state->code_index >= pRExC_state->num_code_blocks - || pRExC_state->code_blocks[pRExC_state->code_index].start + if ( !pRExC_state->code_blocks + || pRExC_state->code_index + >= pRExC_state->code_blocks->count + || pRExC_state->code_blocks->cb[pRExC_state->code_index].start != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0)) - RExC_start) ) { @@ -11037,7 +11044,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Eval-group not allowed at runtime, use re 'eval'"); } /* this is a pre-compiled code block (?{...}) */ - cb = &pRExC_state->code_blocks[pRExC_state->code_index]; + cb = &pRExC_state->code_blocks->cb[pRExC_state->code_index]; RExC_parse = RExC_start + cb->end; if (!SIZE_ONLY) { OP *o = cb->block; @@ -11213,7 +11220,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags); } else REGTAIL(pRExC_state, br, reganode(pRExC_state, @@ -11234,7 +11241,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", + FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags); } REGTAIL(pRExC_state, ret, lastbr); @@ -11302,7 +11309,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (RExC_open_parens && !RExC_open_parens[parno]) { DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Setting open paren #%"IVdf" to %d\n", + "%*s%*s Setting open paren #%" IVdf " to %d\n", 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ret))); RExC_open_parens[parno]= ret; @@ -11332,7 +11339,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags); } if (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -11379,7 +11386,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: regbranch returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regbranch returned NULL, flags=%#" UVxf, (UV) flags); } REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; @@ -11396,7 +11403,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ender = reganode(pRExC_state, CLOSE, parno); if ( RExC_close_parens ) { DEBUG_OPTIMISE_MORE_r(Perl_re_printf( aTHX_ - "%*s%*s Setting close paren #%"IVdf" to %d\n", + "%*s%*s Setting close paren #%" IVdf " to %d\n", 22, "| |", (int)(depth * 2 + 1), "", (IV)parno, REG_NODE_NUM(ender))); RExC_close_parens[parno]= ender; if (RExC_nestroot == parno) @@ -11433,7 +11440,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) DEBUG_PARSE_MSG("lsbr"); regprop(RExC_rx, RExC_mysv1, lastbr, NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); - Perl_re_printf( aTHX_ "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + Perl_re_printf( aTHX_ "~ tying lastbr %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(lastbr), SvPV_nolen_const(RExC_mysv2), @@ -11472,7 +11479,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) DEBUG_PARSE_MSG("NADA"); regprop(RExC_rx, RExC_mysv1, ret, NULL, pRExC_state); regprop(RExC_rx, RExC_mysv2, ender, NULL, pRExC_state); - Perl_re_printf( aTHX_ "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n", + Perl_re_printf( aTHX_ "~ converting ret %s (%" IVdf ") to ender %s (%" IVdf ") offset %" IVdf "\n", SvPV_nolen_const(RExC_mysv1), (IV)REG_NODE_NUM(ret), SvPV_nolen_const(RExC_mysv2), @@ -11593,7 +11600,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: regpiece returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regpiece returned NULL, flags=%#" UVxf, (UV) flags); } else if (ret == NULL) ret = latest; @@ -11622,7 +11629,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } /* - - regpiece - something followed by possible [*+?] + - regpiece - something followed by possible quantifier * + ? {n,m} * * Note that the branching code sequences used for ? and the general cases * of * and + are somewhat optimized: they use the same NOTHING node as @@ -11665,7 +11672,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8)) *flagp |= flags & (TRYAGAIN|RESTART_PASS1|NEED_UTF8); else - FAIL2("panic: regatom returned NULL, flags=%#"UVxf"", (UV) flags); + FAIL2("panic: regatom returned NULL, flags=%#" UVxf, (UV) flags); return(NULL); } @@ -11717,19 +11724,11 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) nextchar(pRExC_state); if (max < min) { /* If can't match, warn and optimize to fail unconditionally */ - if (SIZE_ONLY) { - - /* We can't back off the size because we have to reserve - * enough space for all the things we are about to throw - * away, but we can shrink it by the amount we are about - * to re-use here */ - RExC_size += PREVOPER(RExC_size) - regarglen[(U8)OPFAIL]; - } - else { + reginsert(pRExC_state, OPFAIL, orig_emit, depth+1); + if (PASS2) { ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match"); - RExC_emit = orig_emit; + NEXT_OFF(orig_emit)= regarglen[OPFAIL] + NODE_STEP_REGNODE; } - ret = reganode(pRExC_state, OPFAIL, 0); return ret; } else if (min == max && *RExC_parse == '?') @@ -11846,7 +11845,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ ckWARN2reg(RExC_parse, - "%"UTF8f" matches null string many times", + "%" UTF8f " matches null string many times", UTF8fARG(UTF, (RExC_parse >= origparse ? RExC_parse - origparse : 0), @@ -12020,13 +12019,15 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_parse++; /* Skip past the '{' */ - if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */ - || ! (endbrace == RExC_parse /* nothing between the {} */ + if (! (endbrace = strchr(RExC_parse, '}'))) { /* no trailing brace */ + vFAIL2("Missing right brace on \\%c{}", 'N'); + } + else if(!(endbrace == RExC_parse /* nothing between the {} */ || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked... */ && strnEQ(RExC_parse, "U+", 2)))) /* ... below for a better error msg) */ { - if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */ + RExC_parse = endbrace; /* position msg's '<--HERE' */ vFAIL("\\N{NAME} must be resolved by the lexer"); } @@ -12172,7 +12173,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, /* The values are Unicode, and therefore not subject to recoding, but * have to be converted to native on a non-Unicode (meaning non-ASCII) * platform. */ - RExC_override_recoding = 1; #ifdef EBCDIC RExC_recode_x_to_native = 1; #endif @@ -12183,7 +12183,7 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, *flagp = flags & (RESTART_PASS1|NEED_UTF8); return FALSE; } - FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#"UVxf"", + FAIL2("panic: reg returned NULL to grok_bslash_N, flags=%#" UVxf, (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); @@ -12193,7 +12193,6 @@ S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, RExC_start = RExC_adjusted_start = save_start; RExC_parse = endbrace; RExC_end = orig_end; - RExC_override_recoding = 0; #ifdef EBCDIC RExC_recode_x_to_native = 0; #endif @@ -12549,7 +12548,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) if (ret == NULL) { if (*flagp & (RESTART_PASS1|NEED_UTF8)) return NULL; - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf, (UV) *flagp); } if (*RExC_parse != ']') { @@ -12576,7 +12575,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) *flagp = flags & (RESTART_PASS1|NEED_UTF8); return NULL; } - FAIL2("panic: reg returned NULL to regatom, flags=%#"UVxf"", + FAIL2("panic: reg returned NULL to regatom, flags=%#" UVxf, (UV) flags); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); @@ -12739,7 +12738,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) bad_bound_type: RExC_parse = endbrace; vFAIL2utf8f( - "'%"UTF8f"' is an unknown bound type", + "'%" UTF8f "' is an unknown bound type", UTF8fARG(UTF, length, endbrace - length)); NOT_REACHED; /*NOTREACHED*/ } @@ -12857,7 +12856,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) /* regclass() can only return RESTART_PASS1 and NEED_UTF8 if * multi-char folds are allowed. */ if (!ret) - FAIL2("panic: regclass returned NULL to regatom, flags=%#"UVxf"", + FAIL2("panic: regclass returned NULL to regatom, flags=%#" UVxf, (UV) *flagp); RExC_parse--; @@ -13392,6 +13391,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) RExC_parse = p + 1; vFAIL("Unescaped left brace in regex is illegal here"); } + goto normal_default; + case '}': + case ']': + if (PASS2 && p > RExC_parse && RExC_strict) { + ckWARN2reg(p + 1, "Unescaped literal '%c'", *p); + } /*FALLTHROUGH*/ default: /* A literal character */ normal_default: @@ -13811,7 +13816,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) skip_to_be_ignored_text(pRExC_state, &RExC_parse, FALSE /* Don't force to /x */ ); if (PASS2 && *RExC_parse == '{' && OP(ret) != SBOL && ! regcurly(RExC_parse)) { - ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here, passed through"); + ckWARNregdep(RExC_parse + 1, "Unescaped left brace in regex is deprecated here (and will be fatal in Perl 5.30), passed through"); } return(ret); @@ -14689,7 +14694,7 @@ S_handle_possible_posix(pTHX_ RExC_state_t *pRExC_state, ? "^" : ""; RExC_parse = (char *) p; - vFAIL3utf8f("POSIX class [:%s%"UTF8f":] unknown", + vFAIL3utf8f("POSIX class [:%s%" UTF8f ":] unknown", complement_string, UTF8fARG(UTF, RExC_parse - name_start - 2, name_start)); } @@ -14832,7 +14837,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, &posix_warnings )) FAIL2("panic: regclass returned NULL to handle_sets, " - "flags=%#"UVxf"", (UV) *flagp); + "flags=%#" UVxf, (UV) *flagp); /* function call leaves parse pointing to the ']', except * if we faked it */ @@ -14868,7 +14873,7 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, no_close: /* We output the messages even if warnings are off, because we'll fail * the very next thing, and these give a likely diagnosis for that */ - if (posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + if (posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { output_or_return_posix_warnings(pRExC_state, posix_warnings, NULL); } @@ -14977,7 +14982,13 @@ S_handle_regex_sets(pTHX_ RExC_state_t *pRExC_state, SV** return_invlist, redo_curchar: - top_index = av_tindex_nomg(stack); +#ifdef ENABLE_REGEX_SETS_DEBUGGING + /* Enable with -Accflags=-DENABLE_REGEX_SETS_DEBUGGING */ + DEBUG_U(dump_regex_sets_structures(pRExC_state, + stack, fence, fence_stack)); +#endif + + top_index = av_tindex_skip_len_mg(stack); switch (curchar) { SV** stacked_ptr; /* Ptr to something already on 'stack' */ @@ -15078,8 +15089,8 @@ redo_curchar: } /* Stack the position of this undealt-with left paren */ - fence = top_index + 1; av_push(fence_stack, newSViv(fence)); + fence = top_index + 1; break; case '\\': @@ -15095,7 +15106,7 @@ redo_curchar: NULL)) { FAIL2("panic: regclass returned NULL to handle_sets, " - "flags=%#"UVxf"", (UV) *flagp); + "flags=%#" UVxf, (UV) *flagp); } /* regclass() will return with parsing just the \ sequence, @@ -15134,7 +15145,7 @@ redo_curchar: )) { FAIL2("panic: regclass returned NULL to handle_sets, " - "flags=%#"UVxf"", (UV) *flagp); + "flags=%#" UVxf, (UV) *flagp); } /* function call leaves parse pointing to the ']', except if we @@ -15155,12 +15166,17 @@ redo_curchar: goto done; case ')': - if (av_tindex_nomg(fence_stack) < 0) { + if (av_tindex_skip_len_mg(fence_stack) < 0) { RExC_parse++; vFAIL("Unexpected ')'"); } - /* If at least two thing on the stack, treat this as an + /* If nothing after the fence, is missing an operand */ + if (top_index - fence < 0) { + RExC_parse++; + goto bad_syntax; + } + /* If at least two things on the stack, treat this as an * operator */ if (top_index - fence >= 1) { goto join_operators; @@ -15298,17 +15314,12 @@ redo_curchar: { SV* i = NULL; SV* u = NULL; - SV* element; _invlist_union(lhs, rhs, &u); _invlist_intersection(lhs, rhs, &i); - /* _invlist_subtract will overwrite rhs - without freeing what it already contains */ - element = rhs; _invlist_subtract(u, i, &rhs); SvREFCNT_dec_NN(i); SvREFCNT_dec_NN(u); - SvREFCNT_dec_NN(element); break; } } @@ -15351,7 +15362,7 @@ redo_curchar: * may have altered the stack in the time since we earlier set * 'top_index'. */ - top_index = av_tindex_nomg(stack); + top_index = av_tindex_skip_len_mg(stack); if (top_index - fence >= 0) { /* If the top entry on the stack is an operator, it had better * be a '!', otherwise the entry below the top operand should @@ -15402,15 +15413,15 @@ redo_curchar: } /* End of loop parsing through the construct */ done: - if (av_tindex_nomg(fence_stack) >= 0) { + if (av_tindex_skip_len_mg(fence_stack) >= 0) { vFAIL("Unmatched ("); } - if (av_tindex_nomg(stack) < 0 /* Was empty */ + if (av_tindex_skip_len_mg(stack) < 0 /* Was empty */ || ((final = av_pop(stack)) == NULL) || ! IS_OPERAND(final) || SvTYPE(final) != SVt_INVLIST - || av_tindex_nomg(stack) >= 0) /* More left on stack */ + || av_tindex_skip_len_mg(stack) >= 0) /* More left on stack */ { bad_syntax: SvREFCNT_dec(final); @@ -15430,10 +15441,10 @@ redo_curchar: result_string = newSVpvs(""); while (invlist_iternext(final, &start, &end)) { if (start == end) { - Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}", start); + Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}", start); } else { - Perl_sv_catpvf(aTHX_ result_string, "\\x{%"UVXf"}-\\x{%"UVXf"}", + Perl_sv_catpvf(aTHX_ result_string, "\\x{%" UVXf "}-\\x{%" UVXf "}", start, end); } } @@ -15463,7 +15474,7 @@ redo_curchar: NULL ); if (!node) - FAIL2("panic: regclass returned NULL to handle_sets, flags=%#"UVxf, + FAIL2("panic: regclass returned NULL to handle_sets, flags=%#" UVxf, PTR2UV(flagp)); /* Fix up the node type if we are in locale. (We have pretended we are @@ -15505,6 +15516,61 @@ redo_curchar: Set_Node_Length(node, RExC_parse - oregcomp_parse + 1); /* MJD */ return node; } + +#ifdef ENABLE_REGEX_SETS_DEBUGGING + +STATIC void +S_dump_regex_sets_structures(pTHX_ RExC_state_t *pRExC_state, + AV * stack, const IV fence, AV * fence_stack) +{ /* Dumps the stacks in handle_regex_sets() */ + + const SSize_t stack_top = av_tindex_skip_len_mg(stack); + const SSize_t fence_stack_top = av_tindex_skip_len_mg(fence_stack); + SSize_t i; + + PERL_ARGS_ASSERT_DUMP_REGEX_SETS_STRUCTURES; + + PerlIO_printf(Perl_debug_log, "\nParse position is:%s\n", RExC_parse); + + if (stack_top < 0) { + PerlIO_printf(Perl_debug_log, "Nothing on stack\n"); + } + else { + PerlIO_printf(Perl_debug_log, "Stack: (fence=%d)\n", (int) fence); + for (i = stack_top; i >= 0; i--) { + SV ** element_ptr = av_fetch(stack, i, FALSE); + if (! element_ptr) { + } + + if (IS_OPERATOR(*element_ptr)) { + PerlIO_printf(Perl_debug_log, "[%d]: %c\n", + (int) i, (int) SvIV(*element_ptr)); + } + else { + PerlIO_printf(Perl_debug_log, "[%d] ", (int) i); + sv_dump(*element_ptr); + } + } + } + + if (fence_stack_top < 0) { + PerlIO_printf(Perl_debug_log, "Nothing on fence_stack\n"); + } + else { + PerlIO_printf(Perl_debug_log, "Fence_stack: \n"); + for (i = fence_stack_top; i >= 0; i--) { + SV ** element_ptr = av_fetch(fence_stack, i, FALSE); + if (! element_ptr) { + } + + PerlIO_printf(Perl_debug_log, "[%d]: %d\n", + (int) i, (int) SvIV(*element_ptr)); + } + } +} + +#endif + #undef IS_OPERATOR #undef IS_OPERAND @@ -15765,8 +15831,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, character; used under /i */ UV n; char * stop_ptr = RExC_end; /* where to stop parsing */ - const bool skip_white = cBOOL(ret_invlist); /* ignore unescaped white - space? */ + + /* ignore unescaped whitespace? */ + const bool skip_white = cBOOL( ret_invlist + || (RExC_flags & RXf_PMf_EXTENDED_MORE)); /* Unicode properties are stored in a swash; this holds the current one * being parsed. If this swash is the only above-latin1 component of the @@ -15911,7 +15979,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, while (1) { if ( posix_warnings - && av_tindex_nomg(posix_warnings) >= 0 + && av_tindex_skip_len_mg(posix_warnings) >= 0 && RExC_parse > not_posix_region_end) { /* Warnings about posix class issues are considered tentative until @@ -15967,7 +16035,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * posix class, and it failed, it was a false alarm, as this * successful one proves */ if ( posix_warnings - && av_tindex_nomg(posix_warnings) >= 0 + && av_tindex_skip_len_mg(posix_warnings) >= 0 && not_posix_region_end >= RExC_parse && not_posix_region_end <= posix_class_end) { @@ -16237,7 +16305,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_parse = e + 1; /* diag_listed_as: Can't find Unicode property definition "%s" */ - vFAIL3utf8f("%s \"%"UTF8f"\"", + vFAIL3utf8f("%s \"%" UTF8f "\"", msg, UTF8fARG(UTF, n, name)); } @@ -16256,7 +16324,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, SAVEFREEPV(name); } } - Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%"UTF8f"%s\n", + Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s%" UTF8f "%s\n", (value == 'p' ? '+' : '!'), (FOLD) ? "__" : "", UTF8fARG(UTF, n, name), @@ -16426,13 +16494,13 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, : 0; if (strict) { vFAIL2utf8f( - "False [] range \"%"UTF8f"\"", + "False [] range \"%" UTF8f "\"", UTF8fARG(UTF, w, rangebegin)); } else { SAVEFREESV(RExC_rx_sv); /* in case of fatal warnings */ ckWARN2reg(RExC_parse, - "False [] range \"%"UTF8f"\"", + "False [] range \"%" UTF8f "\"", UTF8fARG(UTF, w, rangebegin)); (void)ReREFCNT_inc(RExC_rx_sv); cp_list = add_cp_to_invlist(cp_list, '-'); @@ -16620,7 +16688,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif w = RExC_parse - rangebegin; vFAIL2utf8f( - "Invalid [] range \"%"UTF8f"\"", + "Invalid [] range \"%" UTF8f "\"", UTF8fARG(UTF, w, rangebegin)); NOT_REACHED; /* NOTREACHED */ } @@ -16725,7 +16793,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, foldbuf + foldlen); SV* multi_fold = sv_2mortal(newSVpvs("")); - Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value); + Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%" UVXf "}", value); multi_char_matches = add_multi_match(multi_char_matches, @@ -16762,15 +16830,19 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * must be be all digits or all letters of the same case. * Otherwise, the range is non-portable and unclear as to * what it contains */ - if ((isPRINT_A(prevvalue) || isPRINT_A(value)) - && (non_portable_endpoint - || ! ((isDIGIT_A(prevvalue) && isDIGIT_A(value)) - || (isLOWER_A(prevvalue) && isLOWER_A(value)) - || (isUPPER_A(prevvalue) && isUPPER_A(value))))) - { - vWARN(RExC_parse, "Ranges of ASCII printables should be some subset of \"0-9\", \"A-Z\", or \"a-z\""); + if ( (isPRINT_A(prevvalue) || isPRINT_A(value)) + && ( non_portable_endpoint + || ! ( (isDIGIT_A(prevvalue) && isDIGIT_A(value)) + || (isLOWER_A(prevvalue) && isLOWER_A(value)) + || (isUPPER_A(prevvalue) && isUPPER_A(value)) + ))) { + vWARN(RExC_parse, "Ranges of ASCII printables should" + " be some subset of \"0-9\"," + " \"A-Z\", or \"a-z\""); } else if (prevvalue >= 0x660) { /* ARABIC_INDIC_DIGIT_ZERO */ + SSize_t index_start; + SSize_t index_final; /* But the nature of Unicode and languages mean we * can't do the same checks for above-ASCII ranges, @@ -16778,40 +16850,68 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, * contain only digits from the same group of 10. The * ASCII case is handled just above. 0x660 is the * first digit character beyond ASCII. Hence here, the - * range could be a range of digits. Find out. */ - IV index_start = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], - prevvalue); - IV index_final = _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], - value); - - /* If the range start and final points are in the same - * inversion list element, it means that either both - * are not digits, or both are digits in a consecutive - * sequence of digits. (So far, Unicode has kept all - * such sequences as distinct groups of 10, but assert - * to make sure). If the end points are not in the - * same element, neither should be a digit. */ - if (index_start == index_final) { - assert(! ELEMENT_RANGE_MATCHES_INVLIST(index_start) - || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] - - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 10) - /* But actually Unicode did have one group of 11 - * 'digits' in 5.2, so in case we are operating - * on that version, let that pass */ - || (invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start+1] - - invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 11 - && invlist_array(PL_XPosix_ptrs[_CC_DIGIT])[index_start] - == 0x19D0) - ); + * range could be a range of digits. First some + * unlikely special cases. Grandfather in that a range + * ending in 19DA (NEW TAI LUE THAM DIGIT ONE) is bad + * if its starting value is one of the 10 digits prior + * to it. This is because it is an alternate way of + * writing 19D1, and some people may expect it to be in + * that group. But it is bad, because it won't give + * the expected results. In Unicode 5.2 it was + * considered to be in that group (of 11, hence), but + * this was fixed in the next version */ + + if (UNLIKELY(value == 0x19DA && prevvalue >= 0x19D0)) { + goto warn_bad_digit_range; } - else if ((index_start >= 0 - && ELEMENT_RANGE_MATCHES_INVLIST(index_start)) - || (index_final >= 0 - && ELEMENT_RANGE_MATCHES_INVLIST(index_final))) + else if (UNLIKELY( prevvalue >= 0x1D7CE + && value <= 0x1D7FF)) { - vWARN(RExC_parse, "Ranges of digits should be from the same group of 10"); + /* This is the only other case currently in Unicode + * where the algorithm below fails. The code + * points just above are the end points of a single + * range containing only decimal digits. It is 5 + * different series of 0-9. All other ranges of + * digits currently in Unicode are just a single + * series. (And mktables will notify us if a later + * Unicode version breaks this.) + * + * If the range being checked is at most 9 long, + * and the digit values represented are in + * numerical order, they are from the same series. + * */ + if ( value - prevvalue > 9 + || ((( value - 0x1D7CE) % 10) + <= (prevvalue - 0x1D7CE) % 10)) + { + goto warn_bad_digit_range; + } + } + else { + + /* For all other ranges of digits in Unicode, the + * algorithm is just to check if both end points + * are in the same series, which is the same range. + * */ + index_start = _invlist_search( + PL_XPosix_ptrs[_CC_DIGIT], + prevvalue); + + /* Warn if the range starts and ends with a digit, + * and they are not in the same group of 10. */ + if ( index_start >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_start) + && (index_final = + _invlist_search(PL_XPosix_ptrs[_CC_DIGIT], + value)) != index_start + && index_final >= 0 + && ELEMENT_RANGE_MATCHES_INVLIST(index_final)) + { + warn_bad_digit_range: + vWARN(RExC_parse, "Ranges of digits should be" + " from the same group of" + " 10"); + } } } } @@ -16892,7 +16992,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, } /* End of loop through all the text within the brackets */ - if ( posix_warnings && av_tindex_nomg(posix_warnings) >= 0) { + if ( posix_warnings && av_tindex_skip_len_mg(posix_warnings) >= 0) { output_or_return_posix_warnings(pRExC_state, posix_warnings, return_posix_warnings); } @@ -16925,7 +17025,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, #endif /* Look at the longest folds first */ - for (cp_count = av_tindex_nomg(multi_char_matches); + for (cp_count = av_tindex_skip_len_mg(multi_char_matches); cp_count > 0; cp_count--) { @@ -16981,7 +17081,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_adjusted_start = RExC_start + prefix_end; RExC_end = RExC_parse + len; RExC_in_multi_char_class = 1; - RExC_override_recoding = 1; RExC_emit = (regnode *)orig_emit; ret = reg(pRExC_state, 1, ®_flags, depth+1); @@ -16994,7 +17093,6 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, RExC_precomp_adj = 0; RExC_end = save_end; RExC_in_multi_char_class = 0; - RExC_override_recoding = 0; SvREFCNT_dec_NN(multi_char_matches); return ret; } @@ -17309,7 +17407,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, { AV* list = (AV*) *listp; IV k; - for (k = 0; k <= av_tindex_nomg(list); k++) { + for (k = 0; k <= av_tindex_skip_len_mg(list); k++) { SV** c_p = av_fetch(list, k, FALSE); UV c; assert(c_p); @@ -17472,22 +17570,15 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, &nonascii_but_latin1_properties); /* And add them to the final list of such characters. */ - if (has_upper_latin1_only_utf8_matches) { - _invlist_union(has_upper_latin1_only_utf8_matches, - nonascii_but_latin1_properties, - &has_upper_latin1_only_utf8_matches); - SvREFCNT_dec_NN(nonascii_but_latin1_properties); - } - else { - has_upper_latin1_only_utf8_matches - = nonascii_but_latin1_properties; - } + _invlist_union(has_upper_latin1_only_utf8_matches, + nonascii_but_latin1_properties, + &has_upper_latin1_only_utf8_matches); /* Remove them from what now becomes the unconditional list */ _invlist_subtract(posixes, nonascii_but_latin1_properties, &posixes); - /* And the remainder are the unconditional ones */ + /* And add those unconditional ones to the final list */ if (cp_list) { _invlist_union(cp_list, posixes, &cp_list); SvREFCNT_dec_NN(posixes); @@ -17497,8 +17588,11 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth, cp_list = posixes; } + SvREFCNT_dec(nonascii_but_latin1_properties); + /* Get rid of any characters that we now know are matched - * unconditionally from the conditional list */ + * unconditionally from the conditional list, which may make + * that list empty */ _invlist_subtract(has_upper_latin1_only_utf8_matches, cp_list, &has_upper_latin1_only_utf8_matches); @@ -18004,7 +18098,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, si = *ary; /* ary[0] = the string to initialize the swash with */ - if (av_tindex_nomg(av) >= 2) { + if (av_tindex_skip_len_mg(av) >= 2) { if (only_utf8_locale_ptr && ary[2] && ary[2] != &PL_sv_undef) @@ -18020,7 +18114,7 @@ Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog, * is any inversion list generated at compile time; [4] * indicates if that inversion list has any user-defined * properties in it. */ - if (av_tindex_nomg(av) >= 3) { + if (av_tindex_skip_len_mg(av) >= 3) { invlist = ary[3]; if (SvUV(ary[4])) { swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; @@ -18333,7 +18427,7 @@ S_regnode_guts(pTHX_ RExC_state_t *pRExC_state, const U8 op, const STRLEN extra_ #else if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG( - ("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", + ("%s:%d: (op %s) %s %" UVuf " (len %" UVuf ") (max %" UVuf ").\n", name, __LINE__, PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] @@ -18410,9 +18504,17 @@ S_reg2Lanode(pTHX_ RExC_state_t *pRExC_state, const U8 op, const U32 arg1, const - reginsert - insert an operator in front of already-emitted operand * * Means relocating the operand. +* +* IMPORTANT NOTE - it is the *callers* responsibility to correctly +* set up NEXT_OFF() of the inserted node if needed. Something like this: +* +* reginsert(pRExC, OPFAIL, orig_emit, depth+1); +* if (PASS2) +* NEXT_OFF(orig_emit) = regarglen[OPFAIL] + NODE_STEP_REGNODE; +* */ STATIC void -S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *operand, U32 depth) { regnode *src; regnode *dst; @@ -18438,7 +18540,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) dst = RExC_emit; if (RExC_open_parens) { int paren; - /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ + /*DEBUG_PARSE_FMT("inst"," - %" IVdf, (IV)RExC_npar);*/ /* remember that RExC_npar is rex->nparens + 1, * iow it is 1 more than the number of parens seen in * the pattern so far. */ @@ -18446,13 +18548,13 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) /* note, RExC_open_parens[0] is the start of the * regex, it can't move. RExC_close_parens[0] is the end * of the regex, it *can* move. */ - if ( paren && RExC_open_parens[paren] >= opnd ) { + if ( paren && RExC_open_parens[paren] >= operand ) { /*DEBUG_PARSE_FMT("open"," - %d",size);*/ RExC_open_parens[paren] += size; } else { /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ } - if ( RExC_close_parens[paren] >= opnd ) { + if ( RExC_close_parens[paren] >= operand ) { /*DEBUG_PARSE_FMT("close"," - %d",size);*/ RExC_close_parens[paren] += size; } else { @@ -18463,12 +18565,12 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) if (RExC_end_op) RExC_end_op += size; - while (src > opnd) { + while (src > operand) { StructCopy(--src, --dst, regnode); #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", + ("%s(%d): (op %s) %s copy %" UVuf " -> %" UVuf " (max %" UVuf ").\n", "reg_insert", __LINE__, PL_reg_name[op], @@ -18484,11 +18586,11 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) } - place = opnd; /* Op node, where operand used to be. */ + place = operand; /* Op node, where operand used to be. */ #ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG( - ("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", + ("%s(%d): (op %s) %s %" UVuf " <- %" UVuf " (max %" UVuf ").\n", "reginsert", __LINE__, PL_reg_name[op], @@ -18637,7 +18739,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, DEBUG_PARSE_MSG(""); regprop(RExC_rx, RExC_mysv, val, NULL, pRExC_state); Perl_re_printf( aTHX_ - "~ attach to %s (%"IVdf") offset to %"IVdf"\n", + "~ attach to %s (%" IVdf ") offset to %" IVdf "\n", SvPV_nolen_const(RExC_mysv), (IV)REG_NODE_NUM(val), (IV)(val - scan) @@ -18750,14 +18852,14 @@ Perl_regdump(pTHX_ const regexp *r) RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr), RE_SV_DUMPLEN(r->anchored_substr), 30); Perl_re_printf( aTHX_ - "anchored %s%s at %"IVdf" ", + "anchored %s%s at %" IVdf " ", s, RE_SV_TAIL(r->anchored_substr), (IV)r->anchored_offset); } else if (r->anchored_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8), RE_SV_DUMPLEN(r->anchored_utf8), 30); Perl_re_printf( aTHX_ - "anchored utf8 %s%s at %"IVdf" ", + "anchored utf8 %s%s at %" IVdf " ", s, RE_SV_TAIL(r->anchored_utf8), (IV)r->anchored_offset); } @@ -18765,14 +18867,14 @@ Perl_regdump(pTHX_ const regexp *r) RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr), RE_SV_DUMPLEN(r->float_substr), 30); Perl_re_printf( aTHX_ - "floating %s%s at %"IVdf"..%"UVuf" ", + "floating %s%s at %" IVdf "..%" UVuf " ", s, RE_SV_TAIL(r->float_substr), (IV)r->float_min_offset, (UV)r->float_max_offset); } else if (r->float_utf8) { RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8), RE_SV_DUMPLEN(r->float_utf8), 30); Perl_re_printf( aTHX_ - "floating utf8 %s%s at %"IVdf"..%"UVuf" ", + "floating utf8 %s%s at %" IVdf "..%" UVuf " ", s, RE_SV_TAIL(r->float_utf8), (IV)r->float_min_offset, (UV)r->float_max_offset); } @@ -18804,12 +18906,12 @@ Perl_regdump(pTHX_ const regexp *r) Perl_re_printf( aTHX_ " "); } if (r->intflags & PREGf_GPOS_SEEN) - Perl_re_printf( aTHX_ "GPOS:%"UVuf" ", (UV)r->gofs); + Perl_re_printf( aTHX_ "GPOS:%" UVuf " ", (UV)r->gofs); if (r->intflags & PREGf_SKIP) Perl_re_printf( aTHX_ "plus "); if (r->intflags & PREGf_IMPLICIT) Perl_re_printf( aTHX_ "implicit "); - Perl_re_printf( aTHX_ "minlen %"IVdf" ", (IV)r->minlen); + Perl_re_printf( aTHX_ "minlen %" IVdf " ", (IV)r->minlen); if (r->extflags & RXf_EVAL_SEEN) Perl_re_printf( aTHX_ "with eval "); Perl_re_printf( aTHX_ "\n"); @@ -18885,7 +18987,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ PERL_ARGS_ASSERT_REGPROP; - sv_setpvn(sv, "", 0); + SvPVCLEAR(sv); if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */ /* It would be nice to FAIL() here, but this may be called from @@ -18921,9 +19023,11 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); - DEBUG_TRIE_COMPILE_r( + DEBUG_TRIE_COMPILE_r({ + if (trie->jump) + sv_catpvs(sv, "(JUMP)"); Perl_sv_catpvf(aTHX_ sv, - "", + "", (UV)trie->startstate, (IV)trie->statecount-1, /* -1 because of the unused 0 element */ (UV)trie->wordcount, @@ -18932,7 +19036,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ (UV)TRIE_CHARCOUNT(trie), (UV)trie->uniquecharcount ); - ); + }); if ( IS_ANYOF_TRIE(op) || trie->bitmap ) { sv_catpvs(sv, "["); (void) put_charclass_bitmap_innards(sv, @@ -18946,7 +19050,6 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ ); sv_catpvs(sv, "]"); } - } else if (k == CURLY) { U32 lo = ARG1(o), hi = ARG2(o); if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) @@ -18965,7 +19068,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ { AV *name_list= NULL; U32 parno= OP(o) == ACCEPT ? (U32)ARG2L(o) : ARG(o); - Perl_sv_catpvf(aTHX_ sv, "%"UVuf, (UV)parno); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, "%" UVuf, (UV)parno); /* Parenth number */ if ( RXp_PAREN_NAMES(prog) ) { name_list= MUTABLE_AV(progi->data->data[progi->name_list_idx]); } else if ( pRExC_state ) { @@ -18975,7 +19078,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ if ( k != REF || (OP(o) < NREF)) { SV **name= av_fetch(name_list, parno, 0 ); if (name) - Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + Perl_sv_catpvf(aTHX_ sv, " '%" SVf "'", SVfARG(*name)); } else { SV *sv_dat= MUTABLE_SV(progi->data->data[ parno ]); @@ -18984,10 +19087,10 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_ I32 n; if (name) { for ( n=0; nflags) { - Perl_sv_catpvf(aTHX_ sv, ":%"SVf, + Perl_sv_catpvf(aTHX_ sv, ":%" SVf, SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ])))); } #else @@ -19431,12 +19534,8 @@ Perl_regfree_internal(pTHX_ REGEXP * const rx) if (ri->u.offsets) Safefree(ri->u.offsets); /* 20010421 MJD */ #endif - if (ri->code_blocks) { - int n; - for (n = 0; n < ri->num_code_blocks; n++) - SvREFCNT_dec(ri->code_blocks[n].src_regex); - Safefree(ri->code_blocks); - } + if (ri->code_blocks) + S_free_codeblocks(aTHX_ ri->code_blocks); if (ri->data) { int n = ri->data->count; @@ -19653,16 +19752,18 @@ Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param) Copy(ri->program, reti->program, len+1, regnode); - reti->num_code_blocks = ri->num_code_blocks; if (ri->code_blocks) { int n; - Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block, - struct reg_code_block); - Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks, - struct reg_code_block); - for (n = 0; n < ri->num_code_blocks; n++) - reti->code_blocks[n].src_regex = (REGEXP*) - sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param); + Newx(reti->code_blocks, 1, struct reg_code_blocks); + Newx(reti->code_blocks->cb, ri->code_blocks->count, + struct reg_code_block); + Copy(ri->code_blocks->cb, reti->code_blocks->cb, + ri->code_blocks->count, struct reg_code_block); + for (n = 0; n < ri->code_blocks->count; n++) + reti->code_blocks->cb[n].src_regex = (REGEXP*) + sv_dup_inc((SV*)(ri->code_blocks->cb[n].src_regex), param); + reti->code_blocks->count = ri->code_blocks->count; + reti->code_blocks->refcnt = 1; } else reti->code_blocks = NULL; @@ -19793,7 +19894,7 @@ S_re_croak2(pTHX_ bool utf8, const char* pat1,const char* pat2,...) l1 = 512; Copy(message, buf, l1 , char); /* l1-1 to avoid \n */ - Perl_croak(aTHX_ "%"UTF8f, UTF8fARG(utf8, l1-1, buf)); + Perl_croak(aTHX_ "%" UTF8f, UTF8fARG(utf8, l1-1, buf)); } /* XXX Here's a total kludge. But we need to re-enter for swash routines. */ @@ -19845,7 +19946,7 @@ S_put_code_point(pTHX_ SV *sv, UV c) PERL_ARGS_ASSERT_PUT_CODE_POINT; if (c > 255) { - Perl_sv_catpvf(aTHX_ sv, "\\x{%04"UVXf"}", c); + Perl_sv_catpvf(aTHX_ sv, "\\x{%04" UVXf "}", c); } else if (isPRINT(c)) { const char string = (char) c; @@ -20029,10 +20130,10 @@ S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals) : NUM_ANYOF_CODE_POINTS - 1; #if NUM_ANYOF_CODE_POINTS > 256 format = (this_end < 256) - ? "\\x%02"UVXf"-\\x%02"UVXf"" - : "\\x{%04"UVXf"}-\\x{%04"UVXf"}"; + ? "\\x%02" UVXf "-\\x%02" UVXf + : "\\x{%04" UVXf "}-\\x{%04" UVXf "}"; #else - format = "\\x%02"UVXf"-\\x%02"UVXf""; + format = "\\x%02" UVXf "-\\x%02" UVXf; #endif GCC_DIAG_IGNORE(-Wformat-nonliteral); Perl_sv_catpvf(aTHX_ sv, format, start, this_end); @@ -20434,7 +20535,7 @@ S_put_charclass_bitmap_innards(pTHX_ SV *sv, #define CLEAR_OPTSTART \ if (optstart) STMT_START { \ DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ \ - " (%"IVdf" nodes)\n", (IV)(node - optstart))); \ + " (%" IVdf " nodes)\n", (IV)(node - optstart))); \ optstart=NULL; \ } STMT_END @@ -20483,7 +20584,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, CLEAR_OPTSTART; regprop(r, sv, node, NULL, NULL); - Perl_re_printf( aTHX_ "%4"IVdf":%*s%s", (IV)(node - start), + Perl_re_printf( aTHX_ "%4" IVdf ":%*s%s", (IV)(node - start), (int)(2*indent + 1), "", SvPVX_const(sv)); if (OP(node) != OPTIMIZED) { @@ -20493,7 +20594,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, && PL_regkind[OP(next)] != BRANCH ) Perl_re_printf( aTHX_ " (FAIL)"); else - Perl_re_printf( aTHX_ " (%"IVdf")", (IV)(next - start)); + Perl_re_printf( aTHX_ " (%" IVdf ")", (IV)(next - start)); Perl_re_printf( aTHX_ "\n"); } @@ -20528,7 +20629,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, #endif const regnode *nextbranch= NULL; I32 word_idx; - sv_setpvs(sv, ""); + SvPVCLEAR(sv); for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) { SV ** const elem_ptr = av_fetch(trie_words,word_idx,0); @@ -20548,7 +20649,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, ); if (trie->jump) { U16 dist= trie->jump[word_idx+1]; - Perl_re_printf( aTHX_ "(%"UVuf")\n", + Perl_re_printf( aTHX_ "(%" UVuf ")\n", (UV)((dist ? this_trie + dist : next) - start)); if (dist) { if (!nextbranch)