X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/4d919e5b5b77044f49e291c3d7df6a8cc423b9f5..e41ffe51c12c0cad6aa835df69c2ea84774bb0df:/regexec.c diff --git a/regexec.c b/regexec.c index 35f0f51..9155de8 100644 --- a/regexec.c +++ b/regexec.c @@ -87,9 +87,6 @@ #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) -#define RS_init 1 /* eval environment created */ -#define RS_set 2 /* replsv value is set */ - #ifndef STATIC #define STATIC static #endif @@ -122,12 +119,26 @@ #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) /* these are unrolled below in the CCC_TRY_XXX defined */ -#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ - if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)str); assert(ok); LEAVE; } } STMT_END +#ifdef EBCDIC + /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just + * skip the check on EBCDIC platforms */ +# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class) +#else +# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ + if (!CAT2(PL_utf8_,class)) { \ + bool ok; \ + ENTER; save_re_context(); \ + ok=CAT2(is_utf8_,class)((const U8*)str); \ + assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END +#endif /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ - if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END + if (!CAT2(PL_utf8_,class)) { \ + bool throw_away PERL_UNUSED_DECL; \ + ENTER; save_re_context(); \ + throw_away = CAT2(is_utf8_,class)((const U8*)" "); \ + LEAVE; } } STMT_END #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") @@ -150,35 +161,6 @@ LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ -/* - We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test - so that it is possible to override the option here without having to - rebuild the entire core. as we are required to do if we change regcomp.h - which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined. -*/ -#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS -#define BROKEN_UNICODE_CHARCLASS_MAPPINGS -#endif - -#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS -#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS_ALNUM() -#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS_SPACE() -#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS_DIGIT() -#define RE_utf8_perl_word PL_utf8_alnum -#define RE_utf8_perl_space PL_utf8_space -#define RE_utf8_posix_digit PL_utf8_digit -#define perl_word alnum -#define perl_space space -#define posix_digit digit -#else -#define LOAD_UTF8_CHARCLASS_PERL_WORD() LOAD_UTF8_CHARCLASS(perl_word,"a") -#define LOAD_UTF8_CHARCLASS_PERL_SPACE() LOAD_UTF8_CHARCLASS(perl_space," ") -#define LOAD_UTF8_CHARCLASS_POSIX_DIGIT() LOAD_UTF8_CHARCLASS(posix_digit,"0") -#define RE_utf8_perl_word PL_utf8_perl_word -#define RE_utf8_perl_space PL_utf8_perl_space -#define RE_utf8_posix_digit PL_utf8_posix_digit -#endif - #define PLACEHOLDER /* Something for the preprocessor to grab onto */ /* The actual code for CCC_TRY, which uses several variables from the routine @@ -318,13 +300,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -350,25 +332,28 @@ static void restore_pos(pTHX_ void *arg); -#define REGCP_PAREN_ELEMS 4 -#define REGCP_OTHER_ELEMS 5 +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 #define REGCP_FRAME_ELEMS 1 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and * are needed for the regexp context stack bookkeeping. */ STATIC CHECKPOINT -S_regcppush(pTHX_ I32 parenfloor) +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor) { dVAR; const int retval = PL_savestack_ix; const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; - int p; + I32 p; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGCPPUSH; + if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", + paren_elems_to_push); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -377,25 +362,31 @@ S_regcppush(pTHX_ I32 parenfloor) SSGROW(total_elems + REGCP_FRAME_ELEMS); - for (p = PL_regsize; p > parenfloor; p--) { + DEBUG_BUFFERS_r( + if ((int)PL_regsize > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)PL_regsize; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(PL_regoffs[p].end); - SSPUSHINT(PL_regoffs[p].start); - SSPUSHPTR(PL_reg_start_tmp[p]); - SSPUSHINT(p); + SSPUSHINT(rex->offs[p].end); + SSPUSHINT(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, - " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n", - (UV)p, (IV)PL_regoffs[p].start, - (IV)(PL_reg_start_tmp[p] - PL_bostr), - (IV)PL_regoffs[p].end + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end )); } /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ - SSPUSHPTR(PL_regoffs); SSPUSHINT(PL_regsize); - SSPUSHINT(*PL_reglastparen); - SSPUSHINT(*PL_reglastcloseparen); - SSPUSHPTR(PL_reginput); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ return retval; @@ -417,12 +408,12 @@ S_regcppush(pTHX_ I32 parenfloor) (IV)(cp), (IV)PL_savestack_ix)); \ regcpblow(cp) -STATIC char * -S_regcppop(pTHX_ const regexp *rex) +STATIC void +S_regcppop(pTHX_ regexp *rex) { dVAR; UV i; - char *input; + U32 paren; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCPPOP; @@ -431,38 +422,38 @@ S_regcppop(pTHX_ const regexp *rex) i = SSPOPUV; assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ - input = (char *) SSPOPPTR; - *PL_reglastcloseparen = SSPOPINT; - *PL_reglastparen = SSPOPINT; + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; PL_regsize = SSPOPINT; - PL_regoffs=(regexp_paren_pair *) SSPOPPTR; i -= REGCP_OTHER_ELEMS; /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = PL_regsize; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { I32 tmps; - U32 paren = (U32)SSPOPINT; - PL_reg_start_tmp[paren] = (char *) SSPOPPTR; - PL_regoffs[paren].start = SSPOPINT; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPINT; tmps = SSPOPINT; - if (paren <= *PL_reglastparen) - PL_regoffs[paren].end = tmps; - DEBUG_BUFFERS_r( - PerlIO_printf(Perl_debug_log, - " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", - (UV)paren, (IV)PL_regoffs[paren].start, - (IV)(PL_reg_start_tmp[paren] - PL_bostr), - (IV)PL_regoffs[paren].end, - (paren > *PL_reglastparen ? "(no)" : "")); + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); ); + paren--; } - DEBUG_BUFFERS_r( - if (*PL_reglastparen + 1 <= rex->nparens) { - PerlIO_printf(Perl_debug_log, - " restoring \\%"IVdf"..\\%"IVdf" to undef\n", - (IV)(*PL_reglastparen + 1), (IV)rex->nparens); - } - ); #if 1 /* It would seem that the similar code in regtry() * already takes care of this, and in fact it is in @@ -473,13 +464,29 @@ S_regcppop(pTHX_ const regexp *rex) * this code seems to be necessary or otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * --jhi updated by dapm */ - for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { if (i > PL_regsize) - PL_regoffs[i].start = -1; - PL_regoffs[i].end = -1; + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > PL_regsize) ? "-1" : " " + )); } #endif - return input; +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex); + PL_savestack_ix = tmpix; } #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ @@ -575,6 +582,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, I32 ml_anch; register char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ + char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); #ifdef DEBUGGING @@ -583,6 +591,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); RX_MATCH_UTF8_set(rx,utf8_target); @@ -701,6 +711,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { I32 srch_start_shift = start_shift; I32 srch_end_shift = end_shift; + U8* start_point; + U8* end_point; if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { srch_end_shift -= ((strbeg - s) - srch_start_shift); srch_start_shift = strbeg - s; @@ -713,27 +725,6 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, (IV)prog->check_end_shift); }); - if (flags & REXEC_SCREAM) { - I32 p = -1; /* Internal iterator of scream. */ - I32 * const pp = data ? data->scream_pos : &p; - - if (PL_screamfirst[BmRARE(check)] >= 0 - || ( BmRARE(check) == '\n' - && (BmPREVIOUS(check) == SvCUR(check) - 1) - && SvTAIL(check) )) - s = screaminstr(sv, check, - srch_start_shift + (s - strbeg), srch_end_shift, pp, 0); - else - goto fail_finish; - /* we may be pointing at the wrong string */ - if (s && RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); - if (data) - *data->scream_olds = s; - } - else { - U8* start_point; - U8* end_point; if (prog->extflags & RXf_CANY_SEEN) { start_point= (U8*)(s + srch_start_shift); end_point= (U8*)(strend - srch_end_shift); @@ -751,7 +742,6 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, s = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); } - } /* Update the count-of-usability, remove useless subpatterns, unshift s. */ @@ -1089,12 +1079,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, else endpos= strend; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg))); - + if (checked_upto < s) + checked_upto = s; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + t = s; - s = find_byclass(prog, progi->regstclass, s, endpos, NULL); - if (!s) { + s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL); + if (s) { + checked_upto = s; + } else { #ifdef DEBUGGING const char *what = NULL; #endif @@ -1107,6 +1101,9 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, "This position contradicts STCLASS...\n") ); if ((prog->extflags & RXf_ANCH) && !ml_anch) goto fail; + checked_upto = HOPBACKc(endpos, start_shift); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { @@ -1186,58 +1183,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ - trie_type = (scan->flags != EXACT) \ - ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ - : (utf8_target ? trie_utf8 : trie_plain) - -#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ -uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ - switch (trie_type) { \ - case trie_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_latin_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - len = 1; \ - uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_utf8: \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - break; \ - case trie_plain: \ - uvc = (UV)*uc; \ - len = 1; \ - } \ - if (uvc < 256) { \ - charid = trie->charmap[ uvc ]; \ - } \ - else { \ - charid = 0; \ - if (widecharmap) { \ - SV** const svpp = hv_fetch(widecharmap, \ - (char*)&uvc, sizeof(UV), 0); \ - if (svpp) \ - charid = (U16)SvIV(*svpp); \ - } \ - } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ +uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ + STRLEN skiplen; \ + switch (trie_type) { \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ } STMT_END #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ @@ -1442,23 +1442,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, reginclass(prog, c, (U8*)s, &inclasslen, utf8_target)); } else { - while (s < strend) { - STRLEN skip = 1; - - if (REGINCLASS(prog, c, (U8*)s) || - (ANYOF_FOLD_SHARP_S(c, s, strend) && - /* The assignment of 2 is intentional: - * for the folded sharp s, the skip is 2. */ - (skip = SHARP_S_SKIP))) { - if (tmp && (!reginfo || regtry(reginfo, &s))) - goto got_it; - else - tmp = doevery; - } - else - tmp = 1; - s += skip; - } + REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); } break; case CANY: @@ -1479,22 +1463,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTFU: - if (UTF_PATTERN || utf8_target) { - utf8_fold_flags = 0; - goto do_exactf_utf8; - } - fold_array = PL_fold_latin1; - folder = foldEQ_latin1; - /* XXX This uses the full utf8 fold because if the pattern contains - * 'ss' it could match LATIN_SMALL_LETTER SHARP_S in the string. - * There could be a new node type, say EXACTFU_SS, which is - * generated by regcomp only if there is an 'ss', and then every - * other case could goto do_exactf_non_utf8;*/ - goto do_exactf_utf8; - case EXACTF: - if (UTF_PATTERN || utf8_target) { + if (utf8_target) { + + /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1509,10 +1481,32 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } fold_array = PL_fold_locale; folder = foldEQ_locale; + goto do_exactf_non_utf8; + + case EXACTFU_SS: + if (UTF_PATTERN) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU_TRICKYFOLD: + case EXACTFU: + if (UTF_PATTERN || utf8_target) { + utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; + goto do_exactf_utf8; + } + + /* Any 'ss' in the pattern should have been replaced by regcomp, + * so we don't have to worry here about this single special case + * in the Latin1 range */ + fold_array = PL_fold_latin1; + folder = foldEQ_latin1; /* FALL THROUGH */ - do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */ + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ /* The idea in the non-utf8 EXACTF* cases is to first find the * first character of the EXACTF* node and then, if necessary, @@ -1523,6 +1517,11 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, pat_string = STRING(c); ln = STR_LEN(c); /* length to match in octets/bytes */ + /* We know that we have to match at least 'ln' bytes (which is the + * same as characters, since not utf8). If we have to match 3 + * characters, and there are only 2 availabe, we know without + * trying that it will fail; so don't start a match past the + * required minimum number from the far end */ e = HOP3c(strend, -((I32)ln), s); if (!reginfo && e < s) { @@ -1540,6 +1539,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; do_exactf_utf8: + { + unsigned expansion; + /* If one of the operands is in utf8, we can't use the simpler * folding above, due to the fact that many different characters @@ -1552,12 +1554,35 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ? utf8_length((U8 *) pat_string, (U8 *) pat_end) : ln; + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ e = HOP3c(strend, -((I32)lnc), s); if (!reginfo && e < s) { e = s; /* Due to minlen logic of intuit() */ } + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + while (s <= e) { char *my_strend= (char *)strend; if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, @@ -1566,9 +1591,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += UTF8SKIP(s); + s += (utf8_target) ? UTF8SKIP(s) : 1; } break; + } case BOUNDL: PL_reg_flags |= RF_tainted; FBC_BOUND(isALNUM_LC, @@ -1619,15 +1645,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case ALNUMU: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_ALNUM(), + swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), isWORDCHAR_L1((U8) *s) ); break; case ALNUM: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_ALNUM(), + swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), isWORDCHAR((U8) *s) ); break; @@ -1638,15 +1664,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case NALNUMU: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_ALNUM(), + !swash_fetch(PL_utf8_alnum,(U8*)s, utf8_target), ! isWORDCHAR_L1((U8) *s) ); break; case NALNUM: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_ALNUM(), + !swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target), ! isALNUM(*s) ); break; @@ -1664,15 +1690,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case SPACEU: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_SPACE(), - *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_SPACE(), + *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target), isSPACE_L1((U8) *s) ); break; case SPACE: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_SPACE(), - *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_SPACE(), + *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target), isSPACE((U8) *s) ); break; @@ -1689,15 +1715,15 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case NSPACEU: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_SPACE(), - !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + LOAD_UTF8_CHARCLASS_SPACE(), + !( *s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)), ! isSPACE_L1((U8) *s) ); break; case NSPACE: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_SPACE(), - !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + LOAD_UTF8_CHARCLASS_SPACE(), + !(*s == ' ' || swash_fetch(PL_utf8_space,(U8*)s, utf8_target)), ! isSPACE((U8) *s) ); break; @@ -1715,8 +1741,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case DIGIT: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_DIGIT(), + swash_fetch(PL_utf8_digit,(U8*)s, utf8_target), isDIGIT(*s) ); break; @@ -1733,8 +1759,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case NDIGIT: REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), + LOAD_UTF8_CHARCLASS_DIGIT(), + !swash_fetch(PL_utf8_digit,(U8*)s, utf8_target), !isDIGIT(*s) ); break; @@ -2069,7 +2095,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre } PL_reg_flags = 0; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_reg_maxiter = 0; if (RX_UTF8(rx)) @@ -2135,6 +2161,12 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre swap = prog->offs; /* do we need a save destructor here for eval dies? */ Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); } if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { re_scream_pos_data d; @@ -2218,8 +2250,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ s--; } - /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ - while (s < end) { + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ if (*s++ == '\n') { /* don't need PL_utf8skip here */ if (regtry(®info, &s)) goto got_it; @@ -2327,15 +2359,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = end_shift; strend = HOPc(strend, -dontbother); while ( (s <= last) && - ((flags & REXEC_SCREAM) - ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg, - end_shift, &scream_pos, 0)) - : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), (unsigned char*)strend, must, - multiline ? FBMrf_MULTILINE : 0))) ) { - /* we may be pointing at the wrong string */ - if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); + multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); if (HOPc(s, -back_max) > last1) { last1 = HOPc(s, -back_min); @@ -2399,48 +2425,80 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = 0; if (prog->float_substr != NULL || prog->float_utf8 != NULL) { /* Trim the end. */ - char *last; + char *last= NULL; SV* float_real; + STRLEN len; + const char *little; if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); float_real = utf8_target ? prog->float_utf8 : prog->float_substr; - if (flags & REXEC_SCREAM) { - last = screaminstr(sv, float_real, s - strbeg, - end_shift, &scream_pos, 1); /* last one */ - if (!last) - last = scream_olds; /* Only one occurrence. */ - /* we may be pointing at the wrong string */ - else if (RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); - } - else { - STRLEN len; - const char * const little = SvPV_const(float_real, len); - - if (SvTAIL(float_real)) { - if (memEQ(strend - len + 1, little, len - 1)) - last = strend - len + 1; - else if (!multiline) - last = memEQ(strend - len, little, len) - ? strend - len : NULL; - else + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on the end + * due to the presence of something like this: /foo$/ + * where we can match both "foo" and "foo\n" at the end of the string. + * So we have to compare the end of the string first against the float_real + * without the \n and then against the full float_real with the string. + * We have to watch out for cases where the string might be smaller + * than the float_real or the float_real without the \n. + */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place where the full string is located */ goto find_last; - } else { + } + } else { find_last: if (len) last = rninstr(s, strend, little, little + len); else last = strend; /* matching "$" */ - } } - if (last == NULL) { + if (!last) { + /* at one point this block contained a comment which was probably + * incorrect, which said that this was a "should not happen" case. + * Even if it was true when it was written I am pretty sure it is + * not anymore, so I have removed the comment and replaced it with + * this one. Yves */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "%sCan't trim the tail, match fails (should not happen)%s\n", - PL_colors[4], PL_colors[5])); - goto phooey; /* Should not happen! */ + "String does not contain required substring, cannot match.\n" + )); + goto phooey; } dontbother = strend - last + prog->float_min_offset; } @@ -2469,10 +2527,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre goto phooey; got_it: + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); Safefree(swap); RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (RXp_PAREN_NAMES(prog)) (void)hv_iterinit(RXp_PAREN_NAMES(prog)); @@ -2513,10 +2579,16 @@ got_it: phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (swap) { /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); Safefree(prog->offs); prog->offs = swap; } @@ -2525,6 +2597,16 @@ phooey: } +/* Set which rex is pointed to by PL_reg_state, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (PL_reg_state.re_state_eval_setup_done) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + /* - regtry - try match at specific point */ @@ -2542,22 +2624,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) reginfo->cutpoint=NULL; - if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { + if ((prog->extflags & RXf_EVAL_SEEN) + && !PL_reg_state.re_state_eval_setup_done) + { MAGIC *mg; - PL_reg_eval_set = RS_init; - DEBUG_EXECUTE_r(DEBUG_s( - PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", - (IV)(PL_stack_sp - PL_stack_base)); - )); - SAVESTACK_CXPOS(); - cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; - /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ - SAVETMPS; - /* Apparently this is not needed, judging by wantarray. */ - /* SAVEI8(cxstack[cxstack_ix].blk_gimme); - cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ - + PL_reg_state.re_state_eval_setup_done = TRUE; if (reginfo->sv) { /* Make $_ available to executed code. */ if (reginfo->sv != DEFSV) { @@ -2593,16 +2665,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) } #endif } -#ifdef USE_ITHREADS - /* It seems that non-ithreads works both with and without this code. - So for efficiency reasons it seems best not to have the code - compiled when it is not needed. */ - /* This is safe against NULLs: */ - ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); - /* PM_reg_curpm owns a reference to this regexp. */ - ReREFCNT_inc(rx); -#endif - PM_SETRE(PL_reg_curpm, rx); + SET_reg_curpm(rx); PL_reg_oldcurpm = PL_curpm; PL_curpm = PL_reg_curpm; if (RXp_MATCH_COPIED(prog)) { @@ -2621,30 +2684,22 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) prog->subbeg = PL_bostr; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } - DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); +#ifdef DEBUGGING + PL_reg_starttry = *startpos; +#endif prog->offs[0].start = *startpos - PL_bostr; PL_reginput = *startpos; - PL_reglastparen = &prog->lastparen; - PL_reglastcloseparen = &prog->lastcloseparen; prog->lastparen = 0; prog->lastcloseparen = 0; PL_regsize = 0; - PL_regoffs = prog->offs; - if (PL_reg_start_tmpl <= prog->nparens) { - PL_reg_start_tmpl = prog->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } /* XXXX What this code is doing here?!!! There should be no need - to do this again and again, PL_reglastparen should take care of + to do this again and again, prog->lastparen should take care of this! --ilya*/ /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. * Actually, the code in regcppop() (which Ilya may be meaning by - * PL_reglastparen), is not needed at all by the test suite + * prog->lastparen), is not needed at all by the test suite * (op/regexp, op/pat, op/split), but that code is needed otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * Meanwhile, this code *is* needed for the @@ -2653,9 +2708,9 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) * --jhi updated by dapm */ #if 1 if (prog->nparens) { - regexp_paren_pair *pp = PL_regoffs; + regexp_paren_pair *pp = prog->offs; register I32 i; - for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { ++pp; pp->start = -1; pp->end = -1; @@ -2664,7 +2719,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) #endif REGCP_SET(lastcp); if (regmatch(reginfo, progi->program + 1)) { - PL_regoffs[0].end = PL_reginput - PL_bostr; + prog->offs[0].end = PL_reginput - PL_bostr; return 1; } if (reginfo->cutpoint) @@ -3000,8 +3055,8 @@ S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; for ( n=0; n= nums[n] && - PL_regoffs[nums[n]].end != -1) + if ((I32)rex->lastparen >= nums[n] && + rex->offs[nums[n]].end != -1) { return nums[n]; } @@ -3029,10 +3084,6 @@ S_clear_backtrack_stack(pTHX_ void *p) } -#define SETREX(Re1,Re2) \ - if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \ - Re1 = (Re2) - STATIC I32 /* 0 failure, 1 success */ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { @@ -3094,10 +3145,23 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) false: plain (?=foo) true: used as a condition: (?(?=foo)) */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + #ifdef DEBUGGING GET_RE_DEBUG_FLAGS_DECL; #endif + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + + PERL_ARGS_ASSERT_REGMATCH; DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ @@ -3147,10 +3211,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) reenter_switch: - assert(PL_reglastparen == &rex->lastparen); - assert(PL_reglastcloseparen == &rex->lastcloseparen); - assert(PL_regoffs == rex->offs); - switch (state_num) { case BOL: if (locinput == PL_bostr) @@ -3177,14 +3237,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case KEEPS: /* update the startpoint */ - st->u.keeper.val = PL_regoffs[0].start; + st->u.keeper.val = rex->offs[0].start; PL_reginput = locinput; - PL_regoffs[0].start = locinput - PL_bostr; + rex->offs[0].start = locinput - PL_bostr; PUSH_STATE_GOTO(KEEPS_next, next); /*NOT-REACHED*/ case KEEPS_next_fail: /* rollback the start point change */ - PL_regoffs[0].start = st->u.keeper.val; + rex->offs[0].start = st->u.keeper.val; sayNO_SILENT; /*NOT-REACHED*/ case EOL: @@ -3240,16 +3300,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !utf8_target) { - if(!ANYOF_BITMAP_TEST(scan, *locinput)) { - DEBUG_EXECUTE_r( - PerlIO_printf(Perl_debug_log, - "%*s %sfailed to match trie start class...%s\n", - REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) - ); - sayNO_SILENT; - /* NOTREACHED */ - } + if(!ANYOF_BITMAP_TEST(scan, *locinput)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + /* NOTREACHED */ } /* FALL THROUGH */ case TRIE: @@ -3307,9 +3365,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; - if (trie->bitmap && trie_type != trie_utf8_fold && - !TRIE_BITMAP_TEST(trie,*locinput) - ) { + if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3338,7 +3394,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U32 charcount = 0; /* how many input chars we have matched */ U32 accepted = 0; /* have we seen any accepting states? */ - ST.B = next; ST.jump = trie->jump; ST.me = scan; ST.firstpos = NULL; @@ -3442,9 +3497,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case TRIE_next_fail: /* we failed - try next alternative */ if ( ST.jump) { REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; + for (n = rex->lastparen; n > ST.lastparen; n--) + rex->offs[n].end = -1; + rex->lastparen = n; + rex->lastcloseparen = ST.lastcloseparen; } if (!--ST.accepted) { DEBUG_EXECUTE_r({ @@ -3478,7 +3534,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } if ( ST.jump) { - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; REGCP_SET(ST.cp); } @@ -3535,9 +3592,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PL_reginput = (char *)uc; } - scan = (ST.jump && ST.jump[ST.nextword]) - ? ST.me + ST.jump[ST.nextword] - : ST.B; + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); DEBUG_EXECUTE_r({ PerlIO_printf( Perl_debug_log, @@ -3643,10 +3700,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) fold_utf8_flags = FOLDEQ_UTF8_LOCALE; goto do_exactf; + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = foldEQ_latin1; fold_array = PL_fold_latin1; - fold_utf8_flags = 0; + fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; case EXACTFA: @@ -3664,26 +3723,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || UTF_PATTERN) { - /* Either target or the pattern are utf8. */ + if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ const char * const l = locinput; char *e = PL_regeol; if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN), - l, &e, 0, utf8_target, fold_utf8_flags)) { - /* One more case for the sharp s: - * pack("U0U*", 0xDF) =~ /ss/i, - * the 0xC3 0x9F are the UTF-8 - * byte sequence for the U+00DF. */ - - if (!(utf8_target && - toLOWER(s[0]) == 's' && - ln >= 2 && - toLOWER(s[1]) == 's' && - (U8)l[0] == 0xC3 && - e - l >= 2 && - (U8)l[1] == 0x9F)) - sayNO; + l, &e, 0, utf8_target, fold_utf8_flags)) + { + sayNO; } locinput = e; nextchr = UCHARAT(locinput); @@ -3719,7 +3768,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case NBOUNDU: case NBOUNDA: /* was last char in word? */ - if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) { + if (utf8_target + && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET + && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET) + { if (locinput == PL_bostr) ln = '\n'; else { @@ -3766,6 +3818,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) n = isALNUM(nextchr); break; case REGEX_ASCII_RESTRICTED_CHARSET: + case REGEX_ASCII_MORE_RESTRICTED_CHARSET: ln = isWORDCHAR_A(ln); n = isWORDCHAR_A(nextchr); break; @@ -3808,18 +3861,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8, ALNUMU, NALNUMU, isWORDCHAR_L1, ALNUMA, NALNUMA, isWORDCHAR_A, - perl_word, "a"); + alnum, "a"); CCC_TRY_U(SPACE, NSPACE, isSPACE, SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8, SPACEU, NSPACEU, isSPACE_L1, SPACEA, NSPACEA, isSPACE_A, - perl_space, " "); + space, " "); CCC_TRY(DIGIT, NDIGIT, isDIGIT, DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8, DIGITA, NDIGITA, isDIGIT_A, - posix_digit, "0"); + digit, "0"); case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ @@ -3865,17 +3918,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) L* (L | LVT T* | V V* T* | LV V* T*) That means that if we have seen any L's at all we can quit - there, but if the next character is a LVT, a V or and LV we + there, but if the next character is an LVT, a V, or an LV we should keep going. There is a subtlety with Prepend* which showed up in testing. Note that the Begin, and only the Begin is required in: | Prepend* Begin Extend* - Also, Begin contains '! Control'. A Prepend must be a '! - Control', which means it must be a Begin. What it comes down to - is that if we match Prepend* and then find no suitable Begin - afterwards, that if we backtrack the last Prepend, that one will - be a suitable Begin. + Also, Begin contains '! Control'. A Prepend must be a + '! Control', which means it must also be a Begin. What it + comes down to is that if we match Prepend* and then find no + suitable Begin afterwards, that if we backtrack the last + Prepend, that one will be a suitable Begin. */ if (locinput >= PL_regeol) @@ -3885,7 +3938,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match either CR LF or '.', as all the other possibilities * require utf8 */ locinput++; /* Match the . or CR */ - if (nextchr == '\r' + if (nextchr == '\r' /* And if it was CR, and the next is LF, + match the LF */ && locinput < PL_regeol && UCHARAT(locinput) == '\n') locinput++; } @@ -4126,11 +4180,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) n = ARG(scan); /* which paren pair */ do_nref_ref_common: - ln = PL_regoffs[n].start; + ln = rex->offs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ - if (*PL_reglastparen < n || ln == -1) + if (rex->lastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ - if (ln == PL_regoffs[n].end) + if (ln == rex->offs[n].end) break; s = PL_bostr + ln; @@ -4144,7 +4198,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * not going off the end given by PL_regeol, and returns in * limit upon success, how much of the current input was * matched */ - if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target, + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) { sayNO; @@ -4159,7 +4213,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) (type == REF || UCHARAT(s) != fold_array[nextchr])) sayNO; - ln = PL_regoffs[n].end - ln; + ln = rex->offs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF @@ -4200,7 +4254,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re_sv = rex_sv; re = rex; rei = rexi; - (void)ReREFCNT_inc(rex_sv); if (OP(scan)==GOSUB) { startpoint = scan + ARG2L(scan); ST.close_paren = ARG(scan); @@ -4219,13 +4272,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } { /* execute the code in the {...} */ + dSP; - SV ** const before = SP; - OP_4tree * const oop = PL_op; + SV ** before; + OP * const oop = PL_op; COP * const ocurcop = PL_curcop; - PAD *old_comppad; + OP *nop; char *saved_regeol = PL_regeol; struct re_save_state saved_state; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0); + REGCP_SET(runops_cp); /* To not corrupt the existing regex state while executing the * eval we would normally put it on the save stack, like with @@ -4243,18 +4302,99 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) */ Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + PL_reg_state.re_reparsing = FALSE; + + if (!caller_cv) + caller_cv = find_runcv(NULL); + n = ARG(scan); - PL_op = (OP_4tree*)rexi->data->data[n]; + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = ((struct regexp *)SvANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = o->op_sibling; + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, - " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); - PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]); - PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + I32 depth = (newcv == caller_cv) ? 0 : 1; + if (last_pushed_cv) { + CHANGE_MULTICALL_WITHDEPTH(newcv, depth); + } + else { + PUSH_MULTICALL_WITHDEPTH(newcv, depth); + } + last_pushed_cv = newcv; + } + last_pad = PL_comppad; + + rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); sv_setsv(sv_mrk, sv_yes_mark); } + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = SP; + PL_op = nop; CALLRUNOPS(aTHX); /* Scalar context. */ SPAGAIN; if (SP == before) @@ -4266,12 +4406,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ PL_op = oop; - PAD_RESTORE_LOCAL(old_comppad); PL_curcop = ocurcop; PL_regeol = saved_regeol; if (!logical) { /* /(?{...})/ */ + /* restore all paren positions. Note that where the + * return value is used, we must delay this as the + * returned string to be compiled may be $1 for + * example */ + S_regcp_restore(aTHX_ rex, runops_cp); sv_setsv(save_scalar(PL_replgv), ret); break; } @@ -4291,25 +4439,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) rx = (REGEXP*) sv; } else if (SvSMAGICAL(sv)) { mg = mg_find(sv, PERL_MAGIC_qr); - assert(mg); } } else if (SvTYPE(ret) == SVt_REGEXP) { rx = (REGEXP*) ret; } else if (SvSMAGICAL(ret)) { - if (SvGMAGICAL(ret)) { - /* I don't believe that there is ever qr magic - here. */ - assert(!mg_find(ret, PERL_MAGIC_qr)); - sv_unmagic(ret, PERL_MAGIC_qr); - } - else { - mg = mg_find(ret, PERL_MAGIC_qr); - /* testing suggests mg only ends up non-NULL for - scalars who were upgraded and compiled in the - else block below. In turn, this is only - triggered in the "postponed utf8 string" tests - in t/op/pat.t */ - } + mg = mg_find(ret, PERL_MAGIC_qr); } if (mg) { @@ -4335,7 +4469,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) const char *const p = SvPV(ret, len); ret = newSVpvn_flags(p, len, SVs_TEMP); } - rx = CALLREGCOMP(ret, pm_flags); + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + rx = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, 0, pm_flags); + if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY | SVs_GMG))) { @@ -4345,6 +4487,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0); } PL_regsize = osize; + /* safe to do now that any $1 etc has been + * interpolated into the new pattern string and + * compiled */ + S_regcp_restore(aTHX_ rex, runops_cp); } re_sv = rx; re = (struct regexp *)SvANY(rx); @@ -4359,25 +4505,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - /* borrowed from regtry */ - if (PL_reg_start_tmpl <= re->nparens) { - PL_reg_start_tmpl = re->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } eval_recurse_doit: /* Share code with GOSUB below this line */ /* run the pattern returned from (??{...}) */ - ST.cp = regcppush(0); /* Save *all* the positions. */ + ST.cp = regcppush(rex, 0); /* Save *all* the positions. */ REGCP_SET(ST.lastcp); - PL_regoffs = re->offs; /* essentially NOOP on GOSUB */ - - /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */ - PL_reglastparen = &re->lastparen; - PL_reglastcloseparen = &re->lastcloseparen; re->lastparen = 0; re->lastcloseparen = 0; @@ -4396,7 +4529,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.prev_rex = rex_sv; ST.prev_curlyx = cur_curlyx; - SETREX(rex_sv,re_sv); + rex_sv = re_sv; + SET_reg_curpm(rex_sv); rex = re; rexi = rei; cur_curlyx = NULL; @@ -4409,6 +4543,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } /* logical is 1, /(?(?{...})X|Y)/ */ sw = cBOOL(SvTRUE(ret)); + S_regcp_restore(aTHX_ rex, runops_cp); logical = 0; break; } @@ -4416,20 +4551,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EVAL_AB: /* cleanup after a successful (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); regcpblow(ST.cp); cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; - /* also update PL_regoffs */ - PL_regoffs = rex->offs; - /* XXXX This is too dramatic a measure... */ PL_reg_maxiter = 0; if ( nochange_depth ) @@ -4440,13 +4569,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; PL_reginput = locinput; REGCP_UNWIND(ST.lastcp); @@ -4462,20 +4588,41 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case OPEN: n = ARG(scan); /* which paren pair */ - PL_reg_start_tmp[n] = locinput; + rex->offs[n].start_tmp = locinput - PL_bostr; if (n > PL_regsize) PL_regsize = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)PL_regsize + )); lastopen = n; break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - PL_bostr; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + case CLOSE: n = ARG(scan); /* which paren pair */ - PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if (cur_eval && cur_eval->u.eval.close_paren == n) { goto fake_end; } @@ -4490,14 +4637,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if ( OP(cursor)==CLOSE ){ n = ARG(cursor); if ( n <= lastopen ) { - PL_regoffs[n].start - = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if ( n == ARG(scan) || (cur_eval && cur_eval->u.eval.close_paren == n)) break; @@ -4509,7 +4654,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /*NOTREACHED*/ case GROUPP: n = ARG(scan); /* which paren pair */ - sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1); + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; case NGROUPP: /* reg_check_named_buff_matched returns 0 for no match */ @@ -4632,8 +4777,8 @@ NULL /* XXXX Probably it is better to teach regpush to support parenfloor > PL_regsize... */ - if (parenfloor > (I32)*PL_reglastparen) - parenfloor = *PL_reglastparen; /* Pessimization... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ ST.prev_curlyx= cur_curlyx; cur_curlyx = st; @@ -4693,7 +4838,10 @@ NULL /* First just match a string of min A's. */ if (n < min) { + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; + REGCP_SET(ST.lastcp); + PUSH_STATE_GOTO(WHILEM_A_pre, A); /* NOTREACHED */ } @@ -4766,7 +4914,7 @@ NULL if (cur_curlyx->u.curlyx.minmod) { ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; - ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B); /* NOTREACHED */ @@ -4775,7 +4923,7 @@ NULL /* Prefer A over B for maximal matching. */ if (n < max) { /* More greed allowed? */ - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_max, A); @@ -4799,10 +4947,10 @@ NULL /* NOTREACHED */ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ - REGCP_UNWIND(ST.lastcp); - regcppop(rex); /* FALL THROUGH */ case WHILEM_A_pre_fail: /* just failed to match even minimal A */ + REGCP_UNWIND(ST.lastcp); + regcppop(rex); cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; @@ -4822,8 +4970,9 @@ NULL && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", REG_INFTY - 1); } @@ -4846,8 +4995,8 @@ NULL { PL_reg_flags |= RF_warned; Perl_warner(aTHX_ packWARN(WARN_REGEXP), - "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + "Complex regular subexpression recursion " + "limit (%d) exceeded", REG_INFTY - 1); } cur_curlyx->u.curlyx.count--; @@ -4860,7 +5009,7 @@ NULL /* Try grabbing another A and see if it helps. */ PL_reginput = locinput; cur_curlyx->u.curlyx.lastloc = locinput; - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS); @@ -4878,7 +5027,8 @@ NULL case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; ST.next_branch = next; REGCP_SET(ST.cp); PL_reginput = locinput; @@ -4912,10 +5062,10 @@ NULL no_final = 0; } REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; - /*dmq: *PL_reglastcloseparen = n; */ + for (n = rex->lastparen; n > ST.lastparen; n--) + rex->offs[n].end = -1; + rex->lastparen = n; + rex->lastcloseparen = ST.lastcloseparen; scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { @@ -4949,13 +5099,14 @@ NULL ST.me = scan; scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + /* if paren positive, emulate an OPEN/CLOSE around A */ if (ST.me->flags) { U32 paren = ST.me->flags; if (paren > PL_regsize) PL_regsize = paren; - if (paren > *PL_reglastparen) - *PL_reglastparen = paren; scan += NEXT_OFF(scan); /* Skip former OPEN. */ } ST.A = scan; @@ -5049,6 +5200,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; @@ -5079,16 +5232,18 @@ NULL } if (ST.me->flags) { - /* mark current A as captured */ + /* emulate CLOSE: mark current A as captured */ I32 paren = ST.me->flags; if (ST.count) { - PL_regoffs[paren].start + rex->offs[paren].start = HOPc(PL_reginput, -ST.alen) - PL_bostr; - PL_regoffs[paren].end = PL_reginput - PL_bostr; - /*dmq: *PL_reglastcloseparen = paren; */ + rex->offs[paren].end = PL_reginput - PL_bostr; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; } else - PL_regoffs[paren].end = -1; + rex->offs[paren].end = -1; if (cur_eval && cur_eval->u.eval.close_paren && cur_eval->u.eval.close_paren == (U32)ST.me->flags) { @@ -5104,6 +5259,8 @@ NULL case CURLYM_B_fail: /* just failed to match a B */ REGCP_UNWIND(ST.cp); + rex->lastparen = ST.lastparen; + rex->lastcloseparen = ST.lastcloseparen; if (ST.minmod) { I32 max = ARG2(ST.me); if (max != REG_INFTY && ST.count == max) @@ -5123,12 +5280,17 @@ NULL #define CURLY_SETPAREN(paren, success) \ if (paren) { \ if (success) { \ - PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \ - PL_regoffs[paren].end = locinput - PL_bostr; \ - *PL_reglastcloseparen = paren; \ + rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \ + rex->offs[paren].end = locinput - PL_bostr; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ } \ - else \ - PL_regoffs[paren].end = -1; \ } case STAR: /* /A*B/ where A is width 1 */ @@ -5145,10 +5307,10 @@ NULL goto repeat; case CURLYN: /* /(A){m,n}B/ where A is width 1 */ ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; if (ST.paren > PL_regsize) PL_regsize = ST.paren; - if (ST.paren > *PL_reglastparen) - *PL_reglastparen = ST.paren; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ if (cur_eval && cur_eval->u.eval.close_paren && @@ -5203,6 +5365,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; break; @@ -5210,25 +5374,12 @@ NULL } else { /* UTF_PATTERN */ if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) { - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; - U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; + STRLEN ulen; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - to_utf8_lower((U8*)s, tmpbuf1, &ulen1); - to_utf8_upper((U8*)s, tmpbuf2, &ulen2); -#ifdef EBCDIC - ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0, - ckWARN(WARN_UTF8) ? - 0 : UTF8_ALLOW_ANY); - ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0, - ckWARN(WARN_UTF8) ? - 0 : UTF8_ALLOW_ANY); -#else - ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0, + to_utf8_fold((U8*)s, tmpbuf, &ulen); + ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0, uniflags); - ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0, - uniflags); -#endif } else { ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, @@ -5303,8 +5454,6 @@ NULL case CURLY_B_min_known_fail: /* failed to find B in a non-greedy match where c1,c2 valid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; PL_reginput = locinput; /* Could be reset... */ REGCP_UNWIND(ST.cp); @@ -5381,8 +5530,6 @@ NULL case CURLY_B_min_fail: /* failed to find B in a non-greedy match where c1,c2 invalid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); /* failed -- move forward one */ @@ -5428,8 +5575,6 @@ NULL /* FALL THROUGH */ case CURLY_B_max_fail: /* failed to find B in a greedy match */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); /* back up. */ @@ -5444,32 +5589,24 @@ NULL fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ - I32 tmpix; st->u.eval.toggle_reg_flags = cur_eval->u.eval.toggle_reg_flags; PL_reg_flags ^= st->u.eval.toggle_reg_flags; st->u.eval.prev_rex = rex_sv; /* inner */ - SETREX(rex_sv,cur_eval->u.eval.prev_rex); + st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */ + rex_sv = cur_eval->u.eval.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); cur_curlyx = cur_eval->u.eval.prev_curlyx; - ReREFCNT_inc(rex_sv); - st->u.eval.cp = regcppush(0); /* Save *all* the positions. */ - - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; REGCP_SET(st->u.eval.lastcp); PL_reginput = locinput; /* Restore parens of the outer rex without popping the * savestack */ - tmpix = PL_savestack_ix; - PL_savestack_ix = cur_eval->u.eval.lastcp; - regcppop(rex); - PL_savestack_ix = tmpix; + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp); st->u.eval.prev_eval = cur_eval; cur_eval = cur_eval->u.eval.prev_eval; @@ -5671,27 +5808,6 @@ NULL sayNO; /* NOTREACHED */ #undef ST - case FOLDCHAR: - n = ARG(scan); - if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { - locinput += ln; - } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) { - sayNO; - } else { - U8 folded[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const char * const l = locinput; - char *e = PL_regeol; - to_uni_fold(n, folded, &foldlen); - - if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1, - l, &e, 0, utf8_target)) { - sayNO; - } - locinput = e; - } - nextchr = UCHARAT(locinput); - break; case LNBREAK: if ((n=is_LNBREAK(locinput,utf8_target))) { locinput += n; @@ -5702,6 +5818,8 @@ NULL #define CASE_CLASS(nAmE) \ case nAmE: \ + if (locinput >= PL_regeol) \ + sayNO; \ if ((n=is_##nAmE(locinput,utf8_target))) { \ locinput += n; \ nextchr = UCHARAT(locinput); \ @@ -5709,6 +5827,8 @@ NULL sayNO; \ break; \ case N##nAmE: \ + if (locinput >= PL_regeol) \ + sayNO; \ if ((n=is_##nAmE(locinput,utf8_target))) { \ sayNO; \ } else { \ @@ -5834,7 +5954,7 @@ yes: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { /* each successfully executed (?{...}) block does the equivalent of * local $^R = do {...} * When popping the save stack, all these locals would be undone; @@ -5900,6 +6020,12 @@ no_silent: sv_setsv(sv_mrk, sv_yes_mark); } + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + } + /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); @@ -5983,7 +6109,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) /* Here, the string is utf8, and the pattern char is different * in utf8 than not, so can't compare them directly. Outside the - * loop, find find the two utf8 bytes that represent c, and then + * loop, find the two utf8 bytes that represent c, and then * look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); @@ -6009,8 +6135,13 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) goto do_exactf; case EXACTF: + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: - utf8_flags = 0; + utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; /* The comments for the EXACT case above apply as well to these fold * ones */ @@ -6019,7 +6150,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) c = (U8)*STRING(p); assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - if (utf8_target) { /* Use full Unicode fold matching */ + if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */ char *tmpeol = loceol; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, @@ -6050,6 +6181,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) switch (OP(p)) { case EXACTF: folded = PL_fold[c]; break; case EXACTFA: + case EXACTFU_TRICKYFOLD: case EXACTFU: folded = PL_fold_latin1[c]; break; case EXACTFL: folded = PL_fold_locale[c]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); @@ -6450,20 +6582,41 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) /* -- regclass_swash - prepare the utf8 swash -*/ - +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one + */ SV * Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) { + PERL_ARGS_ASSERT_REGCLASS_SWASH; + return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp)); +} +#endif + +STATIC SV * +S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + /* Returns the swash for the input 'node' in the regex 'prog'. + * If is true, will attempt to create the swash if not already + * done. + * If is non-null, will return the swash initialization string in + * it. + * If is non-null, will return the alternates to the regular swash + * in it + * Tied intimately to how regcomp.c sets up the data structure */ + dVAR; SV *sw = NULL; SV *si = NULL; SV *alt = NULL; + SV* invlist = NULL; + RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_REGCLASS_SWASH; + PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; + + assert(ANYOF_NONBITMAP(node)); if (data && data->count) { const U32 n = ARG(node); @@ -6472,34 +6625,82 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool SV * const rv = MUTABLE_SV(data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); - SV **a, **b; + bool invlist_has_user_defined_property; - /* See the end of regcomp.c:S_regclass() for - * documentation of these array elements. */ - - si = *ary; - a = SvROK(ary[1]) ? &ary[1] : NULL; - b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL; + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_len(av) >= 3) { + invlist = ary[3]; + invlist_has_user_defined_property = cBOOL(SvUV(ary[4])); + } + else { + invlist = NULL; + invlist_has_user_defined_property = FALSE; + } - if (a) - sw = *a; + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (SvROK(ary[1])) { + sw = ary[1]; + } else if (si && doinit) { - sw = swash_init("utf8", "", si, 1, 0); + + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + FALSE, /* is error if can't find + property */ + invlist, + invlist_has_user_defined_property); (void)av_store(av, 1, sw); } - if (b) - alt = *b; + + /* Element [2] is for any multi-char folds. Note that is a + * fundamentally flawed design, because can't backtrack and try + * again. See [perl #89774] */ + if (SvTYPE(ary[2]) == SVt_PVAV) { + alt = ary[2]; + } } } - if (listsvp) - *listsvp = si; + if (listsvp) { + SV* matches_string = newSVpvn("", 0); + SV** invlistsvp; + + /* Use the swash, if any, which has to have incorporated into it all + * possibilities */ + if ( sw + && SvROK(sw) + && SvTYPE(SvRV(sw)) == SVt_PVHV + && (invlistsvp = hv_fetchs(MUTABLE_HV(SvRV(sw)), "INVLIST", FALSE))) + { + invlist = *invlistsvp; + } + else if (si && si != &PL_sv_undef) { + + /* If no swash, use the input nitialization string, if available */ + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + if (altsvp) *altsvp = alt; return sw; } -#endif /* - reginclass - determine if a character falls into a character class @@ -6590,8 +6791,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || @@ -6608,8 +6809,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c)) ) /* How's that for a conditional? */ ) { match = TRUE; @@ -6620,19 +6821,25 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that. Locale nodes specifiy completely the * behavior of code points in the bit map (otherwise, a utf8 target would - * cause them to be treated as Unicode and not locale), except XXX in + * cause them to be treated as Unicode and not locale), except in * the very unlikely event when this node is a synthetic start class, which - * could be a combination of locale and non-locale nodes */ + * could be a combination of locale and non-locale nodes. So allow locale + * to match for the synthetic start class, which will give a false + * positive that will be resolved when the match is done again as not part + * of the synthetic start class */ if (!match) { if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { match = TRUE; /* Everything above 255 matches */ } - else if ((flags & ANYOF_NONBITMAP_NON_UTF8 - || (utf8_target && ANYOF_NONBITMAP(n) - && (c >=256 || ! (flags & ANYOF_LOCALE))))) + else if (ANYOF_NONBITMAP(n) + && ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target + && (c >=256 + || (! (flags & ANYOF_LOCALE)) + || (flags & ANYOF_IS_SYNTHETIC))))) { AV *av; - SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); + SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av); if (sw) { U8 * utf8_p; @@ -6662,31 +6869,19 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, else if (flags & ANYOF_LOC_NONBITMAP_FOLD) { /* Here, we need to test if the fold of the target string - * matches. In the case of a multi-char fold that is - * caught by regcomp.c, it has stored all such folds into - * 'av'; we linearly check to see if any match the target - * string (folded). We know that the originals were each - * one character, but we don't currently know how many - * characters/bytes each folded to, except we do know that - * there are small limits imposed by Unicode. XXX A - * performance enhancement would be to have regcomp.c store - * the max number of chars/bytes that are in an av entry, - * as, say the 0th element. Even better would be to have a - * hash of the few characters that can start a multi-char - * fold to the max number of chars of those folds. - * - * Further down, if there isn't a - * match in the av, we will check if there is another - * fold-type match. For that, we also need the fold, but - * only the first character. No sense in folding it twice, - * so we do it here, even if there isn't any multi-char - * fold, so we always fold at least the first character. - * If the node is a straight ANYOF node, or there is only - * one character available in the string, or if there isn't - * any av, that's all we have to fold. In the case of a - * multi-char fold, we do have guarantees in Unicode that - * it can only expand up to so many characters and so many - * bytes. We keep track so don't exceed either. + * matches. The non-multi char folds have all been moved to + * the compilation phase, and the multi-char folds have + * been stored by regcomp into 'av'; we linearly check to + * see if any match the target string (folded). We know + * that the originals were each one character, but we don't + * currently know how many characters/bytes each folded to, + * except we do know that there are small limits imposed by + * Unicode. XXX A performance enhancement would be to have + * regcomp.c store the max number of chars/bytes that are + * in an av entry, as, say the 0th element. Even better + * would be to have a hash of the few characters that can + * start a multi-char fold to the max number of chars of + * those folds. * * If there is a match, we will need to advance (if lenp is * specified) the match pointer in the target string. But @@ -6696,28 +6891,34 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, * create a map so that we know how many bytes in the * source to advance given that we have matched a certain * number of bytes in the fold. This map is stored in - * 'map_fold_len_back'. The first character in the fold - * has array element 1 contain the number of bytes in the - * source that folded to it; the 2nd is the cumulative - * number to match it; ... */ - U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND+1] = { 0 }; + * 'map_fold_len_back'. Let n mean the number of bytes in + * the fold of the first character that we are folding. + * Then map_fold_len_back[n] is set to the number of bytes + * in that first character. Similarly let m be the + * corresponding number for the second character to be + * folded. Then map_fold_len_back[n+m] is set to the + * number of bytes occupied by the first two source + * characters. ... */ + U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 }; U8 folded[UTF8_MAXBYTES_CASE+1]; STRLEN foldlen = 0; /* num bytes in fold of 1st char */ - STRLEN foldlen_for_av; /* num bytes in fold of all chars */ + STRLEN total_foldlen = 0; /* num bytes in fold of all + chars */ if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) { /* Here, only need to fold the first char of the target - * string */ + * string. It the source wasn't utf8, is 1 byte long */ to_utf8_fold(utf8_p, folded, &foldlen); - foldlen_for_av = foldlen; - map_fold_len_back[1] = UTF8SKIP(utf8_p); + total_foldlen = foldlen; + map_fold_len_back[foldlen] = (utf8_target) + ? UTF8SKIP(utf8_p) + : 1; } else { /* Here, need to fold more than the first char. Do so * up to the limits */ - UV which_char = 0; U8* source_ptr = utf8_p; /* The source for the fold is the regex target string */ @@ -6725,8 +6926,10 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, U8* e = utf8_p + maxlen; /* Can't go beyond last available byte in the target string */ - while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND - && source_ptr < e) + U8 i; + for (i = 0; + i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e; + i++) { /* Fold the next character */ @@ -6744,125 +6947,57 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, break; } - /* Save the first character's folded length, in - * case we have to use it later */ - if (! foldlen) { - foldlen = this_char_foldlen; - } - - /* Here, add the fold of this character */ + /* Add the fold of this character */ Copy(this_char_folded, folded_ptr, this_char_foldlen, U8); - which_char++; - map_fold_len_back[which_char] = - map_fold_len_back[which_char - 1] - + UTF8SKIP(source_ptr); - folded_ptr += this_char_foldlen; source_ptr += UTF8SKIP(source_ptr); + folded_ptr += this_char_foldlen; + total_foldlen = folded_ptr - folded; + + /* Create map from the number of bytes in the fold + * back to the number of bytes in the source. If + * the source isn't utf8, the byte count is just + * the number of characters so far */ + map_fold_len_back[total_foldlen] + = (utf8_target) + ? source_ptr - utf8_p + : i + 1; } *folded_ptr = '\0'; - foldlen_for_av = folded_ptr - folded; } /* Do the linear search to see if the fold is in the list - * of multi-char folds. (Useless to look if won't be able - * to store that it is a multi-char fold in *lenp) */ - if (lenp && av) { + * of multi-char folds. */ + if (av) { I32 i; for (i = 0; i <= av_len(av); i++) { SV* const sv = *av_fetch(av, i, FALSE); STRLEN len; const char * const s = SvPV_const(sv, len); - if (len <= foldlen_for_av && memEQ(s, - (char*)folded, - len)) + + if (len <= total_foldlen + && memEQ(s, (char*)folded, len) + + /* If 0, means matched a partial char. See + * [perl #90536] */ + && map_fold_len_back[len]) { /* Advance the target string ptr to account for * this fold, but have to translate from the * folded length to the corresponding source - * length. The array is indexed by how many - * characters in the match */ - *lenp = map_fold_len_back[ - utf8_length(folded, folded + len)]; + * length. */ + if (lenp) { + *lenp = map_fold_len_back[len]; + } match = TRUE; break; } } } -#if 0 - if (!match) { /* See if the folded version matches */ - SV** listp; - - /* Consider "k" =~ /[K]/i. The line above would have - * just folded the 'k' to itself, and that isn't going - * to match 'K'. So we look through the closure of - * everything that folds to 'k'. That will find the - * 'K'. Initialize the list, if necessary */ - if (! PL_utf8_foldclosures) { - - /* If the folds haven't been read in, call a fold - * function to force that */ - if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; - STRLEN dummy_len; - to_utf8_fold((U8*) "A", dummy, &dummy_len); - } - PL_utf8_foldclosures = - _swash_inversion_hash(PL_utf8_tofold); - } - - /* The data structure is a hash with the keys every - * character that is folded to, like 'k', and the - * values each an array of everything that folds to its - * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ - if ((listp = hv_fetch(PL_utf8_foldclosures, - (char *) folded, foldlen, FALSE))) - { - AV* list = (AV*) *listp; - IV i; - for (i = 0; i <= av_len(list); i++) { - SV** try_p = av_fetch(list, i, FALSE); - char* try_c; - if (try_p == NULL) { - Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); - } - /* Don't have to worry about embedded nulls - * since NULL isn't folded or foldable */ - try_c = SvPVX(*try_p); - - /* The fold in a few cases of an above Latin1 - * char is in the Latin1 range, and hence may - * be in the bitmap */ - if (UTF8_IS_INVARIANT(*try_c) - && ANYOF_BITMAP_TEST(n, - UNI_TO_NATIVE(*try_c))) - { - match = TRUE; - break; - } - else if - (UTF8_IS_DOWNGRADEABLE_START(*try_c) - && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE( - TWO_BYTE_UTF8_TO_UNI(try_c[0], - try_c[1])))) - { - /* Since the fold comes from internally - * generated data, we can safely assume it - * is valid utf8 in the test above */ - match = TRUE; - break; - } else if (swash_fetch(sw, (U8*) try_c, TRUE)) { - match = TRUE; - break; - } - } - } - } -#endif } /* If we allocated a string above, free it */ @@ -6877,6 +7012,10 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, STATIC U8 * S_reghop3(U8 *s, I32 off, const U8* lim) { + /* return the position 'off' UTF-8 characters away from 's', forward if + * 'off' >= 0, backwards if negative. But don't go outside of position + * 'lim', which better be < s if off < 0 */ + dVAR; PERL_ARGS_ASSERT_REGHOP3; @@ -6967,7 +7106,7 @@ restore_pos(pTHX_ void *arg) { dVAR; regexp * const rex = (regexp *)arg; - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { if (PL_reg_oldsaved) { rex->subbeg = PL_reg_oldsaved; rex->sublen = PL_reg_oldsavedlen; @@ -6977,7 +7116,7 @@ restore_pos(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } PL_reg_magic->mg_len = PL_reg_oldpos; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_curpm = PL_reg_oldcurpm; } } @@ -6996,16 +7135,16 @@ S_to_utf8_substr(pTHX_ register regexp *prog) prog->substrs->data[i].utf8_substr = sv; sv_utf8_upgrade(sv); if (SvVALID(prog->substrs->data[i].substr)) { - const U8 flags = BmFLAGS(prog->substrs->data[i].substr); - if (flags & FBMcf_TAIL) { + if (SvTAIL(prog->substrs->data[i].substr)) { /* Trim the trailing \n that fbm_compile added last time. */ SvCUR_set(sv, SvCUR(sv) - 1); /* Whilst this makes the SV technically "invalid" (as its buffer is no longer followed by "\0") when fbm_compile() adds the "\n" back, a "\0" is restored. */ - } - fbm_compile(sv, flags); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); } if (prog->substrs->data[i].substr == prog->check_substr) prog->check_utf8 = sv; @@ -7027,15 +7166,14 @@ S_to_byte_substr(pTHX_ register regexp *prog) SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); if (sv_utf8_downgrade(sv, TRUE)) { if (SvVALID(prog->substrs->data[i].utf8_substr)) { - const U8 flags - = BmFLAGS(prog->substrs->data[i].utf8_substr); - if (flags & FBMcf_TAIL) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { /* Trim the trailing \n that fbm_compile added last time. */ SvCUR_set(sv, SvCUR(sv) - 1); - } - fbm_compile(sv, flags); - } + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } } else { SvREFCNT_dec(sv); sv = &PL_sv_undef; @@ -7051,8 +7189,8 @@ S_to_byte_substr(pTHX_ register regexp *prog) * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */