X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/8a90a8fee1032a1bdee2a164f8265ff160fe22f0..61dad979a56eaefa315dbe8b01c52f0cb2723105:/regexec.c diff --git a/regexec.c b/regexec.c index 4abb712..57f47ce 100644 --- a/regexec.c +++ b/regexec.c @@ -80,6 +80,9 @@ # include "regcomp.h" #endif +#include "inline_invlist.c" +#include "utf8_strings.h" + #define RF_tainted 1 /* tainted information used? e.g. locale */ #define RF_warned 2 /* warned about big count? */ @@ -87,9 +90,6 @@ #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) -#define RS_init 1 /* eval environment created */ -#define RS_set 2 /* replsv value is set */ - #ifndef STATIC #define STATIC static #endif @@ -122,23 +122,17 @@ #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) /* these are unrolled below in the CCC_TRY_XXX defined */ -#ifdef EBCDIC - /* Often 'str' is a hard-coded utf8 string instead of utfebcdic. so just - * skip the check on EBCDIC platforms */ -# define LOAD_UTF8_CHARCLASS(class,str) LOAD_UTF8_CHARCLASS_NO_CHECK(class) -#else -# define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ +#define LOAD_UTF8_CHARCLASS(class,str) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ bool ok; \ ENTER; save_re_context(); \ ok=CAT2(is_utf8_,class)((const U8*)str); \ + PERL_UNUSED_VAR(ok); \ assert(ok); assert(CAT2(PL_utf8_,class)); LEAVE; } } STMT_END -#endif - /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ if (!CAT2(PL_utf8_,class)) { \ - bool throw_away __attribute__unused__; \ + bool throw_away PERL_UNUSED_DECL; \ ENTER; save_re_context(); \ throw_away = CAT2(is_utf8_,class)((const U8*)" "); \ LEAVE; } } STMT_END @@ -148,21 +142,17 @@ #define LOAD_UTF8_CHARCLASS_SPACE() LOAD_UTF8_CHARCLASS(space," ") #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ - LOAD_UTF8_CHARCLASS(X_begin, " "); \ - LOAD_UTF8_CHARCLASS(X_non_hangul, "A"); \ - /* These are utf8 constants, and not utf-ebcdic constants, so the \ - * assert should likely and hopefully fail on an EBCDIC machine */ \ - LOAD_UTF8_CHARCLASS(X_extend, "\xcc\x80"); /* U+0300 */ \ - \ - /* No asserts are done for these, in case called on an early \ - * Unicode version in which they map to nothing */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* U+0E40 "\xe0\xb9\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_L); /* U+1100 "\xe1\x84\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV); /* U+AC00 "\xea\xb0\x80" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LVT); /* U+AC01 "\xea\xb0\x81" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_LV_LVT_V);/* U+AC01 "\xea\xb0\x81" */\ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_T); /* U+11A8 "\xe1\x86\xa8" */ \ - LOAD_UTF8_CHARCLASS_NO_CHECK(X_V) /* U+1160 "\xe1\x85\xa0" */ + /* No asserts are done for some of these, in case called on a */ \ + /* Unicode version in which they map to nothing */ \ + LOAD_UTF8_CHARCLASS(X_begin, HYPHEN_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_special_begin); \ + LOAD_UTF8_CHARCLASS(X_extend, COMBINING_GRAVE_ACCENT_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_prepend);/* empty in most releases*/ \ + LOAD_UTF8_CHARCLASS(X_L, HANGUL_CHOSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_LV_LVT_V, HANGUL_JUNGSEONG_FILLER_UTF8); \ + LOAD_UTF8_CHARCLASS_NO_CHECK(X_RI); /* empty in many releases */ \ + LOAD_UTF8_CHARCLASS(X_T, HANGUL_JONGSEONG_KIYEOK_UTF8); \ + LOAD_UTF8_CHARCLASS(X_V, HANGUL_JUNGSEONG_FILLER_UTF8) #define PLACEHOLDER /* Something for the preprocessor to grab onto */ @@ -303,13 +293,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTFA || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -335,25 +325,28 @@ static void restore_pos(pTHX_ void *arg); -#define REGCP_PAREN_ELEMS 4 -#define REGCP_OTHER_ELEMS 5 +#define REGCP_PAREN_ELEMS 3 +#define REGCP_OTHER_ELEMS 3 #define REGCP_FRAME_ELEMS 1 /* REGCP_FRAME_ELEMS are not part of the REGCP_OTHER_ELEMS and * are needed for the regexp context stack bookkeeping. */ STATIC CHECKPOINT -S_regcppush(pTHX_ I32 parenfloor) +S_regcppush(pTHX_ const regexp *rex, I32 parenfloor) { dVAR; const int retval = PL_savestack_ix; const int paren_elems_to_push = (PL_regsize - parenfloor) * REGCP_PAREN_ELEMS; const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; - int p; + I32 p; GET_RE_DEBUG_FLAGS_DECL; + PERL_ARGS_ASSERT_REGCPPUSH; + if (paren_elems_to_push < 0) - Perl_croak(aTHX_ "panic: paren_elems_to_push < 0"); + Perl_croak(aTHX_ "panic: paren_elems_to_push, %i < 0", + paren_elems_to_push); if ((elems_shifted >> SAVE_TIGHT_SHIFT) != total_elems) Perl_croak(aTHX_ "panic: paren_elems_to_push offset %"UVuf @@ -362,25 +355,31 @@ S_regcppush(pTHX_ I32 parenfloor) SSGROW(total_elems + REGCP_FRAME_ELEMS); - for (p = PL_regsize; p > parenfloor; p--) { + DEBUG_BUFFERS_r( + if ((int)PL_regsize > (int)parenfloor) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": saving capture indices:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + for (p = parenfloor+1; p <= (I32)PL_regsize; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(PL_regoffs[p].end); - SSPUSHINT(PL_regoffs[p].start); - SSPUSHPTR(PL_reg_start_tmp[p]); - SSPUSHINT(p); + SSPUSHINT(rex->offs[p].end); + SSPUSHINT(rex->offs[p].start); + SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, - " saving \\%"UVuf" %"IVdf"(%"IVdf")..%"IVdf"\n", - (UV)p, (IV)PL_regoffs[p].start, - (IV)(PL_reg_start_tmp[p] - PL_bostr), - (IV)PL_regoffs[p].end + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", + (UV)p, + (IV)rex->offs[p].start, + (IV)rex->offs[p].start_tmp, + (IV)rex->offs[p].end )); } /* REGCP_OTHER_ELEMS are pushed in any case, parentheses or no. */ - SSPUSHPTR(PL_regoffs); SSPUSHINT(PL_regsize); - SSPUSHINT(*PL_reglastparen); - SSPUSHINT(*PL_reglastcloseparen); - SSPUSHPTR(PL_reginput); + SSPUSHINT(rex->lastparen); + SSPUSHINT(rex->lastcloseparen); SSPUSHUV(SAVEt_REGCONTEXT | elems_shifted); /* Magic cookie. */ return retval; @@ -402,12 +401,19 @@ S_regcppush(pTHX_ I32 parenfloor) (IV)(cp), (IV)PL_savestack_ix)); \ regcpblow(cp) -STATIC char * -S_regcppop(pTHX_ const regexp *rex) +#define UNWIND_PAREN(lp, lcp) \ + for (n = rex->lastparen; n > lp; n--) \ + rex->offs[n].end = -1; \ + rex->lastparen = n; \ + rex->lastcloseparen = lcp; + + +STATIC void +S_regcppop(pTHX_ regexp *rex) { dVAR; UV i; - char *input; + U32 paren; GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_REGCPPOP; @@ -416,38 +422,38 @@ S_regcppop(pTHX_ const regexp *rex) i = SSPOPUV; assert((i & SAVE_MASK) == SAVEt_REGCONTEXT); /* Check that the magic cookie is there. */ i >>= SAVE_TIGHT_SHIFT; /* Parentheses elements to pop. */ - input = (char *) SSPOPPTR; - *PL_reglastcloseparen = SSPOPINT; - *PL_reglastparen = SSPOPINT; + rex->lastcloseparen = SSPOPINT; + rex->lastparen = SSPOPINT; PL_regsize = SSPOPINT; - PL_regoffs=(regexp_paren_pair *) SSPOPPTR; i -= REGCP_OTHER_ELEMS; /* Now restore the parentheses context. */ + DEBUG_BUFFERS_r( + if (i || rex->lastparen + 1 <= rex->nparens) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": restoring capture indices to:\n", + PTR2UV(rex), + PTR2UV(rex->offs) + ); + ); + paren = PL_regsize; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { I32 tmps; - U32 paren = (U32)SSPOPINT; - PL_reg_start_tmp[paren] = (char *) SSPOPPTR; - PL_regoffs[paren].start = SSPOPINT; + rex->offs[paren].start_tmp = SSPOPINT; + rex->offs[paren].start = SSPOPINT; tmps = SSPOPINT; - if (paren <= *PL_reglastparen) - PL_regoffs[paren].end = tmps; - DEBUG_BUFFERS_r( - PerlIO_printf(Perl_debug_log, - " restoring \\%"UVuf" to %"IVdf"(%"IVdf")..%"IVdf"%s\n", - (UV)paren, (IV)PL_regoffs[paren].start, - (IV)(PL_reg_start_tmp[paren] - PL_bostr), - (IV)PL_regoffs[paren].end, - (paren > *PL_reglastparen ? "(no)" : "")); + if (paren <= rex->lastparen) + rex->offs[paren].end = tmps; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"%s\n", + (UV)paren, + (IV)rex->offs[paren].start, + (IV)rex->offs[paren].start_tmp, + (IV)rex->offs[paren].end, + (paren > rex->lastparen ? "(skipped)" : "")); ); + paren--; } - DEBUG_BUFFERS_r( - if (*PL_reglastparen + 1 <= rex->nparens) { - PerlIO_printf(Perl_debug_log, - " restoring \\%"IVdf"..\\%"IVdf" to undef\n", - (IV)(*PL_reglastparen + 1), (IV)rex->nparens); - } - ); #if 1 /* It would seem that the similar code in regtry() * already takes care of this, and in fact it is in @@ -458,13 +464,29 @@ S_regcppop(pTHX_ const regexp *rex) * this code seems to be necessary or otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * --jhi updated by dapm */ - for (i = *PL_reglastparen + 1; i <= rex->nparens; i++) { + for (i = rex->lastparen + 1; i <= rex->nparens; i++) { if (i > PL_regsize) - PL_regoffs[i].start = -1; - PL_regoffs[i].end = -1; + rex->offs[i].start = -1; + rex->offs[i].end = -1; + DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, + " \\%"UVuf": %s ..-1 undeffing\n", + (UV)i, + (i > PL_regsize) ? "-1" : " " + )); } #endif - return input; +} + +/* restore the parens and associated vars at savestack position ix, + * but without popping the stack */ + +STATIC void +S_regcp_restore(pTHX_ regexp *rex, I32 ix) +{ + I32 tmpix = PL_savestack_ix; + PL_savestack_ix = ix; + regcppop(rex); + PL_savestack_ix = tmpix; } #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ @@ -549,17 +571,18 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { dVAR; struct regexp *const prog = (struct regexp *)SvANY(rx); - register I32 start_shift = 0; + I32 start_shift = 0; /* Should be nonnegative! */ - register I32 end_shift = 0; - register char *s; - register SV *check; + I32 end_shift = 0; + char *s; + SV *check; char *strbeg; char *t; const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ I32 ml_anch; - register char *other_last = NULL; /* other substr checked before this */ + char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ + char *checked_upto = NULL; /* how far into the string we have already checked using find_byclass*/ const I32 multiline = prog->extflags & RXf_PMf_MULTILINE; RXi_GET_DECL(prog,progi); #ifdef DEBUGGING @@ -568,6 +591,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, GET_RE_DEBUG_FLAGS_DECL; PERL_ARGS_ASSERT_RE_INTUIT_START; + PERL_UNUSED_ARG(flags); + PERL_UNUSED_ARG(data); RX_MATCH_UTF8_set(rx,utf8_target); @@ -686,6 +711,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { I32 srch_start_shift = start_shift; I32 srch_end_shift = end_shift; + U8* start_point; + U8* end_point; if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { srch_end_shift -= ((strbeg - s) - srch_start_shift); srch_start_shift = strbeg - s; @@ -698,42 +725,6 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, (IV)prog->check_end_shift); }); - if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) { - I32 p = -1; /* Internal iterator of scream. */ - I32 * const pp = data ? data->scream_pos : &p; - const MAGIC *mg; - bool found = FALSE; - - assert(SvMAGICAL(sv)); - mg = mg_find(sv, PERL_MAGIC_study); - assert(mg); - - if (mg->mg_private == 1) { - found = ((U8 *)mg->mg_ptr)[BmRARE(check)] != (U8)~0; - } else if (mg->mg_private == 2) { - found = ((U16 *)mg->mg_ptr)[BmRARE(check)] != (U16)~0; - } else { - assert (mg->mg_private == 4); - found = ((U32 *)mg->mg_ptr)[BmRARE(check)] != (U32)~0; - } - - if (found - || ( BmRARE(check) == '\n' - && (BmPREVIOUS(check) == SvCUR(check) - 1) - && SvTAIL(check) )) - s = screaminstr(sv, check, - srch_start_shift + (s - strbeg), srch_end_shift, pp, 0); - else - goto fail_finish; - /* we may be pointing at the wrong string */ - if (s && RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); - if (data) - *data->scream_olds = s; - } - else { - U8* start_point; - U8* end_point; if (prog->extflags & RXf_CANY_SEEN) { start_point= (U8*)(s + srch_start_shift); end_point= (U8*)(strend - srch_end_shift); @@ -751,7 +742,6 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, s = fbm_instr( start_point, end_point, check, multiline ? FBMrf_MULTILINE : 0); } - } /* Update the count-of-usability, remove useless subpatterns, unshift s. */ @@ -1089,12 +1079,16 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, else endpos= strend; - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf"\n", - (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg))); - + if (checked_upto < s) + checked_upto = s; + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" s: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), (IV)(s - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); + t = s; - s = find_byclass(prog, progi->regstclass, s, endpos, NULL); - if (!s) { + s = find_byclass(prog, progi->regstclass, checked_upto, endpos, NULL); + if (s) { + checked_upto = s; + } else { #ifdef DEBUGGING const char *what = NULL; #endif @@ -1107,6 +1101,9 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, "This position contradicts STCLASS...\n") ); if ((prog->extflags & RXf_ANCH) && !ml_anch) goto fail; + checked_upto = HOPBACKc(endpos, start_shift); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "start_shift: %"IVdf" check_at: %"IVdf" endpos: %"IVdf" checked_upto: %"IVdf"\n", + (IV)start_shift, (IV)(check_at - strbeg), (IV)(endpos - strbeg), (IV)(checked_upto- strbeg))); /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { @@ -1186,58 +1183,61 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ - trie_type = (scan->flags != EXACT) \ - ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ - : (utf8_target ? trie_utf8 : trie_plain) - -#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ -uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ - switch (trie_type) { \ - case trie_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_latin_utf8_fold: \ - if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( uscan, UTF8_MAXLEN, &len, uniflags ); \ - foldlen -= len; \ - uscan += len; \ - len=0; \ - } else { \ - len = 1; \ - uvc = to_uni_fold( *(U8*)uc, foldbuf, &foldlen ); \ - foldlen -= UNISKIP( uvc ); \ - uscan = foldbuf + UNISKIP( uvc ); \ - } \ - break; \ - case trie_utf8: \ - uvc = utf8n_to_uvuni( (U8*)uc, UTF8_MAXLEN, &len, uniflags ); \ - break; \ - case trie_plain: \ - uvc = (UV)*uc; \ - len = 1; \ - } \ - if (uvc < 256) { \ - charid = trie->charmap[ uvc ]; \ - } \ - else { \ - charid = 0; \ - if (widecharmap) { \ - SV** const svpp = hv_fetch(widecharmap, \ - (char*)&uvc, sizeof(UV), 0); \ - if (svpp) \ - charid = (U16)SvIV(*svpp); \ - } \ - } \ + trie_type = ((scan->flags == EXACT) \ + ? (utf8_target ? trie_utf8 : trie_plain) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + +#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ +uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ + STRLEN skiplen; \ + switch (trie_type) { \ + case trie_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + len = UTF8SKIP(uc); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_latin_utf8_fold: \ + if ( foldlen>0 ) { \ + uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + foldlen -= len; \ + uscan += len; \ + len=0; \ + } else { \ + len = 1; \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \ + skiplen = UNISKIP( uvc ); \ + foldlen -= skiplen; \ + uscan = foldbuf + skiplen; \ + } \ + break; \ + case trie_utf8: \ + uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + break; \ + case trie_plain: \ + uvc = (UV)*uc; \ + len = 1; \ + } \ + if (uvc < 256) { \ + charid = trie->charmap[ uvc ]; \ + } \ + else { \ + charid = 0; \ + if (widecharmap) { \ + SV** const svpp = hv_fetch(widecharmap, \ + (char*)&uvc, sizeof(UV), 0); \ + if (svpp) \ + charid = (U16)SvIV(*svpp); \ + } \ + } \ } STMT_END #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ @@ -1421,12 +1421,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const U8 *fold_array; /* array for folding ords < 256 */ STRLEN ln; STRLEN lnc; - register STRLEN uskip; + STRLEN uskip; U8 c1; U8 c2; char *e; - register I32 tmp = 1; /* Scratch variable? */ - register const bool utf8_target = PL_reg_match_utf8; + I32 tmp = 1; /* Scratch variable? */ + const bool utf8_target = PL_reg_match_utf8; UV utf8_fold_flags = 0; RXi_GET_DECL(prog,progi); @@ -1464,7 +1464,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, goto do_exactf_non_utf8; /* isn't dealt with by these */ case EXACTF: - if (UTF_PATTERN || utf8_target) { + if (utf8_target) { + + /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1481,9 +1483,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_locale; goto do_exactf_non_utf8; + case EXACTFU_SS: + if (UTF_PATTERN) { + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + } + goto do_exactf_utf8; + + case EXACTFU_TRICKYFOLD: case EXACTFU: if (UTF_PATTERN || utf8_target) { - utf8_fold_flags = 0; + utf8_fold_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; goto do_exactf_utf8; } @@ -1495,7 +1504,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* FALL THROUGH */ - do_exactf_non_utf8: /* Neither pattern nor string are UTF8 */ + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ /* The idea in the non-utf8 EXACTF* cases is to first find the * first character of the EXACTF* node and then, if necessary, @@ -1528,6 +1539,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; do_exactf_utf8: + { + unsigned expansion; + /* If one of the operands is in utf8, we can't use the simpler * folding above, due to the fact that many different characters @@ -1540,13 +1554,35 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ? utf8_length((U8 *) pat_string, (U8 *) pat_end) : ln; - /* Set the end position to the final character available */ - e = HOP3c(strend, -1, s); + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the + * only multi-char fold in that range is the sharp-s folding to + * 'ss'. Thus, a pattern character can match as little as 1/3 of a + * string character. Adjust lnc accordingly, rounding up, so that + * if we need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a + * match that would require us to go beyond the end of the string + */ + e = HOP3c(strend, -((I32)lnc), s); if (!reginfo && e < s) { e = s; /* Due to minlen logic of intuit() */ } + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. Unclear if + * worth the expense */ + while (s <= e) { char *my_strend= (char *)strend; if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, @@ -1555,9 +1591,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, { goto got_it; } - s += UTF8SKIP(s); + s += (utf8_target) ? UTF8SKIP(s) : 1; } break; + } case BOUNDL: PL_reg_flags |= RF_tainted; FBC_BOUND(isALNUM_LC, @@ -1769,6 +1806,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, !is_HORIZWS_latin1(s) ); break; + case POSIXA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character. The flag in this node type is the + * class number to pass to _generic_isCC() to build a mask for + * searching in PL_charclass[] */ + REXEC_FBC_CLASS_SCAN( _generic_isCC_A(*s, FLAGS(c))); + break; + case NPOSIXA: + REXEC_FBC_CSCAN( + !_generic_isCC_A(*s, FLAGS(c)), + !_generic_isCC_A(*s, FLAGS(c)) + ); + break; + case AHOCORASICKC: case AHOCORASICK: { @@ -2011,7 +2062,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dVAR; struct regexp *const prog = (struct regexp *)SvANY(rx); /*register*/ char *s; - register regnode *c; + regnode *c; /*register*/ char *startpos = stringarg; I32 minlen; /* must match at least this many chars */ I32 dontbother = 0; /* how many characters not to try at end */ @@ -2058,7 +2109,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre } PL_reg_flags = 0; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_reg_maxiter = 0; if (RX_UTF8(rx)) @@ -2124,6 +2175,12 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre swap = prog->offs; /* do we need a save destructor here for eval dies? */ Newxz(prog->offs, (prog->nparens + 1), regexp_paren_pair); + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" saving offs: orig=0x%"UVxf" new=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap), + PTR2UV(prog->offs) + )); } if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { re_scream_pos_data d; @@ -2207,8 +2264,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ s--; } - /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ - while (s < end) { + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ + while (s <= end) { /* note it could be possible to match at the end of the string */ if (*s++ == '\n') { /* don't need PL_utf8skip here */ if (regtry(®info, &s)) goto got_it; @@ -2316,15 +2373,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = end_shift; strend = HOPc(strend, -dontbother); while ( (s <= last) && - ((flags & REXEC_SCREAM) && SvSCREAM(sv) - ? (s = screaminstr(sv, must, HOP3c(s, back_min, (back_min<0 ? strbeg : strend)) - strbeg, - end_shift, &scream_pos, 0)) - : (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), + (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), (unsigned char*)strend, must, - multiline ? FBMrf_MULTILINE : 0))) ) { - /* we may be pointing at the wrong string */ - if ((flags & REXEC_SCREAM) && RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); + multiline ? FBMrf_MULTILINE : 0)) ) { DEBUG_EXECUTE_r( did_match = 1 ); if (HOPc(s, -back_max) > last1) { last1 = HOPc(s, -back_min); @@ -2388,48 +2439,80 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = 0; if (prog->float_substr != NULL || prog->float_utf8 != NULL) { /* Trim the end. */ - char *last; + char *last= NULL; SV* float_real; + STRLEN len; + const char *little; if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); float_real = utf8_target ? prog->float_utf8 : prog->float_substr; - if ((flags & REXEC_SCREAM) && SvSCREAM(sv)) { - last = screaminstr(sv, float_real, s - strbeg, - end_shift, &scream_pos, 1); /* last one */ - if (!last) - last = scream_olds; /* Only one occurrence. */ - /* we may be pointing at the wrong string */ - else if (RXp_MATCH_COPIED(prog)) - s = strbeg + (s - SvPVX_const(sv)); - } - else { - STRLEN len; - const char * const little = SvPV_const(float_real, len); - - if (SvTAIL(float_real)) { - if (memEQ(strend - len + 1, little, len - 1)) - last = strend - len + 1; - else if (!multiline) - last = memEQ(strend - len, little, len) - ? strend - len : NULL; - else + little = SvPV_const(float_real, len); + if (SvTAIL(float_real)) { + /* This means that float_real contains an artificial \n on the end + * due to the presence of something like this: /foo$/ + * where we can match both "foo" and "foo\n" at the end of the string. + * So we have to compare the end of the string first against the float_real + * without the \n and then against the full float_real with the string. + * We have to watch out for cases where the string might be smaller + * than the float_real or the float_real without the \n. + */ + char *checkpos= strend - len; + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%sChecking for float_real.%s\n", + PL_colors[4], PL_colors[5])); + if (checkpos + 1 < strbeg) { + /* can't match, even if we remove the trailing \n string is too short to match */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString shorter than required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (memEQ(checkpos + 1, little, len - 1)) { + /* can match, the end of the string matches without the "\n" */ + last = checkpos + 1; + } else if (checkpos < strbeg) { + /* cant match, string is too short when the "\n" is included */ + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } else if (!multiline) { + /* non multiline match, so compare with the "\n" at the end of the string */ + if (memEQ(checkpos, little, len)) { + last= checkpos; + } else { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%sString does not contain required trailing substring, cannot match.%s\n", + PL_colors[4], PL_colors[5])); + goto phooey; + } + } else { + /* multiline match, so we have to search for a place where the full string is located */ goto find_last; - } else { + } + } else { find_last: if (len) last = rninstr(s, strend, little, little + len); else last = strend; /* matching "$" */ - } } - if (last == NULL) { + if (!last) { + /* at one point this block contained a comment which was probably + * incorrect, which said that this was a "should not happen" case. + * Even if it was true when it was written I am pretty sure it is + * not anymore, so I have removed the comment and replaced it with + * this one. Yves */ DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "%sCan't trim the tail, match fails (should not happen)%s\n", - PL_colors[4], PL_colors[5])); - goto phooey; /* Should not happen! */ + "String does not contain required substring, cannot match.\n" + )); + goto phooey; } dontbother = strend - last + prog->float_min_offset; } @@ -2458,10 +2541,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre goto phooey; got_it: + DEBUG_BUFFERS_r( + if (swap) + PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" freeing offs: 0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(swap) + ); + ); Safefree(swap); RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (RXp_PAREN_NAMES(prog)) (void)hv_iterinit(RXp_PAREN_NAMES(prog)); @@ -2502,10 +2593,16 @@ got_it: phooey: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) + if (PL_reg_state.re_state_eval_setup_done) restore_pos(aTHX_ prog); if (swap) { /* we failed :-( roll it back */ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" rolling back offs: freeing=0x%"UVxf" restoring=0x%"UVxf"\n", + PTR2UV(prog), + PTR2UV(prog->offs), + PTR2UV(swap) + )); Safefree(prog->offs); prog->offs = swap; } @@ -2514,6 +2611,16 @@ phooey: } +/* Set which rex is pointed to by PL_reg_state, handling ref counting. + * Do inc before dec, in case old and new rex are the same */ +#define SET_reg_curpm(Re2) \ + if (PL_reg_state.re_state_eval_setup_done) { \ + (void)ReREFCNT_inc(Re2); \ + ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); \ + PM_SETRE((PL_reg_curpm), (Re2)); \ + } + + /* - regtry - try match at specific point */ @@ -2531,22 +2638,12 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) reginfo->cutpoint=NULL; - if ((prog->extflags & RXf_EVAL_SEEN) && !PL_reg_eval_set) { + if ((prog->extflags & RXf_EVAL_SEEN) + && !PL_reg_state.re_state_eval_setup_done) + { MAGIC *mg; - PL_reg_eval_set = RS_init; - DEBUG_EXECUTE_r(DEBUG_s( - PerlIO_printf(Perl_debug_log, " setting stack tmpbase at %"IVdf"\n", - (IV)(PL_stack_sp - PL_stack_base)); - )); - SAVESTACK_CXPOS(); - cxstack[cxstack_ix].blk_oldsp = PL_stack_sp - PL_stack_base; - /* Otherwise OP_NEXTSTATE will free whatever on stack now. */ - SAVETMPS; - /* Apparently this is not needed, judging by wantarray. */ - /* SAVEI8(cxstack[cxstack_ix].blk_gimme); - cxstack[cxstack_ix].blk_gimme = G_SCALAR; */ - + PL_reg_state.re_state_eval_setup_done = TRUE; if (reginfo->sv) { /* Make $_ available to executed code. */ if (reginfo->sv != DEFSV) { @@ -2582,16 +2679,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) } #endif } -#ifdef USE_ITHREADS - /* It seems that non-ithreads works both with and without this code. - So for efficiency reasons it seems best not to have the code - compiled when it is not needed. */ - /* This is safe against NULLs: */ - ReREFCNT_dec(PM_GETRE(PL_reg_curpm)); - /* PM_reg_curpm owns a reference to this regexp. */ - (void)ReREFCNT_inc(rx); -#endif - PM_SETRE(PL_reg_curpm, rx); + SET_reg_curpm(rx); PL_reg_oldcurpm = PL_curpm; PL_curpm = PL_reg_curpm; if (RXp_MATCH_COPIED(prog)) { @@ -2610,30 +2698,22 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) prog->subbeg = PL_bostr; prog->sublen = PL_regeol - PL_bostr; /* strend may have been modified */ } - DEBUG_EXECUTE_r(PL_reg_starttry = *startpos); +#ifdef DEBUGGING + PL_reg_starttry = *startpos; +#endif prog->offs[0].start = *startpos - PL_bostr; PL_reginput = *startpos; - PL_reglastparen = &prog->lastparen; - PL_reglastcloseparen = &prog->lastcloseparen; prog->lastparen = 0; prog->lastcloseparen = 0; PL_regsize = 0; - PL_regoffs = prog->offs; - if (PL_reg_start_tmpl <= prog->nparens) { - PL_reg_start_tmpl = prog->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } /* XXXX What this code is doing here?!!! There should be no need - to do this again and again, PL_reglastparen should take care of + to do this again and again, prog->lastparen should take care of this! --ilya*/ /* Tests pat.t#187 and split.t#{13,14} seem to depend on this code. * Actually, the code in regcppop() (which Ilya may be meaning by - * PL_reglastparen), is not needed at all by the test suite + * prog->lastparen), is not needed at all by the test suite * (op/regexp, op/pat, op/split), but that code is needed otherwise * this erroneously leaves $1 defined: "1" =~ /^(?:(\d)x)?\d$/ * Meanwhile, this code *is* needed for the @@ -2642,9 +2722,9 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) * --jhi updated by dapm */ #if 1 if (prog->nparens) { - regexp_paren_pair *pp = PL_regoffs; - register I32 i; - for (i = prog->nparens; i > (I32)*PL_reglastparen; i--) { + regexp_paren_pair *pp = prog->offs; + I32 i; + for (i = prog->nparens; i > (I32)prog->lastparen; i--) { ++pp; pp->start = -1; pp->end = -1; @@ -2653,7 +2733,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startpos) #endif REGCP_SET(lastcp); if (regmatch(reginfo, progi->program + 1)) { - PL_regoffs[0].end = PL_reginput - PL_bostr; + prog->offs[0].end = PL_reginput - PL_bostr; return 1; } if (reginfo->cutpoint) @@ -2989,8 +3069,8 @@ S_reg_check_named_buff_matched(pTHX_ const regexp *rex, const regnode *scan) PERL_ARGS_ASSERT_REG_CHECK_NAMED_BUFF_MATCHED; for ( n=0; n= nums[n] && - PL_regoffs[nums[n]].end != -1) + if ((I32)rex->lastparen >= nums[n] && + rex->offs[nums[n]].end != -1) { return nums[n]; } @@ -3018,10 +3098,6 @@ S_clear_backtrack_stack(pTHX_ void *p) } -#define SETREX(Re1,Re2) \ - if (PL_reg_eval_set) PM_SETRE((PL_reg_curpm), (Re2)); \ - Re1 = (Re2) - STATIC I32 /* 0 failure, 1 success */ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { @@ -3029,21 +3105,21 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) dMY_CXT; #endif dVAR; - register const bool utf8_target = PL_reg_match_utf8; + const bool utf8_target = PL_reg_match_utf8; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; regexp *rex = (struct regexp *)SvANY(rex_sv); RXi_GET_DECL(rex,rexi); I32 oldsave; /* the current state. This is a cached copy of PL_regmatch_state */ - register regmatch_state *st; + regmatch_state *st; /* cache heavy used fields of st in registers */ - register regnode *scan; - register regnode *next; - register U32 n = 0; /* general value; init to avoid compiler warning */ - register I32 ln = 0; /* len or last; init to avoid compiler warning */ - register char *locinput = PL_reginput; - register I32 nextchr; /* is always set to UCHARAT(locinput) */ + regnode *scan; + regnode *next; + U32 n = 0; /* general value; init to avoid compiler warning */ + I32 ln = 0; /* len or last; init to avoid compiler warning */ + char *locinput = PL_reginput; + I32 nextchr; /* is always set to UCHARAT(locinput) */ bool result = 0; /* return value of S_regmatch */ int depth = 0; /* depth of backtrack stack */ @@ -3083,10 +3159,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) false: plain (?=foo) true: used as a condition: (?(?=foo)) */ + PAD* last_pad = NULL; + dMULTICALL; + I32 gimme = G_SCALAR; + CV *caller_cv = NULL; /* who called us */ + CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ + CHECKPOINT runops_cp; /* savestack position before executing EVAL */ + #ifdef DEBUGGING GET_RE_DEBUG_FLAGS_DECL; #endif + /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ + multicall_oldcatch = 0; + multicall_cv = NULL; + cx = NULL; + PERL_UNUSED_VAR(multicall_cop); + PERL_UNUSED_VAR(newsp); + + PERL_ARGS_ASSERT_REGMATCH; DEBUG_OPTIMISE_r( DEBUG_EXECUTE_r({ @@ -3136,10 +3227,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) reenter_switch: - assert(PL_reglastparen == &rex->lastparen); - assert(PL_reglastcloseparen == &rex->lastcloseparen); - assert(PL_regoffs == rex->offs); - switch (state_num) { case BOL: if (locinput == PL_bostr) @@ -3166,14 +3253,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case KEEPS: /* update the startpoint */ - st->u.keeper.val = PL_regoffs[0].start; + st->u.keeper.val = rex->offs[0].start; PL_reginput = locinput; - PL_regoffs[0].start = locinput - PL_bostr; + rex->offs[0].start = locinput - PL_bostr; PUSH_STATE_GOTO(KEEPS_next, next); /*NOT-REACHED*/ case KEEPS_next_fail: /* rollback the start point change */ - PL_regoffs[0].start = st->u.keeper.val; + rex->offs[0].start = st->u.keeper.val; sayNO_SILENT; /*NOT-REACHED*/ case EOL: @@ -3229,16 +3316,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !utf8_target) { - if(!ANYOF_BITMAP_TEST(scan, *locinput)) { - DEBUG_EXECUTE_r( - PerlIO_printf(Perl_debug_log, - "%*s %sfailed to match trie start class...%s\n", - REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) - ); - sayNO_SILENT; - /* NOTREACHED */ - } + if(!ANYOF_BITMAP_TEST(scan, *locinput)) { + DEBUG_EXECUTE_r( + PerlIO_printf(Perl_debug_log, + "%*s %sfailed to match trie start class...%s\n", + REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) + ); + sayNO_SILENT; + assert(0); /* NOTREACHED */ } /* FALL THROUGH */ case TRIE: @@ -3296,9 +3381,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]); U32 state = trie->startstate; - if (trie->bitmap && trie_type != trie_utf8_fold && - !TRIE_BITMAP_TEST(trie,*locinput) - ) { + if (trie->bitmap && !TRIE_BITMAP_TEST(trie,*locinput) ) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3327,7 +3410,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U32 charcount = 0; /* how many input chars we have matched */ U32 accepted = 0; /* have we seen any accepting states? */ - ST.B = next; ST.jump = trie->jump; ST.me = scan; ST.firstpos = NULL; @@ -3426,14 +3508,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ); goto trie_first_try; /* jump into the fail handler */ }} - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case TRIE_next_fail: /* we failed - try next alternative */ if ( ST.jump) { REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); } if (!--ST.accepted) { DEBUG_EXECUTE_r({ @@ -3448,10 +3528,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) { /* Find next-highest word to process. Note that this code * is O(N^2) per trie run (O(N) per branch), so keep tight */ - register U16 min = 0; - register U16 word; - register U16 const nextword = ST.nextword; - register reg_trie_wordinfo * const wordinfo + U16 min = 0; + U16 word; + U16 const nextword = ST.nextword; + reg_trie_wordinfo * const wordinfo = ((reg_trie_data*)rexi->data->data[ARG(ST.me)])->wordinfo; for (word=ST.topword; word; word=wordinfo[word].prev) { if (word > nextword && (!min || word < min)) @@ -3467,7 +3547,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } if ( ST.jump) { - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; REGCP_SET(ST.cp); } @@ -3524,9 +3605,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PL_reginput = (char *)uc; } - scan = (ST.jump && ST.jump[ST.nextword]) - ? ST.me + ST.jump[ST.nextword] - : ST.B; + scan = ST.me + ((ST.jump && ST.jump[ST.nextword]) + ? ST.jump[ST.nextword] + : NEXT_OFF(ST.me)); DEBUG_EXECUTE_r({ PerlIO_printf( Perl_debug_log, @@ -3540,7 +3621,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (ST.accepted > 1 || has_cutgroup) { PUSH_STATE_GOTO(TRIE_next, scan); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* only one choice left - just continue */ DEBUG_EXECUTE_r({ @@ -3565,7 +3646,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) locinput = PL_reginput; nextchr = UCHARAT(locinput); continue; /* execute rest of RE */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST case EXACT: { @@ -3632,10 +3713,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) fold_utf8_flags = FOLDEQ_UTF8_LOCALE; goto do_exactf; + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: folder = foldEQ_latin1; fold_array = PL_fold_latin1; - fold_utf8_flags = 0; + fold_utf8_flags = (UTF_PATTERN) ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; case EXACTFA: @@ -3653,13 +3736,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) s = STRING(scan); ln = STR_LEN(scan); - if (utf8_target || UTF_PATTERN) { - /* Either target or the pattern are utf8. */ + if (utf8_target || UTF_PATTERN || state_num == EXACTFU_SS) { + /* Either target or the pattern are utf8, or has the issue where + * the fold lengths may differ. */ const char * const l = locinput; char *e = PL_regeol; if (! foldEQ_utf8_flags(s, 0, ln, cBOOL(UTF_PATTERN), - l, &e, 0, utf8_target, fold_utf8_flags)) + l, &e, 0, utf8_target, fold_utf8_flags)) { sayNO; } @@ -3803,6 +3887,26 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) DIGITA, NDIGITA, isDIGIT_A, digit, "0"); + case POSIXA: + if (locinput >= PL_regeol || ! _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + /* Matched a utf8-invariant, so don't have to worry about utf8 */ + nextchr = UCHARAT(++locinput); + break; + case NPOSIXA: + if (locinput >= PL_regeol || _generic_isCC_A(nextchr, FLAGS(scan))) { + sayNO; + } + if (utf8_target) { + locinput += PL_utf8skip[nextchr]; + nextchr = UCHARAT(locinput); + } + else { + nextchr = UCHARAT(++locinput); + } + break; + case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ /* From http://www.unicode.org/reports/tr29 (5.2 version). An @@ -3812,9 +3916,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) | Prepend* Begin Extend* | . - Begin is (Hangul-syllable | ! Control) - Extend is (Grapheme_Extend | Spacing_Mark) - Control is [ GCB_Control CR LF ] + Begin is: ( Special_Begin | ! Control ) + Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) + Extend is: ( Grapheme_Extend | Spacing_Mark ) + Control is: [ GCB_Control CR LF ] + Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) The discussion below shows how the code for CLUMP is derived from this regex. Note that most of these concepts are from @@ -3840,11 +3946,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) | (L* ( LVT | ( V | LV ) V*) T*) The last two terms can be combined like this: - L* ( L - | (( LVT | ( V | LV ) V*) T*)) - - And refactored into this: - L* (L | LVT T* | V V* T* | LV V* T*) + L* ( L | (( LVT | ( V | LV ) V*) T*)) That means that if we have seen any L's at all we can quit there, but if the next character is an LVT, a V, or an LV we @@ -3888,14 +3990,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) LOAD_UTF8_CHARCLASS_GCB(); - /* Match (prepend)* */ - while (locinput < PL_regeol - && swash_fetch(PL_utf8_X_prepend, - (U8*)locinput, utf8_target)) - { - previous_prepend = locinput; - locinput += UTF8SKIP(locinput); - } + /* Match (prepend)*, but don't bother trying if empty (as + * being set to _undef indicates) */ + if (PL_utf8_X_prepend != &PL_sv_undef) { + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_prepend, + (U8*)locinput, utf8_target)) + { + previous_prepend = locinput; + locinput += UTF8SKIP(locinput); + } + } /* As noted above, if we matched a prepend character, but * the next thing won't match, back off the last prepend we @@ -3922,21 +4027,32 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } else { /* Here is the beginning of a character that can have - * an extender. It is either a hangul syllable, or a - * non-control */ - if (swash_fetch(PL_utf8_X_non_hangul, + * an extender. It is either a special begin character + * that requires complicated handling, or a non-control + * */ + if (! swash_fetch(PL_utf8_X_special_begin, (U8*)locinput, utf8_target)) { - /* Here not a Hangul syllable, must be a + /* Here not a special begin, must be a * ('! * Control') */ locinput += UTF8SKIP(locinput); } else { - /* Here is a Hangul syllable. It can be composed - * of several individual characters. One - * possibility is T+ */ - if (swash_fetch(PL_utf8_X_T, + /* Here is a special begin. It can be composed + * of several individual characters. One + * possibility is RI+ */ + if (swash_fetch(PL_utf8_X_RI, + (U8*)locinput, utf8_target)) + { + while (locinput < PL_regeol + && swash_fetch(PL_utf8_X_RI, + (U8*)locinput, utf8_target)) + { + locinput += UTF8SKIP(locinput); + } + } else /* Another possibility is T+ */ + if (swash_fetch(PL_utf8_X_T, (U8*)locinput, utf8_target)) { while (locinput < PL_regeol @@ -3947,9 +4063,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { - /* Here, not T+, but is a Hangul. That means - * it is one of the others: L, LV, LVT or V, - * and matches: + /* Here, neither RI+ nor T+; must be some other + * Hangul. That means it is one of the others: + * L, LV, LVT or V, and matches: * L* (L | LVT T* | V V* T* | LV V* T*) */ /* Match L* */ @@ -3973,9 +4089,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Otherwise keep going. Must be LV, LVT * or V. See if LVT */ - if (swash_fetch(PL_utf8_X_LVT, - (U8*)locinput, utf8_target)) - { + if (is_utf8_X_LVT((U8*)locinput)) { locinput += UTF8SKIP(locinput); } else { @@ -4109,11 +4223,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) n = ARG(scan); /* which paren pair */ do_nref_ref_common: - ln = PL_regoffs[n].start; + ln = rex->offs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ - if (*PL_reglastparen < n || ln == -1) + if (rex->lastparen < n || ln == -1) sayNO; /* Do not match unless seen CLOSEn. */ - if (ln == PL_regoffs[n].end) + if (ln == rex->offs[n].end) break; s = PL_bostr + ln; @@ -4127,7 +4241,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * not going off the end given by PL_regeol, and returns in * limit upon success, how much of the current input was * matched */ - if (! foldEQ_utf8_flags(s, NULL, PL_regoffs[n].end - ln, utf8_target, + if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) { sayNO; @@ -4142,7 +4256,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) (type == REF || UCHARAT(s) != fold_array[nextchr])) sayNO; - ln = PL_regoffs[n].end - ln; + ln = rex->offs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF @@ -4183,7 +4297,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re_sv = rex_sv; re = rex; rei = rexi; - (void)ReREFCNT_inc(rex_sv); if (OP(scan)==GOSUB) { startpoint = scan + ARG2L(scan); ST.close_paren = ARG(scan); @@ -4192,7 +4305,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.close_paren = 0; } goto eval_recurse_doit; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ if (cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) @@ -4202,13 +4315,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } { /* execute the code in the {...} */ + dSP; - SV ** const before = SP; - OP_4tree * const oop = PL_op; + SV ** before; + OP * const oop = PL_op; COP * const ocurcop = PL_curcop; - PAD *old_comppad; + OP *nop; char *saved_regeol = PL_regeol; struct re_save_state saved_state; + CV *newcv; + + /* save *all* paren positions */ + regcppush(rex, 0); + REGCP_SET(runops_cp); /* To not corrupt the existing regex state while executing the * eval we would normally put it on the save stack, like with @@ -4226,24 +4345,99 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) */ Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + PL_reg_state.re_reparsing = FALSE; + + if (!caller_cv) + caller_cv = find_runcv(NULL); + n = ARG(scan); - PL_op = (OP_4tree*)rexi->data->data[n]; + + if (rexi->data->what[n] == 'r') { /* code from an external qr */ + newcv = ((struct regexp *)SvANY( + (REGEXP*)(rexi->data->data[n]) + ))->qr_anoncv + ; + nop = (OP*)rexi->data->data[n+1]; + } + else if (rexi->data->what[n] == 'l') { /* literal code */ + newcv = caller_cv; + nop = (OP*)rexi->data->data[n]; + assert(CvDEPTH(newcv)); + } + else { + /* literal with own CV */ + assert(rexi->data->what[n] == 'L'); + newcv = rex->qr_anoncv; + nop = (OP*)rexi->data->data[n]; + } + + /* normally if we're about to execute code from the same + * CV that we used previously, we just use the existing + * CX stack entry. However, its possible that in the + * meantime we may have backtracked, popped from the save + * stack, and undone the SAVECOMPPAD(s) associated with + * PUSH_MULTICALL; in which case PL_comppad no longer + * points to newcv's pad. */ + if (newcv != last_pushed_cv || PL_comppad != last_pad) + { + I32 depth = (newcv == caller_cv) ? 0 : 1; + if (last_pushed_cv) { + CHANGE_MULTICALL_WITHDEPTH(newcv, depth); + } + else { + PUSH_MULTICALL_WITHDEPTH(newcv, depth); + } + last_pushed_cv = newcv; + } + last_pad = PL_comppad; + + /* the initial nextstate you would normally execute + * at the start of an eval (which would cause error + * messages to come from the eval), may be optimised + * away from the execution path in the regex code blocks; + * so manually set PL_curcop to it initially */ + { + OP *o = cUNOPx(nop)->op_first; + assert(o->op_type == OP_NULL); + if (o->op_targ == OP_SCOPE) { + o = cUNOPo->op_first; + } + else { + assert(o->op_targ == OP_LEAVE); + o = cUNOPo->op_first; + assert(o->op_type == OP_ENTER); + o = o->op_sibling; + } + + if (o->op_type != OP_STUB) { + assert( o->op_type == OP_NEXTSTATE + || o->op_type == OP_DBSTATE + || (o->op_type == OP_NULL + && ( o->op_targ == OP_NEXTSTATE + || o->op_targ == OP_DBSTATE + ) + ) + ); + PL_curcop = (COP*)o; + } + } + nop = nop->op_next; + DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, - " re_eval 0x%"UVxf"\n", PTR2UV(PL_op)) ); - /* wrap the call in two SAVECOMPPADs. This ensures that - * when the save stack is eventually unwound, all the - * accumulated SAVEt_CLEARSV's will be processed with - * interspersed SAVEt_COMPPAD's to ensure that lexicals - * are cleared in the right pad */ - SAVECOMPPAD(); - PAD_SAVE_LOCAL(old_comppad, (PAD*)rexi->data->data[n + 2]); - PL_regoffs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; + " re EVAL PL_op=0x%"UVxf"\n", PTR2UV(nop)) ); + + rex->offs[0].end = PL_reg_magic->mg_len = locinput - PL_bostr; if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); sv_setsv(sv_mrk, sv_yes_mark); } + /* we don't use MULTICALL here as we want to call the + * first op of the block of interest, rather than the + * first op of the sub */ + before = SP; + PL_op = nop; CALLRUNOPS(aTHX); /* Scalar context. */ SPAGAIN; if (SP == before) @@ -4253,91 +4447,107 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PUTBACK; } + /* before restoring everything, evaluate the returned + * value, so that 'uninit' warnings don't use the wrong + * PL_op or pad. Also need to process any magic vars + * (e.g. $1) *before* parentheses are restored */ + + PL_op = NULL; + + re_sv = NULL; + if (logical == 0) /* (?{})/ */ + sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ + else if (logical == 1) { /* /(?(?{...})X|Y)/ */ + sw = cBOOL(SvTRUE(ret)); + logical = 0; + } + else { /* /(??{}) */ + /* if its overloaded, let the regex compiler handle + * it; otherwise extract regex, or stringify */ + if (!SvAMAGIC(ret)) { + SV *sv = ret; + if (SvROK(sv)) + sv = SvRV(sv); + if (SvTYPE(sv) == SVt_REGEXP) + re_sv = (REGEXP*) sv; + else if (SvSMAGICAL(sv)) { + MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + if (mg) + re_sv = (REGEXP *) mg->mg_obj; + } + + /* force any magic, undef warnings here */ + if (!re_sv) { + ret = sv_mortalcopy(ret); + (void) SvPV_force_nolen(ret); + } + } + + } + Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + /* *** Note that at this point we don't restore + * PL_comppad, (or pop the CxSUB) on the assumption it may + * be used again soon. This is safe as long as nothing + * in the regexp code uses the pad ! */ PL_op = oop; - SAVECOMPPAD(); - PAD_RESTORE_LOCAL(old_comppad); PL_curcop = ocurcop; PL_regeol = saved_regeol; - if (!logical) { - /* /(?{...})/ */ - sv_setsv(save_scalar(PL_replgv), ret); + S_regcp_restore(aTHX_ rex, runops_cp); + + if (logical != 2) break; - } } - if (logical == 2) { /* Postponed subexpression: /(??{...})/ */ + + /* only /(??{})/ from now on */ logical = 0; { /* extract RE object from returned value; compiling if * necessary */ - MAGIC *mg = NULL; - REGEXP *rx = NULL; - - if (SvROK(ret)) { - SV *const sv = SvRV(ret); - if (SvTYPE(sv) == SVt_REGEXP) { - rx = (REGEXP*) sv; - } else if (SvSMAGICAL(sv)) { - mg = mg_find(sv, PERL_MAGIC_qr); - assert(mg); - } - } else if (SvTYPE(ret) == SVt_REGEXP) { - rx = (REGEXP*) ret; - } else if (SvSMAGICAL(ret)) { - if (SvGMAGICAL(ret)) { - /* I don't believe that there is ever qr magic - here. */ - assert(!mg_find(ret, PERL_MAGIC_qr)); - sv_unmagic(ret, PERL_MAGIC_qr); - } - else { - mg = mg_find(ret, PERL_MAGIC_qr); - /* testing suggests mg only ends up non-NULL for - scalars who were upgraded and compiled in the - else block below. In turn, this is only - triggered in the "postponed utf8 string" tests - in t/op/pat.t */ - } - } - - if (mg) { - rx = (REGEXP *) mg->mg_obj; /*XXX:dmq*/ - assert(rx); - } - if (rx) { - rx = reg_temp_copy(NULL, rx); + if (re_sv) { + re_sv = reg_temp_copy(NULL, re_sv); } else { U32 pm_flags = 0; const I32 osize = PL_regsize; - if (DO_UTF8(ret)) { - assert (SvUTF8(ret)); - } else if (SvUTF8(ret)) { - /* Not doing UTF-8, despite what the SV says. Is - this only if we're trapped in use 'bytes'? */ - /* Make a copy of the octet sequence, but without - the flag on, as the compiler now honours the - SvUTF8 flag on ret. */ + if (SvUTF8(ret) && IN_BYTES) { + /* In use 'bytes': make a copy of the octet + * sequence, but without the flag on */ STRLEN len; const char *const p = SvPV(ret, len); ret = newSVpvn_flags(p, len, SVs_TEMP); } - rx = CALLREGCOMP(ret, pm_flags); + if (rex->intflags & PREGf_USE_RE_EVAL) + pm_flags |= PMf_USE_RE_EVAL; + + /* if we got here, it should be an engine which + * supports compiling code blocks and stuff */ + assert(rex->engine && rex->engine->op_comp); + assert(!(scan->flags & ~RXf_PMf_COMPILETIME)); + re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL, + rex->engine, NULL, NULL, + /* copy /msix etc to inner pattern */ + scan->flags, + pm_flags); + if (!(SvFLAGS(ret) & (SVs_TEMP | SVs_PADTMP | SVf_READONLY | SVs_GMG))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ - sv_magic(ret, MUTABLE_SV(rx), PERL_MAGIC_qr, 0, 0); + sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } PL_regsize = osize; + /* safe to do now that any $1 etc has been + * interpolated into the new pattern string and + * compiled */ + S_regcp_restore(aTHX_ rex, runops_cp); } - re_sv = rx; - re = (struct regexp *)SvANY(rx); + re = (struct regexp *)SvANY(re_sv); } RXp_MATCH_COPIED_off(re); re->subbeg = rex->subbeg; @@ -4349,25 +4559,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ); startpoint = rei->program + 1; ST.close_paren = 0; /* only used for GOSUB */ - /* borrowed from regtry */ - if (PL_reg_start_tmpl <= re->nparens) { - PL_reg_start_tmpl = re->nparens*3/2 + 3; - if(PL_reg_start_tmp) - Renew(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - else - Newx(PL_reg_start_tmp, PL_reg_start_tmpl, char*); - } eval_recurse_doit: /* Share code with GOSUB below this line */ /* run the pattern returned from (??{...}) */ - ST.cp = regcppush(0); /* Save *all* the positions. */ + ST.cp = regcppush(rex, 0); /* Save *all* the positions. */ REGCP_SET(ST.lastcp); - PL_regoffs = re->offs; /* essentially NOOP on GOSUB */ - - /* see regtry, specifically PL_reglast(?:close)?paren is a pointer! (i dont know why) :dmq */ - PL_reglastparen = &re->lastparen; - PL_reglastcloseparen = &re->lastcloseparen; re->lastparen = 0; re->lastcloseparen = 0; @@ -4386,7 +4583,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.prev_rex = rex_sv; ST.prev_curlyx = cur_curlyx; - SETREX(rex_sv,re_sv); + rex_sv = re_sv; + SET_reg_curpm(rex_sv); rex = re; rexi = rei; cur_curlyx = NULL; @@ -4395,31 +4593,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) cur_eval = st; /* now continue from first node in postoned RE */ PUSH_YES_STATE_GOTO(EVAL_AB, startpoint); - /* NOTREACHED */ - } - /* logical is 1, /(?(?{...})X|Y)/ */ - sw = cBOOL(SvTRUE(ret)); - logical = 0; - break; + assert(0); /* NOTREACHED */ } case EVAL_AB: /* cleanup after a successful (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); regcpblow(ST.cp); cur_eval = ST.prev_eval; cur_curlyx = ST.prev_curlyx; - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; - /* also update PL_regoffs */ - PL_regoffs = rex->offs; - /* XXXX This is too dramatic a measure... */ PL_reg_maxiter = 0; if ( nochange_depth ) @@ -4430,13 +4617,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ /* note: this is called twice; first after popping B, then A */ PL_reg_flags ^= ST.toggle_reg_flags; - ReREFCNT_dec(rex_sv); - SETREX(rex_sv,ST.prev_rex); + rex_sv = ST.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; PL_reginput = locinput; REGCP_UNWIND(ST.lastcp); @@ -4452,20 +4636,41 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case OPEN: n = ARG(scan); /* which paren pair */ - PL_reg_start_tmp[n] = locinput; + rex->offs[n].start_tmp = locinput - PL_bostr; if (n > PL_regsize) PL_regsize = n; + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf" tmp; regsize=%"UVuf"\n", + PTR2UV(rex), + PTR2UV(rex->offs), + (UV)n, + (IV)rex->offs[n].start_tmp, + (UV)PL_regsize + )); lastopen = n; break; + +/* XXX really need to log other places start/end are set too */ +#define CLOSE_CAPTURE \ + rex->offs[n].start = rex->offs[n].start_tmp; \ + rex->offs[n].end = locinput - PL_bostr; \ + DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, \ + "rex=0x%"UVxf" offs=0x%"UVxf": \\%"UVuf": set %"IVdf"..%"IVdf"\n", \ + PTR2UV(rex), \ + PTR2UV(rex->offs), \ + (UV)n, \ + (IV)rex->offs[n].start, \ + (IV)rex->offs[n].end \ + )) + case CLOSE: n = ARG(scan); /* which paren pair */ - PL_regoffs[n].start = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if (cur_eval && cur_eval->u.eval.close_paren == n) { goto fake_end; } @@ -4480,14 +4685,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if ( OP(cursor)==CLOSE ){ n = ARG(cursor); if ( n <= lastopen ) { - PL_regoffs[n].start - = PL_reg_start_tmp[n] - PL_bostr; - PL_regoffs[n].end = locinput - PL_bostr; + CLOSE_CAPTURE; /*if (n > PL_regsize) PL_regsize = n;*/ - if (n > *PL_reglastparen) - *PL_reglastparen = n; - *PL_reglastcloseparen = n; + if (n > rex->lastparen) + rex->lastparen = n; + rex->lastcloseparen = n; if ( n == ARG(scan) || (cur_eval && cur_eval->u.eval.close_paren == n)) break; @@ -4499,7 +4702,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /*NOTREACHED*/ case GROUPP: n = ARG(scan); /* which paren pair */ - sw = cBOOL(*PL_reglastparen >= n && PL_regoffs[n].end != -1); + sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; case NGROUPP: /* reg_check_named_buff_matched returns 0 for no match */ @@ -4622,8 +4825,8 @@ NULL /* XXXX Probably it is better to teach regpush to support parenfloor > PL_regsize... */ - if (parenfloor > (I32)*PL_reglastparen) - parenfloor = *PL_reglastparen; /* Pessimization... */ + if (parenfloor > (I32)rex->lastparen) + parenfloor = rex->lastparen; /* Pessimization... */ ST.prev_curlyx= cur_curlyx; cur_curlyx = st; @@ -4641,19 +4844,19 @@ NULL PL_reginput = locinput; PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next)); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } case CURLYX_end: /* just finished matching all of A*B */ cur_curlyx = ST.prev_curlyx; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYX_end_fail: /* just failed to match all of A*B */ regcpblow(ST.cp); cur_curlyx = ST.prev_curlyx; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST @@ -4683,12 +4886,12 @@ NULL /* First just match a string of min A's. */ if (n < min) { - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_pre, A); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* If degenerate A matches "", assume A done. */ @@ -4759,37 +4962,37 @@ NULL if (cur_curlyx->u.curlyx.minmod) { ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; - ST.cp = regcppush(ST.save_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } /* Prefer A over B for maximal matching. */ if (n < max) { /* More greed allowed? */ - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_max, A); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } goto do_whilem_B_max; } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_min: /* just matched B in a minimal match */ case WHILEM_B_max: /* just matched B in a maximal match */ cur_curlyx = ST.save_curlyx; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_max_fail: /* just failed to match B in a maximal match */ cur_curlyx = ST.save_curlyx; cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ /* FALL THROUGH */ @@ -4799,7 +5002,7 @@ NULL cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_A_max_fail: /* just failed to match A in a maximal match */ REGCP_UNWIND(ST.lastcp); @@ -4815,8 +5018,9 @@ NULL && !(PL_reg_flags & RF_warned)) { PL_reg_flags |= RF_warned; - Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + Perl_warner(aTHX_ packWARN(WARN_REGEXP), + "Complex regular subexpression recursion limit (%d) " + "exceeded", REG_INFTY - 1); } @@ -4824,7 +5028,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ cur_curlyx = ST.save_curlyx; @@ -4839,8 +5043,8 @@ NULL { PL_reg_flags |= RF_warned; Perl_warner(aTHX_ packWARN(WARN_REGEXP), - "%s limit (%d) exceeded", - "Complex regular subexpression recursion", + "Complex regular subexpression recursion " + "limit (%d) exceeded", REG_INFTY - 1); } cur_curlyx->u.curlyx.count--; @@ -4853,11 +5057,11 @@ NULL /* Try grabbing another A and see if it helps. */ PL_reginput = locinput; cur_curlyx->u.curlyx.lastloc = locinput; - ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); + ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST #define ST st->u.branch @@ -4871,7 +5075,8 @@ NULL case BRANCH: /* /(...|A|...)/ */ scan = NEXTOPER(scan); /* scan now points to inner node */ - ST.lastparen = *PL_reglastparen; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; ST.next_branch = next; REGCP_SET(ST.cp); PL_reginput = locinput; @@ -4882,33 +5087,30 @@ NULL } else { PUSH_STATE_GOTO(BRANCH_next, scan); } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CUTGROUP: PL_reginput = locinput; sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(CUTGROUP_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CUTGROUP_next_fail: do_cutgroup = 1; no_final = 1; if (st->u.mark.mark_name) sv_commit = st->u.mark.mark_name; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case BRANCH_next: sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case BRANCH_next_fail: /* that branch failed; try the next, if any */ if (do_cutgroup) { do_cutgroup = 0; no_final = 0; } REGCP_UNWIND(ST.cp); - for (n = *PL_reglastparen; n > ST.lastparen; n--) - PL_regoffs[n].end = -1; - *PL_reglastparen = n; - /*dmq: *PL_reglastcloseparen = n; */ + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); scan = ST.next_branch; /* no more branches? */ if (!scan || (OP(scan) != BRANCH && OP(scan) != BRANCHJ)) { @@ -4922,7 +5124,7 @@ NULL sayNO_SILENT; } continue; /* execute next BRANCH[J] op */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MINMOD: minmod = 1; @@ -4942,13 +5144,14 @@ NULL ST.me = scan; scan = NEXTOPER(scan) + NODE_STEP_REGNODE; + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; + /* if paren positive, emulate an OPEN/CLOSE around A */ if (ST.me->flags) { U32 paren = ST.me->flags; if (paren > PL_regsize) PL_regsize = paren; - if (paren > *PL_reglastparen) - *PL_reglastparen = paren; scan += NEXT_OFF(scan); /* Skip former OPEN. */ } ST.A = scan; @@ -4966,7 +5169,7 @@ NULL curlym_do_A: /* execute the A in /A{m,n}B/ */ PL_reginput = locinput; PUSH_YES_STATE_GOTO(CURLYM_A, ST.A); /* match A */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ locinput = st->locinput; @@ -5042,6 +5245,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; @@ -5072,16 +5277,18 @@ NULL } if (ST.me->flags) { - /* mark current A as captured */ + /* emulate CLOSE: mark current A as captured */ I32 paren = ST.me->flags; if (ST.count) { - PL_regoffs[paren].start + rex->offs[paren].start = HOPc(PL_reginput, -ST.alen) - PL_bostr; - PL_regoffs[paren].end = PL_reginput - PL_bostr; - /*dmq: *PL_reglastcloseparen = paren; */ + rex->offs[paren].end = PL_reginput - PL_bostr; + if ((U32)paren > rex->lastparen) + rex->lastparen = paren; + rex->lastcloseparen = paren; } else - PL_regoffs[paren].end = -1; + rex->offs[paren].end = -1; if (cur_eval && cur_eval->u.eval.close_paren && cur_eval->u.eval.close_paren == (U32)ST.me->flags) { @@ -5093,10 +5300,11 @@ NULL } PUSH_STATE_GOTO(CURLYM_B, ST.B); /* match B */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ REGCP_UNWIND(ST.cp); + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); if (ST.minmod) { I32 max = ARG2(ST.me); if (max != REG_INFTY && ST.count == max) @@ -5116,12 +5324,17 @@ NULL #define CURLY_SETPAREN(paren, success) \ if (paren) { \ if (success) { \ - PL_regoffs[paren].start = HOPc(locinput, -1) - PL_bostr; \ - PL_regoffs[paren].end = locinput - PL_bostr; \ - *PL_reglastcloseparen = paren; \ + rex->offs[paren].start = HOPc(locinput, -1) - PL_bostr; \ + rex->offs[paren].end = locinput - PL_bostr; \ + if (paren > rex->lastparen) \ + rex->lastparen = paren; \ + rex->lastcloseparen = paren; \ + } \ + else { \ + rex->offs[paren].end = -1; \ + rex->lastparen = ST.lastparen; \ + rex->lastcloseparen = ST.lastcloseparen; \ } \ - else \ - PL_regoffs[paren].end = -1; \ } case STAR: /* /A*B/ where A is width 1 */ @@ -5138,10 +5351,10 @@ NULL goto repeat; case CURLYN: /* /(A){m,n}B/ where A is width 1 */ ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; + ST.lastcloseparen = rex->lastcloseparen; if (ST.paren > PL_regsize) PL_regsize = ST.paren; - if (ST.paren > *PL_reglastparen) - *PL_reglastparen = ST.paren; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ if (cur_eval && cur_eval->u.eval.close_paren && @@ -5196,6 +5409,8 @@ NULL switch (OP(text_node)) { case EXACTF: ST.c2 = PL_fold[ST.c1]; break; case EXACTFA: + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; default: ST.c2 = ST.c1; break; @@ -5203,25 +5418,12 @@ NULL } else { /* UTF_PATTERN */ if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) { - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; - U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; - - to_utf8_lower((U8*)s, tmpbuf1, &ulen1); - to_utf8_upper((U8*)s, tmpbuf2, &ulen2); -#ifdef EBCDIC - ST.c1 = utf8n_to_uvchr(tmpbuf1, UTF8_MAXLEN, 0, - ckWARN(WARN_UTF8) ? - 0 : UTF8_ALLOW_ANY); - ST.c2 = utf8n_to_uvchr(tmpbuf2, UTF8_MAXLEN, 0, - ckWARN(WARN_UTF8) ? - 0 : UTF8_ALLOW_ANY); -#else - ST.c1 = utf8n_to_uvuni(tmpbuf1, UTF8_MAXBYTES, 0, - uniflags); - ST.c2 = utf8n_to_uvuni(tmpbuf2, UTF8_MAXBYTES, 0, + STRLEN ulen; + U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; + + to_utf8_fold((U8*)s, tmpbuf, &ulen); + ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0, uniflags); -#endif } else { ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, @@ -5291,16 +5493,17 @@ NULL REGCP_SET(ST.cp); goto curly_try_B_max; } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLY_B_min_known_fail: /* failed to find B in a non-greedy match where c1,c2 valid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; PL_reginput = locinput; /* Could be reset... */ REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* Couldn't or didn't -- move forward. */ ST.oldloc = locinput; if (utf8_target) @@ -5369,15 +5572,16 @@ NULL } PUSH_STATE_GOTO(CURLY_B_min_known, ST.B); } - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case CURLY_B_min_fail: /* failed to find B in a non-greedy match where c1,c2 invalid */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* failed -- move forward one */ PL_reginput = locinput; if (regrepeat(rex, ST.A, 1, depth)) { @@ -5396,7 +5600,7 @@ NULL } } sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ curly_try_B_max: @@ -5415,16 +5619,17 @@ NULL if (ST.c1 == CHRTEST_VOID || c == (UV)ST.c1 || c == (UV)ST.c2) { CURLY_SETPAREN(ST.paren, ST.count); PUSH_STATE_GOTO(CURLY_B_max, ST.B); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } } /* FALL THROUGH */ case CURLY_B_max_fail: /* failed to find B in a greedy match */ - if (ST.paren && ST.count) - PL_regoffs[ST.paren].end = -1; REGCP_UNWIND(ST.cp); + if (ST.paren) { + UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); + } /* back up. */ if (--ST.count < ST.min) sayNO; @@ -5437,32 +5642,24 @@ NULL fake_end: if (cur_eval) { /* we've just finished A in /(??{A})B/; now continue with B */ - I32 tmpix; st->u.eval.toggle_reg_flags = cur_eval->u.eval.toggle_reg_flags; PL_reg_flags ^= st->u.eval.toggle_reg_flags; st->u.eval.prev_rex = rex_sv; /* inner */ - SETREX(rex_sv,cur_eval->u.eval.prev_rex); + st->u.eval.cp = regcppush(rex, 0); /* Save *all* the positions. */ + rex_sv = cur_eval->u.eval.prev_rex; + SET_reg_curpm(rex_sv); rex = (struct regexp *)SvANY(rex_sv); rexi = RXi_GET(rex); cur_curlyx = cur_eval->u.eval.prev_curlyx; - (void)ReREFCNT_inc(rex_sv); - st->u.eval.cp = regcppush(0); /* Save *all* the positions. */ - - /* rex was changed so update the pointer in PL_reglastparen and PL_reglastcloseparen */ - PL_reglastparen = &rex->lastparen; - PL_reglastcloseparen = &rex->lastcloseparen; REGCP_SET(st->u.eval.lastcp); PL_reginput = locinput; /* Restore parens of the outer rex without popping the * savestack */ - tmpix = PL_savestack_ix; - PL_savestack_ix = cur_eval->u.eval.lastcp; - regcppop(rex); - PL_savestack_ix = tmpix; + S_regcp_restore(aTHX_ rex, cur_eval->u.eval.lastcp); st->u.eval.prev_eval = cur_eval; cur_eval = cur_eval->u.eval.prev_eval; @@ -5539,7 +5736,7 @@ NULL /* execute body of (?...A) */ PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan))); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case IFMATCH_A_fail: /* body of (?...A) failed */ ST.wanted = !ST.wanted; @@ -5578,13 +5775,13 @@ NULL if (!scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(COMMIT_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case COMMIT_next_fail: no_final = 1; /* FALLTHROUGH */ case OPFAIL: sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #define ST st->u.mark case MARKPOINT: @@ -5594,11 +5791,11 @@ NULL mark_state = st; ST.mark_loc = PL_reginput = locinput; PUSH_YES_STATE_GOTO(MARKPOINT_next,next); - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MARKPOINT_next: mark_state = ST.prev_mark; sayYES; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case MARKPOINT_next_fail: if (popmark && sv_eq(ST.mark_name,popmark)) { @@ -5618,7 +5815,7 @@ NULL sv_yes_mark = mark_state ? mark_state->u.mark.mark_name : NULL; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ case SKIP: PL_reginput = locinput; if (scan->flags) { @@ -5662,29 +5859,8 @@ NULL } no_final = 1; sayNO; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ #undef ST - case FOLDCHAR: - n = ARG(scan); - if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { - locinput += ln; - } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) { - sayNO; - } else { - U8 folded[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen; - const char * const l = locinput; - char *e = PL_regeol; - to_uni_fold(n, folded, &foldlen); - - if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1, - l, &e, 0, utf8_target)) { - sayNO; - } - locinput = e; - } - nextchr = UCHARAT(locinput); - break; case LNBREAK: if ((n=is_LNBREAK(locinput,utf8_target))) { locinput += n; @@ -5728,7 +5904,7 @@ NULL /* switch break jumps here */ scan = next; /* prepare to execute the next op and ... */ continue; /* ... jump back to the top, reusing st */ - /* NOTREACHED */ + assert(0); /* NOTREACHED */ push_yes_state: /* push a state that backtracks on success */ @@ -5772,7 +5948,7 @@ NULL nextchr = UCHARAT(locinput); st = newst; continue; - /* NOTREACHED */ + assert(0); /* NOTREACHED */ } } @@ -5831,7 +6007,7 @@ yes: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch successful!%s\n", PL_colors[4], PL_colors[5])); - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { /* each successfully executed (?{...}) block does the equivalent of * local $^R = do {...} * When popping the save stack, all these locals would be undone; @@ -5897,6 +6073,13 @@ no_silent: sv_setsv(sv_mrk, sv_yes_mark); } + + if (last_pushed_cv) { + dSP; + POP_MULTICALL; + PERL_UNUSED_VAR(SP); + } + /* clean up; in particular, free all slabs above current one */ LEAVE_SCOPE(oldsave); @@ -5915,11 +6098,11 @@ STATIC I32 S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) { dVAR; - register char *scan; - register I32 c; - register char *loceol = PL_regeol; - register I32 hardcount = 0; - register bool utf8_target = PL_reg_match_utf8; + char *scan; + I32 c; + char *loceol = PL_regeol; + I32 hardcount = 0; + bool utf8_target = PL_reg_match_utf8; UV utf8_flags; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); @@ -5980,7 +6163,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) /* Here, the string is utf8, and the pattern char is different * in utf8 than not, so can't compare them directly. Outside the - * loop, find find the two utf8 bytes that represent c, and then + * loop, find the two utf8 bytes that represent c, and then * look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); @@ -6006,8 +6189,13 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) goto do_exactf; case EXACTF: + utf8_flags = 0; + goto do_exactf; + + case EXACTFU_SS: + case EXACTFU_TRICKYFOLD: case EXACTFU: - utf8_flags = 0; + utf8_flags = (UTF_PATTERN) ? FOLDEQ_S2_ALREADY_FOLDED : 0; /* The comments for the EXACT case above apply as well to these fold * ones */ @@ -6016,7 +6204,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) c = (U8)*STRING(p); assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - if (utf8_target) { /* Use full Unicode fold matching */ + if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */ char *tmpeol = loceol; while (hardcount < max && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, @@ -6047,6 +6235,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) switch (OP(p)) { case EXACTF: folded = PL_fold[c]; break; case EXACTFA: + case EXACTFU_TRICKYFOLD: case EXACTFU: folded = PL_fold_latin1[c]; break; case EXACTFL: folded = PL_fold_locale[c]; break; default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); @@ -6145,6 +6334,24 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + + case POSIXA: + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + break; + case NPOSIXA: + if (utf8_target) { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + scan++; + } + } + break; case NALNUMA: if (utf8_target) { while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { @@ -6447,20 +6654,39 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) /* -- regclass_swash - prepare the utf8 swash -*/ - +- regclass_swash - prepare the utf8 swash. Wraps the shared core version to +create a copy so that changes the caller makes won't change the shared one + */ SV * Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) { + PERL_ARGS_ASSERT_REGCLASS_SWASH; + return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp)); +} +#endif + +STATIC SV * +S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +{ + /* Returns the swash for the input 'node' in the regex 'prog'. + * If is true, will attempt to create the swash if not already + * done. + * If is non-null, will return the swash initialization string in + * it. + * If is non-null, will return the alternates to the regular swash + * in it + * Tied intimately to how regcomp.c sets up the data structure */ + dVAR; SV *sw = NULL; SV *si = NULL; SV *alt = NULL; + SV* invlist = NULL; + RXi_GET_DECL(prog,progi); const struct reg_data * const data = prog ? progi->data : NULL; - PERL_ARGS_ASSERT_REGCLASS_SWASH; + PERL_ARGS_ASSERT_CORE_REGCLASS_SWASH; assert(ANYOF_NONBITMAP(node)); @@ -6471,34 +6697,75 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool SV * const rv = MUTABLE_SV(data->data[n]); AV * const av = MUTABLE_AV(SvRV(rv)); SV **const ary = AvARRAY(av); - SV **a, **b; + U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST; - /* See the end of regcomp.c:S_regclass() for - * documentation of these array elements. */ - - si = *ary; - a = SvROK(ary[1]) ? &ary[1] : NULL; - b = SvTYPE(ary[2]) == SVt_PVAV ? &ary[2] : NULL; + si = *ary; /* ary[0] = the string to initialize the swash with */ + + /* Elements 3 and 4 are either both present or both absent. [3] is + * any inversion list generated at compile time; [4] indicates if + * that inversion list has any user-defined properties in it. */ + if (av_len(av) >= 3) { + invlist = ary[3]; + if (SvUV(ary[4])) { + swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; + } + } + else { + invlist = NULL; + } - if (a) - sw = *a; + /* Element [1] is reserved for the set-up swash. If already there, + * return it; if not, create it and store it there */ + if (SvROK(ary[1])) { + sw = ary[1]; + } else if (si && doinit) { - sw = swash_init("utf8", "", si, 1, 0); + + sw = _core_swash_init("utf8", /* the utf8 package */ + "", /* nameless */ + si, + 1, /* binary */ + 0, /* not from tr/// */ + invlist, + &swash_init_flags); (void)av_store(av, 1, sw); } - if (b) - alt = *b; + + /* Element [2] is for any multi-char folds. Note that is a + * fundamentally flawed design, because can't backtrack and try + * again. See [perl #89774] */ + if (SvTYPE(ary[2]) == SVt_PVAV) { + alt = ary[2]; + } } } - if (listsvp) - *listsvp = si; + if (listsvp) { + SV* matches_string = newSVpvn("", 0); + + /* Use the swash, if any, which has to have incorporated into it all + * possibilities */ + if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) + && (si && si != &PL_sv_undef)) + { + + /* If no swash, use the input initialization string, if available */ + sv_catsv(matches_string, si); + } + + /* Add the inversion list to whatever we have. This may have come from + * the swash, or from an input parameter */ + if (invlist) { + sv_catsv(matches_string, _invlist_contents(invlist)); + } + *listsvp = matches_string; + } + if (altsvp) *altsvp = alt; return sw; } -#endif /* - reginclass - determine if a character falls into a character class @@ -6589,8 +6856,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || @@ -6607,8 +6874,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK_LC(c)) ) /* How's that for a conditional? */ ) { match = TRUE; @@ -6637,7 +6904,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, || (flags & ANYOF_IS_SYNTHETIC))))) { AV *av; - SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); + SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av); if (sw) { U8 * utf8_p; @@ -6802,9 +7069,18 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (! utf8_target) Safefree(utf8_p); } } + + if (UNICODE_IS_SUPER(c) + && (flags & ANYOF_WARN_SUPER) + && ckWARN_d(WARN_NON_UNICODE)) + { + Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE), + "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", c); + } } - return (flags & ANYOF_INVERT) ? !match : match; + /* The xor complements the return if to invert: 1^1 = 0, 1^0 = 1 */ + return cBOOL(flags & ANYOF_INVERT) ^ match; } STATIC U8 * @@ -6904,7 +7180,7 @@ restore_pos(pTHX_ void *arg) { dVAR; regexp * const rex = (regexp *)arg; - if (PL_reg_eval_set) { + if (PL_reg_state.re_state_eval_setup_done) { if (PL_reg_oldsaved) { rex->subbeg = PL_reg_oldsaved; rex->sublen = PL_reg_oldsavedlen; @@ -6914,7 +7190,7 @@ restore_pos(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } PL_reg_magic->mg_len = PL_reg_oldpos; - PL_reg_eval_set = 0; + PL_reg_state.re_state_eval_setup_done = FALSE; PL_curpm = PL_reg_oldcurpm; } } @@ -6987,8 +7263,8 @@ S_to_byte_substr(pTHX_ register regexp *prog) * Local variables: * c-indentation-style: bsd * c-basic-offset: 4 - * indent-tabs-mode: t + * indent-tabs-mode: nil * End: * - * ex: set ts=8 sts=4 sw=4 noet: + * ex: set ts=8 sts=4 sw=4 et: */