X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a76b0e900c94a68cb2f0d142538e1a9bb0d907d6..7ff1117359e03ce00638e9ee1daad537321e75d6:/regexec.c diff --git a/regexec.c b/regexec.c index 776f8f0..fab9009 100644 --- a/regexec.c +++ b/regexec.c @@ -37,16 +37,6 @@ #include "re_top.h" #endif -/* At least one required character in the target string is expressible only in - * UTF-8. */ -static const char* const non_utf8_target_but_utf8_required - = "Can't match, because target string needs to be in UTF-8\n"; - -#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ - goto target; \ -} STMT_END - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -93,6 +83,18 @@ static const char* const non_utf8_target_but_utf8_required #include "inline_invlist.c" #include "unicode_constants.h" +#ifdef DEBUGGING +/* At least one required character in the target string is expressible only in + * UTF-8. */ +static const char* const non_utf8_target_but_utf8_required + = "Can't match, because target string needs to be in UTF-8\n"; +#endif + +#define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%s", non_utf8_target_but_utf8_required));\ + goto target; \ +} STMT_END + #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) #ifndef STATIC @@ -165,13 +167,13 @@ static const char* const non_utf8_target_but_utf8_required #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS_DEBUG_TEST( \ PL_utf8_swash_ptrs[_CC_WORDCHAR], \ swash_property_names[_CC_WORDCHAR], \ - GREEK_SMALL_LETTER_IOTA_UTF8) + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); #define LOAD_UTF8_CHARCLASS_GCB() /* Grapheme cluster boundaries */ \ STMT_START { \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_regular_begin, \ "_X_regular_begin", \ - GREEK_SMALL_LETTER_IOTA_UTF8); \ + LATIN_CAPITAL_LETTER_SHARP_S_UTF8); \ LOAD_UTF8_CHARCLASS_DEBUG_TEST(PL_utf8_X_extend, \ "_X_extend", \ COMBINING_GRAVE_ACCENT_UTF8); \ @@ -207,13 +209,13 @@ static const char* const non_utf8_target_but_utf8_required /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn)==EXACTFA || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) -#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFU_TRICKYFOLD || OP(rn) == EXACTFA) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -296,8 +298,8 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen) ); for (p = parenfloor+1; p <= (I32)maxopenparen; p++) { /* REGCP_PARENS_ELEMS are pushed per pairs of parentheses. */ - SSPUSHINT(rex->offs[p].end); - SSPUSHINT(rex->offs[p].start); + SSPUSHIV(rex->offs[p].end); + SSPUSHIV(rex->offs[p].start); SSPUSHINT(rex->offs[p].start_tmp); DEBUG_BUFFERS_r(PerlIO_printf(Perl_debug_log, " \\%"UVuf": %"IVdf"(%"IVdf")..%"IVdf"\n", @@ -369,10 +371,10 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p) ); paren = *maxopenparen_p; for ( ; i > 0; i -= REGCP_PAREN_ELEMS) { - I32 tmps; + SSize_t tmps; rex->offs[paren].start_tmp = SSPOPINT; - rex->offs[paren].start = SSPOPINT; - tmps = SSPOPINT; + rex->offs[paren].start = SSPOPIV; + tmps = SSPOPIV; if (paren <= rex->lastparen) rex->offs[paren].end = tmps; DEBUG_BUFFERS_r( PerlIO_printf(Perl_debug_log, @@ -484,7 +486,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_UNI(*character, *(character + 1))); + TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); } if (classnum < _FIRST_NON_SWASH_CC) { @@ -526,7 +528,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) */ I32 Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, - char *strbeg, I32 minend, SV *screamer, U32 nosave) + char *strbeg, SSize_t minend, SV *screamer, U32 nosave) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -557,13 +559,9 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, char *strend, * with giant delta may be not rechecked). */ -/* Assumptions: if ANCH_GPOS, then strpos is anchored. XXXX Check GPOS logic */ - /* If SCREAM, then SvPVX_const(sv) should be compatible with strpos and strend. Otherwise, only SvCUR(sv) is used to get strbeg. */ -/* XXXX We assume that strpos is strbeg unless sv. */ - /* XXXX Some places assume that there is a fixed substring. An update may be needed if optimizer marks as "INTUITable" RExen without fixed substrings. Similarly, it is assumed that @@ -619,9 +617,9 @@ Perl_re_intuit_start(pTHX_ { dVAR; struct regexp *const prog = ReANY(rx); - I32 start_shift = 0; + SSize_t start_shift = 0; /* Should be nonnegative! */ - I32 end_shift = 0; + SSize_t end_shift = 0; char *s; SV *check; char *t; @@ -671,15 +669,24 @@ Perl_re_intuit_start(pTHX_ } check = prog->check_substr; } - if (prog->extflags & RXf_ANCH) { /* Match at beg-of-str or after \n */ + if (prog->extflags & RXf_ANCH) { /* Match at \G, beg-of-str or after \n */ ml_anch = !( (prog->extflags & RXf_ANCH_SINGLE) || ( (prog->extflags & RXf_ANCH_BOL) && !multiline ) ); /* Check after \n? */ if (!ml_anch) { - if ( !(prog->extflags & RXf_ANCH_GPOS) /* Checked by the caller */ - && !(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ - && (strpos != strbeg)) { + /* we are only allowed to match at BOS or \G */ + + if (prog->extflags & RXf_ANCH_GPOS) { + /* in this case, we hope(!) that the caller has already + * set strpos to pos()-gofs, and will already have checked + * that this anchor position is legal + */ + ; + } + else if (!(prog->intflags & PREGf_IMPLICIT) /* not a real BOL */ + && (strpos != strbeg)) + { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } @@ -690,7 +697,7 @@ Perl_re_intuit_start(pTHX_ See [perl #115242] */ { /* Substring at constant offset from beg-of-str... */ - I32 slen; + SSize_t slen; s = HOP3c(strpos, prog->check_offset_min, strend); @@ -726,9 +733,9 @@ Perl_re_intuit_start(pTHX_ end_shift = prog->check_end_shift; if (!ml_anch) { - const I32 end = prog->check_offset_max + CHR_SVLEN(check) + const SSize_t end = prog->check_offset_max + CHR_SVLEN(check) - (SvTAIL(check) != 0); - const I32 eshift = CHR_DIST((U8*)strend, (U8*)s) - end; + const SSize_t eshift = CHR_DIST((U8*)strend, (U8*)s) - end; if (end_shift < eshift) end_shift = eshift; @@ -743,7 +750,7 @@ Perl_re_intuit_start(pTHX_ /* end shift should be non negative here */ } -#ifdef QDEBUGGING /* 7/99: reports of failure (with the older version) */ +#ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %"IVdf" pattern:\n%s\n ", (IV)end_shift, RX_PRECOMP(prog)); @@ -754,8 +761,8 @@ Perl_re_intuit_start(pTHX_ the "check" substring in the region corrected by start/end_shift. */ { - I32 srch_start_shift = start_shift; - I32 srch_end_shift = end_shift; + SSize_t srch_start_shift = start_shift; + SSize_t srch_end_shift = end_shift; U8* start_point; U8* end_point; if (srch_start_shift < 0 && strbeg - s > srch_start_shift) { @@ -1230,45 +1237,55 @@ Perl_re_intuit_start(pTHX_ } #define DECL_TRIE_TYPE(scan) \ - const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ + const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \ + trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \ trie_type = ((scan->flags == EXACT) \ ? (utf8_target ? trie_utf8 : trie_plain) \ - : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) + : (scan->flags == EXACTFA) \ + ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \ + : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold)) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \ STMT_START { \ STRLEN skiplen; \ + U8 flags = FOLD_FLAGS_FULL; \ switch (trie_type) { \ + case trie_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ - uvc = to_utf8_fold( (const U8*) uc, foldbuf, &foldlen ); \ + uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags, NULL); \ len = UTF8SKIP(uc); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ + case trie_latin_utf8_exactfa_fold: \ + flags |= FOLD_FLAGS_NOMIX_ASCII; \ + /* FALL THROUGH */ \ case trie_latin_utf8_fold: \ if ( foldlen>0 ) { \ - uvc = utf8n_to_uvuni( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \ foldlen -= len; \ uscan += len; \ len=0; \ } else { \ len = 1; \ - uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, FOLD_FLAGS_FULL); \ + uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ skiplen = UNISKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ break; \ case trie_utf8: \ - uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ + uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \ break; \ case trie_plain: \ uvc = (UV)*uc; \ @@ -1490,6 +1507,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: if (is_utf8_pat || utf8_target) { utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; @@ -1499,10 +1519,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1; /* /a, except the sharp s one which */ goto do_exactf_non_utf8; /* isn't dealt with by these */ - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! is_utf8_pat); if (utf8_target) { - - /* regcomp.c already folded this if pattern is in UTF-8 */ utf8_fold_flags = 0; goto do_exactf_utf8; } @@ -1525,7 +1544,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } goto do_exactf_utf8; - case EXACTFU_TRICKYFOLD: case EXACTFU: if (is_utf8_pat || utf8_target) { utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -1558,7 +1576,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * characters, and there are only 2 availabe, we know without * trying that it will fail; so don't start a match past the * required minimum number from the far end */ - e = HOP3c(strend, -((I32)ln), s); + e = HOP3c(strend, -((SSize_t)ln), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1604,7 +1622,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * only 2 are left, it's guaranteed to fail, so don't start a * match that would require us to go beyond the end of the string */ - e = HOP3c(strend, -((I32)lnc), s); + e = HOP3c(strend, -((SSize_t)lnc), s); if (reginfo->intuit && e < s) { e = s; /* Due to minlen logic of intuit() */ @@ -1632,13 +1650,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case BOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_BOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case NBOUNDL: RXp_MATCH_TAINTED_on(prog); FBC_NBOUND(isWORDCHAR_LC, - isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)), + isWORDCHAR_LC_uvchr(tmp), isWORDCHAR_LC_utf8((U8*)s)); break; case BOUND: @@ -1749,7 +1767,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_UNI(*s, *(s + 1)), + _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + *(s + 1)), classnum)))) { if (tmp && (reginfo->intuit || regtry(reginfo, &s))) @@ -2071,31 +2090,41 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, "Copy on write: regexp capture, type %d\n", (int) SvTYPE(sv)); } - /* skip creating new COW SV if a valid one already exists */ - if (! ( prog->saved_copy - && SvIsCOW(sv) - && SvPOKp(sv) - && SvIsCOW(prog->saved_copy) - && SvPOKp(prog->saved_copy) - && SvPVX(sv) == SvPVX(prog->saved_copy))) + /* Create a new COW SV to share the match string and store + * in saved_copy, unless the current COW SV in saved_copy + * is valid and suitable for our purpose */ + if (( prog->saved_copy + && SvIsCOW(prog->saved_copy) + && SvPOKp(prog->saved_copy) + && SvIsCOW(sv) + && SvPOKp(sv) + && SvPVX(sv) == SvPVX(prog->saved_copy))) { + /* just reuse saved_copy SV */ + if (RXp_MATCH_COPIED(prog)) { + Safefree(prog->subbeg); + RXp_MATCH_COPIED_off(prog); + } + } + else { + /* create new COW SV to share string */ RX_MATCH_COPY_FREE(rx); prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); - prog->subbeg = (char *)SvPVX_const(prog->saved_copy); - assert (SvPOKp(prog->saved_copy)); } + prog->subbeg = (char *)SvPVX_const(prog->saved_copy); + assert (SvPOKp(prog->saved_copy)); prog->sublen = strend - strbeg; prog->suboffset = 0; prog->subcoffset = 0; } else #endif { - I32 min = 0; - I32 max = strend - strbeg; - I32 sublen; + SSize_t min = 0; + SSize_t max = strend - strbeg; + SSize_t sublen; if ( (flags & REXEC_COPY_SKIP_POST) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ && !(PL_sawampersand & SAWAMPERSAND_RIGHT) ) { /* don't copy $' part of string */ U32 n = 0; @@ -2116,7 +2145,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, } if ( (flags & REXEC_COPY_SKIP_PRE) - && !(RX_EXTFLAGS(rx) & RXf_PMf_KEEPCOPY) /* //p */ + && !(prog->extflags & RXf_PMf_KEEPCOPY) /* //p */ && !(PL_sawampersand & SAWAMPERSAND_LEFT) ) { /* don't copy $` part of string */ U32 n = 0; @@ -2172,7 +2201,8 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, * $x = "\x{100}" x 1E6; 1 while $x =~ /(.)/g; * from going quadratic */ if (SvPOKp(sv) && SvPVX(sv) == strbeg) - sv_pos_b2u(sv, &(prog->subcoffset)); + prog->subcoffset = sv_pos_b2u_flags(sv, prog->subcoffset, + SV_GMAGIC|SV_CONST_RETURN); else prog->subcoffset = utf8_length((U8*)strbeg, (U8*)(strbeg+prog->suboffset)); @@ -2195,7 +2225,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, */ I32 Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, - char *strbeg, I32 minend, SV *sv, void *data, U32 flags) + char *strbeg, SSize_t minend, SV *sv, void *data, U32 flags) /* stringarg: the point in the string at which to begin matching */ /* strend: pointer to null at end of string */ /* strbeg: real beginning of string */ @@ -2203,21 +2233,17 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* sv: SV being matched: only used for utf8 flag, pos() etc; string * itself is accessed via the pointers above */ /* data: May be used for some additional optimizations. - Currently its only used, with a U32 cast, for transmitting - the ganch offset when doing a /g match. This will change */ -/* nosave: For optimizations. */ + Currently unused. */ +/* flags: For optimizations. See REXEC_* in regexp.h */ { dVAR; struct regexp *const prog = ReANY(rx); char *s; regnode *c; - char *startpos = stringarg; - I32 minlen; /* must match at least this many chars */ - I32 dontbother = 0; /* how many characters not to try at end */ - I32 end_shift = 0; /* Same for the end. */ /* CC */ - I32 scream_pos = -1; /* Internal iterator of scream. */ - char *scream_olds = NULL; + char *startpos; + SSize_t minlen; /* must match at least this many chars */ + SSize_t dontbother = 0; /* how many characters not to try at end */ const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); @@ -2231,58 +2257,126 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PERL_UNUSED_ARG(data); /* Be paranoid... */ - if (prog == NULL || startpos == NULL) { + if (prog == NULL || stringarg == NULL) { Perl_croak(aTHX_ "NULL regexp parameter"); return 0; } DEBUG_EXECUTE_r( - debug_start_match(rx, utf8_target, startpos, strend, + debug_start_match(rx, utf8_target, stringarg, strend, "Matching"); ); - if ((RX_EXTFLAGS(rx) & RXf_USE_INTUIT) + startpos = stringarg; + + if (prog->extflags & RXf_GPOS_SEEN) { + MAGIC *mg; + + /* set reginfo->ganch, the position where \G can match */ + + reginfo->ganch = + (flags & REXEC_IGNOREPOS) + ? stringarg /* use start pos rather than pos() */ + : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0) + /* Defined pos(): */ + ? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg) + : strbeg; /* pos() not defined; use start of string */ + + DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, + "GPOS ganch set to strbeg[%"IVdf"]\n", (IV)(reginfo->ganch - strbeg))); + + /* in the presence of \G, we may need to start looking earlier in + * the string than the suggested start point of stringarg: + * if prog->gofs is set, then that's a known, fixed minimum + * offset, such as + * /..\G/: gofs = 2 + * /ab|c\G/: gofs = 1 + * or if the minimum offset isn't known, then we have to go back + * to the start of the string, e.g. /w+\G/ + */ + + if (prog->extflags & RXf_ANCH_GPOS) { + startpos = reginfo->ganch - prog->gofs; + if (startpos < + ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } + } + else if (prog->gofs) { + if (startpos - prog->gofs < strbeg) + startpos = strbeg; + else + startpos -= prog->gofs; + } + else if (prog->extflags & RXf_GPOS_FLOAT) + startpos = strbeg; + } + + minlen = prog->minlen; + if ((startpos + minlen) > strend || startpos < strbeg) { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "Regex match can't succeed, so not even tried\n")); + return 0; + } + + /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), + * which will call destuctors to reset PL_regmatch_state, free higher + * PL_regmatch_slabs, and clean up regmatch_info_aux and + * regmatch_info_aux_eval */ + + oldsave = PL_savestack_ix; + + s = startpos; + + if ((prog->extflags & RXf_USE_INTUIT) && !(flags & REXEC_CHECKED)) { - stringarg = re_intuit_start(rx, sv, strbeg, stringarg, strend, + s = re_intuit_start(rx, sv, strbeg, startpos, strend, flags, NULL); - if (!stringarg) + if (!s) return 0; - if (RX_EXTFLAGS(rx) & RXf_CHECK_ALL) { + if (prog->extflags & RXf_CHECK_ALL) { /* we can match based purely on the result of INTUIT. * Set up captures etc just for $& and $-[0] * (an intuit-only match wont have $1,$2,..) */ assert(!prog->nparens); + + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (s < stringarg)) + { + /* this should only be possible under \G */ + assert(prog->extflags & RXf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + /* match via INTUIT shouldn't have any captures. * Let @-, @+, $^N know */ prog->lastparen = prog->lastcloseparen = 0; RX_MATCH_UTF8_set(rx, utf8_target); + prog->offs[0].start = s - strbeg; + prog->offs[0].end = utf8_target + ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg + : s - strbeg + prog->minlenret; if ( !(flags & REXEC_NOT_FIRST) ) S_reg_set_capture_string(aTHX_ rx, strbeg, strend, sv, flags, utf8_target); - prog->offs[0].start = stringarg - strbeg; - prog->offs[0].end = utf8_target - ? (char*)utf8_hop((U8*)stringarg, prog->minlenret) - strbeg - : stringarg - strbeg + prog->minlenret; return 1; } } - - /* at the end of this function, we'll do a LEAVE_SCOPE(oldsave), - * which will call destuctors to reset PL_regmatch_state, free higher - * PL_regmatch_slabs, and clean up regmatch_info_aux and - * regmatch_info_aux_eval */ - - oldsave = PL_savestack_ix; - multiline = prog->extflags & RXf_PMf_MULTILINE; - minlen = prog->minlen; - if (strend - startpos < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "String too short [regexec_flags]...\n")); goto phooey; @@ -2305,7 +2399,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->poscache_maxiter = 0; /* not yet started a countdown */ reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ - reginfo->till = startpos+minend; + reginfo->till = stringarg + minend; if (prog->extflags & RXf_EVAL_SEEN && SvPADTMP(sv) && !IS_PADGV(sv)) { /* SAVEFREESV, not sv_mortalcopy, as this SV must last until after @@ -2315,7 +2409,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, Not newSVsv, either, as it does not COW. */ reginfo->sv = newSV(0); - sv_setsv(reginfo->sv, sv); + SvSetSV_nosteal(reginfo->sv, sv); SAVEFREESV(reginfo->sv); } @@ -2370,41 +2464,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } /* If there is a "must appear" string, look for it. */ - s = startpos; - if (prog->extflags & RXf_GPOS_SEEN) { /* Need to set reginfo->ganch */ - MAGIC *mg; - if (flags & REXEC_IGNOREPOS){ /* Means: check only at start */ - reginfo->ganch = startpos + prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS IGNOREPOS: reginfo->ganch = startpos + %"UVxf"\n",(UV)prog->gofs)); - } else if (sv && (mg = mg_find_mglob(sv)) - && mg->mg_len >= 0) { - reginfo->ganch = strbeg + mg->mg_len; /* Defined pos() */ - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS MAGIC: reginfo->ganch = strbeg + %"IVdf"\n",(IV)mg->mg_len)); - - if (prog->extflags & RXf_ANCH_GPOS) { - if (s > reginfo->ganch) - goto phooey; - s = reginfo->ganch - prog->gofs; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS ANCH_GPOS: s = ganch - %"UVxf"\n",(UV)prog->gofs)); - if (s < strbeg) - goto phooey; - } - } - else if (data) { - reginfo->ganch = strbeg + PTR2UV(data); - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS DATA: reginfo->ganch= strbeg + %"UVxf"\n",PTR2UV(data))); - - } else { /* pos() not defined */ - reginfo->ganch = strbeg; - DEBUG_GPOS_r(PerlIO_printf(Perl_debug_log, - "GPOS: reginfo->ganch = strbeg\n")); - } - } if (PL_curpm && (PM_GETRE(PL_curpm) == rx)) { /* We have to be careful. If the previous successful match was from this regex we don't want a subsequent partially @@ -2423,24 +2483,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, PTR2UV(prog->offs) )); } - if (!(flags & REXEC_CHECKED) && (prog->check_substr != NULL || prog->check_utf8 != NULL)) { - re_scream_pos_data d; - - d.scream_olds = &scream_olds; - d.scream_pos = &scream_pos; - s = re_intuit_start(rx, sv, strbeg, s, strend, flags, &d); - if (!s) { - DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not present...\n")); - goto phooey; /* not present */ - } - } - - /* Simplest case: anchored match need be tried only once. */ /* [unless only anchor is BOL and multiline is set] */ if (prog->extflags & (RXf_ANCH & ~RXf_ANCH_GPOS)) { - if (s == startpos && regtry(reginfo, &startpos)) + if (s == startpos && regtry(reginfo, &s)) goto got_it; else if (multiline || (prog->intflags & PREGf_IMPLICIT) || (prog->extflags & RXf_ANCH_MBOL)) /* XXXX SBOL? */ @@ -2519,12 +2566,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { - /* the warning about reginfo->ganch being used without initialization - is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN - and we only enter this block when the same bit is set. */ - char *tmp_s = reginfo->ganch - prog->gofs; - - if (tmp_s >= strbeg && regtry(reginfo, &tmp_s)) + /* For anchored \G, the only position it can match from is + * (ganch-gofs); we already set startpos to this above; if intuit + * moved us on from there, we can't possibly succeed */ + assert(startpos == reginfo->ganch - prog->gofs); + if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; } @@ -2580,8 +2626,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, || ((prog->float_substr != NULL || prog->float_utf8 != NULL) && prog->float_max_offset < strend - s)) { SV *must; - I32 back_max; - I32 back_min; + SSize_t back_max; + SSize_t back_min; char *last; char *last1; /* Last position checked before */ #ifdef DEBUGGING @@ -2626,7 +2672,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, last = strend; } else { last = HOP3c(strend, /* Cannot start after this */ - -(I32)(CHR_SVLEN(must) + -(SSize_t)(CHR_SVLEN(must) - (SvTAIL(must) != 0) + back_min), strbeg); } if (s > reginfo->strbeg) @@ -2636,8 +2682,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* XXXX check_substr already used to find "s", can optimize if check_substr==must. */ - scream_pos = -1; - dontbother = end_shift; + dontbother = 0; strend = HOPc(strend, -dontbother); while ( (s <= last) && (s = fbm_instr((unsigned char*)HOP3(s, back_min, (back_min<0 ? strbeg : strend)), @@ -2830,6 +2875,18 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, goto phooey; got_it: + /* s/// doesn't like it if $& is earlier than where we asked it to + * start searching (which can happen on something like /.\G/) */ + if ( (flags & REXEC_FAIL_ON_UNDERFLOW) + && (prog->offs[0].start < stringarg - strbeg)) + { + /* this should only be possible under \G */ + assert(prog->extflags & RXf_GPOS_SEEN); + DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, + "matched, but failing for REXEC_FAIL_ON_UNDERFLOW\n")); + goto phooey; + } + DEBUG_BUFFERS_r( if (swap) PerlIO_printf(Perl_debug_log, @@ -2904,7 +2961,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) CHECKPOINT lastcp; REGEXP *const rx = reginfo->prog; regexp *const prog = ReANY(rx); - I32 result; + SSize_t result; RXi_GET_DECL(prog,progi); GET_RE_DEBUG_FLAGS_DECL; @@ -3398,7 +3455,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, SV** listp; if (! PL_utf8_foldclosures) { if (! PL_utf8_tofold) { - U8 dummy[UTF8_MAXBYTES+1]; + U8 dummy[UTF8_MAXBYTES_CASE+1]; /* Force loading this by folding an above-Latin1 char */ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); @@ -3451,7 +3508,8 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, * which is the one above 255 */ if ((c1 < 256) != (c2 < 256)) { if (OP(text_node) == EXACTFL - || (OP(text_node) == EXACTFA + || ((OP(text_node) == EXACTFA + || OP(text_node) == EXACTFA_NO_TRIE) && (isASCII(c1) || isASCII(c2)))) { if (c1 < 256) { @@ -3469,7 +3527,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, if (utf8_target && HAS_NONLATIN1_FOLD_CLOSURE(c1) && OP(text_node) != EXACTFL - && (OP(text_node) != EXACTFA || ! isASCII(c1))) + && ((OP(text_node) != EXACTFA + && OP(text_node) != EXACTFA_NO_TRIE) + || ! isASCII(c1))) { /* Here, there could be something above Latin1 in the target which * folds to this character in the pattern. All such cases except @@ -3490,7 +3550,9 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, c2 = PL_fold_locale[c1]; break; - case EXACTF: + case EXACTF: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); if (! utf8_target) { /* /d rules */ c2 = PL_fold[c1]; break; @@ -3498,8 +3560,11 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, /* FALLTHROUGH */ /* /u rules for all these. This happens to work for * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA_NO_TRIE: /* This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - case EXACTFU_TRICKYFOLD: case EXACTFU_SS: case EXACTFU: c2 = PL_fold_latin1[c1]; @@ -3549,7 +3614,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } /* returns -1 on failure, $+[0] on success */ -STATIC I32 +STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { #if PERL_VERSION < 9 && !defined(PERL_CORE) @@ -3567,7 +3632,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *scan; regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ - I32 ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput) */ @@ -3595,7 +3660,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; - SV* const oreplsv = GvSV(PL_replgv); + SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop * iteration, and are not preserved or restored by state pushes/pops @@ -3707,16 +3772,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO_SILENT; assert(0); /*NOTREACHED*/ - case EOL: /* /..$/ */ - goto seol; - case MEOL: /* /..$/m */ if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; break; + case EOL: /* /..$/ */ + /* FALL THROUGH */ case SEOL: /* /..$/s */ - seol: if (!NEXTCHR_IS_EOS && nextchr != '\n') sayNO; if (reginfo->strend - locinput > 1) @@ -4014,7 +4077,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (chars) { if (utf8_target) { - uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; } @@ -4027,7 +4090,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) while (foldlen) { if (!--chars) break; - uvc = utf8n_to_uvuni(uscan, UTF8_MAXLEN, &len, + uvc = utf8n_to_uvchr(uscan, UTF8_MAXLEN, &len, uniflags); uscan += len; foldlen -= len; @@ -4118,7 +4181,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*l, *(l+1)) != * (U8*) s) { + if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + { sayNO; } l += 2; @@ -4141,7 +4205,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)) != * (U8*) l) { + if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + { sayNO; } s += 2; @@ -4178,20 +4243,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto do_exactf; case EXACTFU_SS: /* /\x{df}/iu */ - case EXACTFU_TRICKYFOLD: /* /\x{390}/iu */ case EXACTFU: /* /abc/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = is_utf8_pat ? FOLDEQ_S1_ALREADY_FOLDED : 0; goto do_exactf; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 + patterns */ + assert(! is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: /* /abc/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; fold_utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; - case EXACTF: /* /abc/i */ + case EXACTF: /* /abc/i This node only generated for + non-utf8 patterns */ + assert(! is_utf8_pat); folder = foldEQ; fold_array = PL_fold; fold_utf8_flags = 0; @@ -4267,7 +4337,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } else { - ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln)); + ln = isWORDCHAR_LC_uvchr(ln); n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput); } } @@ -4356,7 +4426,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_UNI(nextchr, + (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)))))) { sayNO; @@ -4437,9 +4507,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(nextchr, + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), - FLAGS(scan))))) + FLAGS(scan))))) { sayNO; } @@ -4993,8 +5063,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->offs[0].end = locinput - reginfo->strbeg; if (reginfo->info_aux_eval->pos_magic) - reginfo->info_aux_eval->pos_magic->mg_len - = locinput - reginfo->strbeg; + MgBYTEPOS_set(reginfo->info_aux_eval->pos_magic, + reginfo->sv, reginfo->strbeg, + locinput - reginfo->strbeg); if (sv_yes_mark) { SV *sv_mrk = get_sv("REGMARK", 1); @@ -5032,20 +5103,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* /(??{}) */ /* if its overloaded, let the regex compiler handle * it; otherwise extract regex, or stringify */ + if (SvGMAGICAL(ret)) + ret = sv_mortalcopy(ret); if (!SvAMAGIC(ret)) { SV *sv = ret; if (SvROK(sv)) sv = SvRV(sv); if (SvTYPE(sv) == SVt_REGEXP) re_sv = (REGEXP*) sv; - else if (SvSMAGICAL(sv)) { - MAGIC *mg = mg_find(sv, PERL_MAGIC_qr); + else if (SvSMAGICAL(ret)) { + MAGIC *mg = mg_find(ret, PERL_MAGIC_qr); if (mg) re_sv = (REGEXP *) mg->mg_obj; } - /* force any magic, undef warnings here */ - if (!re_sv) { + /* force any undef warnings here */ + if (!re_sv && !SvPOK(ret) && !SvNIOK(ret)) { ret = sv_mortalcopy(ret); (void) SvPV_force_nolen(ret); } @@ -5099,17 +5172,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) pm_flags); if (!(SvFLAGS(ret) - & (SVs_TEMP | SVs_PADTMP | SVf_READONLY - | SVs_GMG))) { + & (SVs_TEMP | SVs_GMG | SVf_ROK)) + && (!SvPADTMP(ret) || SvREADONLY(ret))) { /* This isn't a first class regexp. Instead, it's caching a regexp onto an existing, Perl visible scalar. */ sv_magic(ret, MUTABLE_SV(re_sv), PERL_MAGIC_qr, 0, 0); } - /* safe to do now that any $1 etc has been - * interpolated into the new pattern string and - * compiled */ - S_regcp_restore(aTHX_ rex, runops_cp, &maxopenparen); } SAVEFREESV(re_sv); re = ReANY(re_sv); @@ -5523,10 +5592,10 @@ NULL if (reginfo->poscache_iter-- == 0) { /* initialise cache */ - const I32 size = (reginfo->poscache_maxiter + 7)/8; + const SSize_t size = (reginfo->poscache_maxiter + 7)/8; regmatch_info_aux *const aux = reginfo->info_aux; if (aux->poscache) { - if ((I32)reginfo->poscache_size < size) { + if ((SSize_t)reginfo->poscache_size < size) { Renew(aux->poscache, size, char); reginfo->poscache_size = size; } @@ -5544,7 +5613,7 @@ NULL if (reginfo->poscache_iter < 0) { /* have we already failed at this position? */ - I32 offset, mask; + SSize_t offset, mask; reginfo->poscache_iter = -1; /* stop eventual underflow */ offset = (scan->flags & 0xf) - 1 @@ -5869,7 +5938,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + "%*s CURLYM Fast bail next target=0x%"UVXf" c1=0x%"UVXf" c2=0x%"UVXf"\n", (int)(REPORT_CODE_OFF+(depth*2)),"", valid_utf8_to_uvchr((U8 *) locinput, NULL), valid_utf8_to_uvchr(ST.c1_utf8, NULL), @@ -5883,7 +5952,7 @@ NULL /* simulate B failing */ DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + "%*s CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", (int)(REPORT_CODE_OFF+(depth*2)),"", (int) nextchr, ST.c1, ST.c2) ); @@ -6824,7 +6893,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -6851,8 +6920,11 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + /* FALL THROUGH */ case EXACTFA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; + utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; case EXACTFL: @@ -6860,12 +6932,12 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, utf8_flags = FOLDEQ_UTF8_LOCALE; goto do_exactf; - case EXACTF: - utf8_flags = 0; - goto do_exactf; + case EXACTF: /* This node only generated for non-utf8 patterns */ + assert(! reginfo->is_utf8_pat); + utf8_flags = 0; + goto do_exactf; case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: case EXACTFU: utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0; @@ -7051,8 +7123,8 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_UNI(*scan, - *(scan + 1)), + ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + *(scan + 1)), classnum)))) { break; @@ -7237,15 +7309,18 @@ STATIC SV * S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit, SV** listsvp) { /* Returns the swash for the input 'node' in the regex 'prog'. - * If is true, will attempt to create the swash if not already + * If is 'true', will attempt to create the swash if not already * done. - * If is non-null, will return the swash initialization string in - * it. + * If is non-null, will return the printable contents of the + * swash. This can be used to get debugging information even before the + * swash exists, by calling this function with 'doinit' set to false, in + * which case the components that will be used to eventually create the + * swash are returned (in a printable form). * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; - SV *si = NULL; + SV *si = NULL; /* Input swash initialization string */ SV* invlist = NULL; RXi_GET_DECL(prog,progi); @@ -7281,7 +7356,7 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit /* Element [1] is reserved for the set-up swash. If already there, * return it; if not, create it and store it there */ - if (SvROK(ary[1])) { + if (ary[1] && SvROK(ary[1])) { sw = ary[1]; } else if (si && doinit) { @@ -7298,16 +7373,18 @@ S_core_regclass_swash(pTHX_ const regexp *prog, const regnode* node, bool doinit } } + /* If requested, return a printable version of what this swash matches */ if (listsvp) { SV* matches_string = newSVpvn("", 0); - /* Use the swash, if any, which has to have incorporated into it all - * possibilities */ + /* The swash should be used, if possible, to get the data, as it + * contains the resolved data. But this function can be called at + * compile-time, before everything gets resolved, in which case we + * return the currently best available information, which is the string + * that will eventually be used to do that resolving, 'si' */ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL) && (si && si != &PL_sv_undef)) { - - /* If no swash, use the input initialization string, if available */ sv_catsv(matches_string, si); } @@ -7371,14 +7448,13 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const match = TRUE; } else if (flags & ANYOF_LOCALE) { - RXp_MATCH_TAINTED_on(prog); - - if ((flags & ANYOF_LOC_FOLD) - && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) - { - match = TRUE; - } - else if (ANYOF_CLASS_TEST_ANY_SET(n)) { + if (flags & ANYOF_LOC_FOLD) { + RXp_MATCH_TAINTED_on(prog); + if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { + match = TRUE; + } + } + else if (ANYOF_POSIXL_TEST_ANY_SET(n)) { /* The data structure is arranged so bits 0, 2, 4, ... are set * if the class includes the Posix character class given by @@ -7412,8 +7488,10 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const int count = 0; int to_complement = 0; + + RXp_MATCH_TAINTED_on(prog); while (count < ANYOF_MAX) { - if (ANYOF_CLASS_TEST(n, count) + if (ANYOF_POSIXL_TEST(n, count) && to_complement ^ cBOOL(isFOO_lc(count/2, (U8) c))) { match = TRUE; @@ -7436,7 +7514,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * positive that will be resolved when the match is done again as not part * of the synthetic start class */ if (!match) { - if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + if (utf8_target && (flags & ANYOF_ABOVE_LATIN1_ALL) && c >= 256) { match = TRUE; /* Everything above 255 matches */ } else if (ANYOF_NONBITMAP(n) @@ -7479,7 +7557,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } STATIC U8 * -S_reghop3(U8 *s, I32 off, const U8* lim) +S_reghop3(U8 *s, SSize_t off, const U8* lim) { /* return the position 'off' UTF-8 characters away from 's', forward if * 'off' >= 0, backwards if negative. But don't go outside of position @@ -7514,7 +7592,7 @@ S_reghop3(U8 *s, I32 off, const U8* lim) we ifdef it out - dmq */ STATIC U8 * -S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) +S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) { dVAR; @@ -7541,7 +7619,7 @@ S_reghop4(U8 *s, I32 off, const U8* llim, const U8* rlim) #endif STATIC U8 * -S_reghopmaybe3(U8* s, I32 off, const U8* lim) +S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) { dVAR; @@ -7609,6 +7687,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) } eval_state->pos_magic = mg; eval_state->pos = mg->mg_len; + eval_state->pos_flags = mg->mg_flags; } else eval_state->pos_magic = NULL; @@ -7683,7 +7762,12 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) RXp_MATCH_COPIED_on(rex); } if (eval_state->pos_magic) + { eval_state->pos_magic->mg_len = eval_state->pos; + eval_state->pos_magic->mg_flags = + (eval_state->pos_magic->mg_flags & ~MGf_BYTES) + | (eval_state->pos_flags & MGf_BYTES); + } PL_curpm = eval_state->curpm; }