X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/045325615467f1c1f009faca5f22887c8c0724b5..33bc847050ad68bb79f1e04db9100e25017348e1:/regexec.c diff --git a/regexec.c b/regexec.c index ee98c86..75d58ce 100644 --- a/regexec.c +++ b/regexec.c @@ -83,12 +83,29 @@ #include "invlist_inline.h" #include "unicode_constants.h" -#define B_ON_NON_UTF8_LOCALE_IS_WRONG \ - "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" +static const char b_utf8_locale_required[] = + "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong." + " Assuming a UTF-8 locale"; + +#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND \ + STMT_START { \ + if (! IN_UTF8_CTYPE_LOCALE) { \ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ + b_utf8_locale_required); \ + } \ + } STMT_END -static const char utf8_locale_required[] = +static const char sets_utf8_locale_required[] = "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; +#define CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(n) \ + STMT_START { \ + if (! IN_UTF8_CTYPE_LOCALE && ANYOFL_UTF8_LOCALE_REQD(FLAGS(n))) { \ + Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), \ + sets_utf8_locale_required); \ + } \ + } STMT_END + #ifdef DEBUGGING /* At least one required character in the target string is expressible only in * UTF-8. */ @@ -101,8 +118,6 @@ static const char non_utf8_target_but_utf8_required[] goto target; \ } STMT_END -#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) - #ifndef STATIC #define STATIC static #endif @@ -194,7 +209,7 @@ static const char non_utf8_target_but_utf8_required[] rn = (rn->flags == 0) ? 
NEXTOPER(NEXTOPER(rn)) : rn + ARG(rn); \ else rn += NEXT_OFF(rn); \ } \ -} STMT_END +} STMT_END #define SLAB_FIRST(s) (&(s)->states[0]) #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) @@ -218,7 +233,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) const UV total_elems = paren_elems_to_push + REGCP_OTHER_ELEMS; const UV elems_shifted = total_elems << SAVE_TIGHT_SHIFT; I32 p; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPUSH; @@ -235,7 +250,7 @@ S_regcppush(pTHX_ const regexp *rex, I32 parenfloor, U32 maxopenparen _pDEPTH) (long)parenfloor); SSGROW(total_elems + REGCP_FRAME_ELEMS); - + DEBUG_BUFFERS_r( if ((int)maxopenparen > (int)parenfloor) Perl_re_exec_indentf( aTHX_ @@ -328,7 +343,7 @@ S_regcppop(pTHX_ regexp *rex, U32 *maxopenparen_p _pDEPTH) { UV i; U32 paren; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGCPPOP; @@ -410,10 +425,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ -#ifndef PERL_IN_XSUB_RE - -bool -Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) +STATIC bool +S_isFOO_lc(pTHX_ const U8 classnum, const U8 character) { /* Returns a boolean as to whether or not 'character' is a member of the * Posix character class given by 'classnum' that should be equivalent to a @@ -422,7 +435,7 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) * Ideally this could be replaced by a just an array of function pointers * to the C library functions that implement the macros this calls. * However, to compile, the precise function signatures are required, and - * these may vary from platform to to platform. To avoid having to figure + * these may vary from platform to platform. 
To avoid having to figure * out what those all are on each platform, I (khw) am using this method, * which adds an extra layer of function call overhead (unless the C * optimizer strips it away). But we don't particularly care about @@ -453,8 +466,6 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) return FALSE; } -#endif - PERL_STATIC_INLINE I32 S_foldEQ_latin1_s2_folded(const char *s1, const char *s2, I32 len) { @@ -496,7 +507,6 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character, const U8* e) * rules, ignoring any locale. So use the Unicode function if this class * requires an inversion list, and use the Unicode macro otherwise. */ - dVAR; PERL_ARGS_ASSERT_ISFOO_UTF8_LC; @@ -579,7 +589,7 @@ S_find_span_end(U8 * s, const U8 * send, const U8 span_byte) span_word |= span_word << 4; /* That reduces the problem to what this function solves */ - return s + _variant_byte_number(span_word); + return s + variant_byte_number(span_word); #endif @@ -657,7 +667,7 @@ S_find_next_masked(U8 * s, const U8 * send, const U8 byte, const U8 mask) masked &= PERL_VARIANTS_WORD_MASK; /* This reduces the problem to that solved by this function */ - s += _variant_byte_number(masked); + s += variant_byte_number(masked); return s; } while (s + PERL_WORDSIZE <= send); @@ -723,7 +733,7 @@ S_find_span_end_mask(U8 * s, const U8 * send, const U8 span_byte, const U8 mask) masked |= masked << 1; masked |= masked << 2; masked |= masked << 4; - return s + _variant_byte_number(masked); + return s + variant_byte_number(masked); #endif @@ -859,7 +869,7 @@ Perl_re_intuit_start(pTHX_ RXi_GET_DECL(prog,progi); regmatch_info reginfo_buf; /* create some info to pass to find_byclass */ regmatch_info *const reginfo = ®info_buf; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_RE_INTUIT_START; PERL_UNUSED_ARG(flags); @@ -993,7 +1003,7 @@ Perl_re_intuit_start(pTHX_ /* Substring at constant offset from beg-of-str... 
*/ SSize_t slen = SvCUR(check); char *s = HOP3c(strpos, prog->check_offset_min, strend); - + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " Looking for check substr at fixed offset %" IVdf "...\n", (IV)prog->check_offset_min)); @@ -1039,7 +1049,7 @@ Perl_re_intuit_start(pTHX_ #endif restart: - + /* This is the (re)entry point of the main loop in this function. * The goal of this loop is to: * 1) find the "check" substring in the region rx_origin..strend @@ -1080,7 +1090,7 @@ Perl_re_intuit_start(pTHX_ (IV)end_shift, (IV)prog->check_end_shift); }); - + end_point = HOPBACK3(strend, end_shift, rx_origin); if (!end_point) goto fail_finish; @@ -1398,7 +1408,7 @@ Perl_re_intuit_start(pTHX_ * On the one hand you'd expect rare substrings to appear less * often than \n's. On the other hand, searching for \n means * we're effectively flipping between check_substr and "\n" on each - * iteration as the current "rarest" string candidate, which + * iteration as the current "rarest" candidate string, which * means for example that we'll quickly reject the whole string if * hasn't got a \n, rather than trying every substr position * first @@ -1472,10 +1482,10 @@ Perl_re_intuit_start(pTHX_ const U8* const str = (U8*)STRING(progi->regstclass); /* XXX this value could be pre-computed */ - const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT + const SSize_t cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? (reginfo->is_utf8_pat - ? utf8_distance(str + STR_LEN(progi->regstclass), str) - : STR_LEN(progi->regstclass)) + ? 
(SSize_t)utf8_distance(str + STR_LEN(progi->regstclass), str) + : (SSize_t)STR_LEN(progi->regstclass)) : 1); char * endpos; char *s; @@ -1512,9 +1522,9 @@ Perl_re_intuit_start(pTHX_ rx_max_float = HOP3c(check_at, -start_shift, strbeg); endpos = HOP3clim(rx_max_float, cl_l, strend); } - else + else endpos= strend; - + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ " looking for class: start_shift: %" IVdf " check_at: %" IVdf " rx_origin: %" IVdf " endpos: %" IVdf "\n", @@ -1783,56 +1793,74 @@ STMT_START { dump_exec_pos(li,s,(reginfo->strend),(reginfo->strbeg), \ startpos, doutf8, depth) -#define REXEC_FBC_SCAN(UTF8, CODE) \ +#define REXEC_FBC_UTF8_SCAN(CODE) \ + STMT_START { \ + while (s < strend) { \ + CODE \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ + } \ + } STMT_END + +#define REXEC_FBC_NON_UTF8_SCAN(CODE) \ STMT_START { \ while (s < strend) { \ CODE \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s++; \ } \ } STMT_END -#define REXEC_FBC_CLASS_SCAN(UTF8, COND) \ +#define REXEC_FBC_UTF8_CLASS_SCAN(COND) \ STMT_START { \ while (s < strend) { \ - REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ + REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ } \ } STMT_END -#define REXEC_FBC_CLASS_SCAN_GUTS(UTF8, COND) \ +#define REXEC_FBC_NON_UTF8_CLASS_SCAN(COND) \ + STMT_START { \ + while (s < strend) { \ + REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ + } \ + } STMT_END + +#define REXEC_FBC_UTF8_CLASS_SCAN_GUTS(COND) \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += ((UTF8) ? UTF8SKIP(s) : 1); \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ previous_occurrence_end = s; \ } \ else { \ - s += ((UTF8) ? 
UTF8SKIP(s) : 1); \ + s += UTF8SKIP(s); \ } -#define REXEC_FBC_CSCAN(CONDUTF8,COND) \ - if (utf8_target) { \ - REXEC_FBC_CLASS_SCAN(1, CONDUTF8); \ +#define REXEC_FBC_NON_UTF8_CLASS_SCAN_GUTS(COND) \ + if (COND) { \ + FBC_CHECK_AND_TRY \ + s++; \ + previous_occurrence_end = s; \ } \ else { \ - REXEC_FBC_CLASS_SCAN(0, COND); \ + s++; \ } /* We keep track of where the next character should start after an occurrence * of the one we're looking for. Knowing that, we can see right away if the * next occurrence is adjacent to the previous. When 'doevery' is FALSE, we * don't accept the 2nd and succeeding adjacent occurrences */ -#define FBC_CHECK_AND_TRY \ - if ( ( doevery \ - || s != previous_occurrence_end) \ - && (reginfo->intuit || regtry(reginfo, &s))) \ - { \ - goto got_it; \ +#define FBC_CHECK_AND_TRY \ + if ( ( doevery \ + || s != previous_occurrence_end) \ + && ( reginfo->intuit \ + || (s <= reginfo->strend && regtry(reginfo, &s)))) \ + { \ + goto got_it; \ } -/* This differs from the above macros in that it calls a function which returns - * the next occurrence of the thing being looked for in 's'; and 'strend' if - * there is no such occurrence. */ -#define REXEC_FBC_FIND_NEXT_SCAN(UTF8, f) \ +/* These differ from the above macros in that they call a function which + * returns the next occurrence of the thing being looked for in 's'; and + * 'strend' if there is no such occurrence. */ +#define REXEC_FBC_UTF8_FIND_NEXT_SCAN(f) \ while (s < strend) { \ s = (f); \ if (s >= strend) { \ @@ -1840,7 +1868,19 @@ STMT_START { } \ \ FBC_CHECK_AND_TRY \ - s += (UTF8) ? 
UTF8SKIP(s) : 1; \ + s += UTF8SKIP(s); \ + previous_occurrence_end = s; \ + } + +#define REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN(f) \ + while (s < strend) { \ + s = (f); \ + if (s >= strend) { \ + break; \ + } \ + \ + FBC_CHECK_AND_TRY \ + s++; \ previous_occurrence_end = s; \ } @@ -1858,7 +1898,7 @@ STMT_START { \ if (COND) { \ FBC_CHECK_AND_TRY \ - s += UTF8SKIP(s); \ + s += UTF8_SAFE_SKIP(s, reginfo->strend); \ previous_occurrence_end = s; \ } \ else { \ @@ -1866,16 +1906,16 @@ STMT_START { } \ } -/* The three macros below are slightly different versions of the same logic. +/* The four macros below are slightly different versions of the same logic. * * The first is for /a and /aa when the target string is UTF-8. This can only - * match ascii, but it must advance based on UTF-8. The other two handle the - * non-UTF-8 and the more generic UTF-8 cases. In all three, we are looking - * for the boundary (or non-boundary) between a word and non-word character. - * The utf8 and non-utf8 cases have the same logic, but the details must be - * different. Find the "wordness" of the character just prior to this one, and - * compare it with the wordness of this one. If they differ, we have a - * boundary. At the beginning of the string, pretend that the previous + * match ascii, but it must advance based on UTF-8. The other three handle + * the non-UTF-8 and the more generic UTF-8 cases. In all four, we are + * looking for the boundary (or non-boundary) between a word and non-word + * character. The utf8 and non-utf8 cases have the same logic, but the details + * must be different. Find the "wordness" of the character just prior to this + * one, and compare it with the wordness of this one. If they differ, we have + * a boundary. At the beginning of the string, pretend that the previous * character was a new-line. 
* * All these macros uncleanly have side-effects with each other and outside @@ -1893,8 +1933,8 @@ STMT_START { * see if this tentative match actually works, and if so, to quit the loop * here. And vice-versa if we are looking for a non-boundary. * - * 'tmp' below in the next three macros in the REXEC_FBC_SCAN and - * REXEC_FBC_SCAN loops is a loop invariant, a bool giving the return of + * 'tmp' below in the next four macros in the REXEC_FBC_UTF8_SCAN and + * REXEC_FBC_UTF8_SCAN loops is a loop invariant, a bool giving the return of * TEST_NON_UTF8(s-1). To see this, note that that's what it is defined to be * at entry to the loop, and to get to the IF_FAIL branch, tmp must equal * TEST_NON_UTF8(s), and in the opposite branch, IF_SUCCESS, tmp is that @@ -1905,7 +1945,7 @@ STMT_START { #define FBC_UTF8_A(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ tmp = TEST_NON_UTF8(tmp); \ - REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ + REXEC_FBC_UTF8_SCAN( /* advances s while s < strend */ \ if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ tmp = !tmp; \ IF_SUCCESS; /* Is a boundary if values for s-1 and s differ */ \ @@ -1917,7 +1957,8 @@ STMT_START { /* Like FBC_UTF8_A, but TEST_UV is a macro which takes a UV as its input, and * TEST_UTF8 is a macro that for the same input code points returns identically - * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead */ + * to TEST_UV, but takes a pointer to a UTF-8 encoded string instead (and an + * end pointer as well) */ #define FBC_UTF8(TEST_UV, TEST_UTF8, IF_SUCCESS, IF_FAIL) \ if (s == reginfo->strbeg) { \ tmp = '\n'; \ @@ -1928,7 +1969,7 @@ STMT_START { 0, UTF8_ALLOW_DEFAULT); \ } \ tmp = TEST_UV(tmp); \ - REXEC_FBC_SCAN(1, /* 1=>is-utf8; advances s while s < strend */ \ + REXEC_FBC_UTF8_SCAN(/* advances s while s < strend */ \ if (tmp == ! 
(TEST_UTF8((U8 *) s, (U8 *) reginfo->strend))) { \ tmp = !tmp; \ IF_SUCCESS; \ @@ -1938,32 +1979,14 @@ STMT_START { } \ ); -/* Like the above two macros. UTF8_CODE is the complete code for handling - * UTF-8. Common to the BOUND and NBOUND cases, set-up by the FBC_BOUND, etc - * macros below */ -#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ - if (utf8_target) { \ - UTF8_CODE \ - } \ - else { /* Not utf8 */ \ - tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ - tmp = TEST_NON_UTF8(tmp); \ - REXEC_FBC_SCAN(0, /* 0=>not-utf8; advances s while s < strend */ \ - if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ - IF_SUCCESS; \ - tmp = !tmp; \ - } \ - else { \ - IF_FAIL; \ - } \ - ); \ - } \ +/* Like the above two macros, for a UTF-8 target string. UTF8_CODE is the + * complete code for handling UTF-8. Common to the BOUND and NBOUND cases, + * set-up by the FBC_BOUND, etc macros below */ +#define FBC_BOUND_COMMON_UTF8(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + UTF8_CODE; \ /* Here, things have been set up by the previous code so that tmp is the \ - * return of TEST_NON_UTF(s-1) or TEST_UTF8(s-1) (depending on the \ - * utf8ness of the target). We also have to check if this matches against \ - * the EOS, which we treat as a \n (which is the same value in both UTF-8 \ - * or non-UTF8, so can use the non-utf8 test condition even for a UTF-8 \ - * string */ \ + * return of TEST_NON_UTF8(s-1). We also have to check if this matches \ + * against the EOS, which we treat as a \n */ \ if (tmp == ! TEST_NON_UTF8('\n')) { \ IF_SUCCESS; \ } \ @@ -1971,10 +1994,36 @@ STMT_START { IF_FAIL; \ } +/* Same as the macro above, but the target isn't UTF-8 */ +#define FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != reginfo->strbeg) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_NON_UTF8_SCAN(/* advances s while s < strend */ \ + if (tmp == ! 
TEST_NON_UTF8(UCHARAT(s))) { \ + IF_SUCCESS; \ + tmp = !tmp; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + /* Here, things have been set up by the previous code so that tmp is \ + * the return of TEST_NON_UTF8(s-1). We also have to check if this \ + * matches against the EOS, which we treat as a \n */ \ + if (tmp == ! TEST_NON_UTF8('\n')) { \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } + /* This is the macro to use when we want to see if something that looks like it - * could match, actually does, and if so exits the loop */ -#define REXEC_FBC_TRYIT \ - if ((reginfo->intuit || regtry(reginfo, &s))) \ + * could match, actually does, and if so exits the loop. It needs to be used + * only for bounds checking macros, as it allows for matching beyond the end of + * string (which should be zero length without having to look at the string + * contents) */ +#define REXEC_FBC_TRYIT \ + if (reginfo->intuit || (s <= reginfo->strend && regtry(reginfo, &s))) \ goto got_it /* The only difference between the BOUND and NBOUND cases is that @@ -1985,26 +2034,39 @@ STMT_START { * The TEST_FOO parameters are for operating on different forms of input, but * all should be ones that return identically for the same underlying code * points */ -#define FBC_BOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ - FBC_BOUND_COMMON( \ - FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ + +#define FBC_BOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON_UTF8( \ + FBC_UTF8(TEST_UV, TEST_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) -#define FBC_BOUND_A(TEST_NON_UTF8) \ - FBC_BOUND_COMMON( \ - FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), \ - TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) +#define FBC_BOUND_NON_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_UTF8( \ + FBC_UTF8_A(TEST_NON_UTF8, REXEC_FBC_TRYIT, 
PLACEHOLDER),\ + TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_A_NON_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) -#define FBC_NBOUND(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ - FBC_BOUND_COMMON( \ - FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ - TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) +#define FBC_NBOUND_UTF8(TEST_NON_UTF8, TEST_UV, TEST_UTF8) \ + FBC_BOUND_COMMON_UTF8( \ + FBC_UTF8(TEST_UV, TEST_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ + TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) -#define FBC_NBOUND_A(TEST_NON_UTF8) \ - FBC_BOUND_COMMON( \ - FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ +#define FBC_NBOUND_NON_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_A_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_UTF8( \ + FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) +#define FBC_NBOUND_A_NON_UTF8(TEST_NON_UTF8) \ + FBC_BOUND_COMMON_NON_UTF8(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + #ifdef DEBUGGING static IV S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { @@ -2089,10 +2151,9 @@ S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { /* annoyingly all the vars in this routine have different names from their counterparts in regmatch. /grrr */ STATIC char * -S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, +S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, const char *strend, regmatch_info *reginfo) { - dVAR; /* TRUE if x+ need not match at just the 1st pos of run of x's */ const I32 doevery = (prog->intflags & PREGf_SKIP) == 0; @@ -2127,83 +2188,174 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, PERL_ARGS_ASSERT_FIND_BYCLASS; - /* We know what class it must start with. */ - switch (OP(c)) { - case ANYOFPOSIXL: - case ANYOFL: + /* We know what class it must start with. 
The case statements below have + * encoded the OP, and the UTF8ness of the target ('t8' for is UTF-8; 'tb' + * for it isn't; 'b' stands for byte), and the UTF8ness of the pattern + * ('p8' and 'pb'. */ + switch (with_tp_UTF8ness(OP(c), utf8_target, is_utf8_pat)) { + + case ANYOFPOSIXL_t8_pb: + case ANYOFPOSIXL_t8_p8: + case ANYOFL_t8_pb: + case ANYOFL_t8_p8: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); - if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); - } + /* FALLTHROUGH */ + + case ANYOFD_t8_pb: + case ANYOFD_t8_p8: + case ANYOF_t8_pb: + case ANYOF_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN( + reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); + break; + + case ANYOFPOSIXL_tb_pb: + case ANYOFPOSIXL_tb_p8: + case ANYOFL_tb_pb: + case ANYOFL_tb_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(c); /* FALLTHROUGH */ - case ANYOFD: - case ANYOF: - if (utf8_target) { - REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ - reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); - } - else if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { + + case ANYOFD_tb_pb: + case ANYOFD_tb_p8: + case ANYOF_tb_pb: + case ANYOF_tb_p8: + if (ANYOF_FLAGS(c) & ~ ANYOF_MATCHES_ALL_ABOVE_BITMAP) { /* We know that s is in the bitmap range since the target isn't * UTF-8, so what happens for out-of-range values is not relevant, * so exclude that from the flags */ - REXEC_FBC_CLASS_SCAN(0, reginclass(prog,c, (U8*)s, (U8*)s+1, 0)); + REXEC_FBC_NON_UTF8_CLASS_SCAN(reginclass(prog,c, (U8*)s, (U8*)s+1, + 0)); } else { - REXEC_FBC_CLASS_SCAN(0, ANYOF_BITMAP_TEST(c, *((U8*)s))); + REXEC_FBC_NON_UTF8_CLASS_SCAN(ANYOF_BITMAP_TEST(c, *((U8*)s))); } break; - case ANYOFM: /* ARG() is the base byte; FLAGS() the mask byte */ - /* UTF-8ness doesn't matter, so use 0 */ - REXEC_FBC_FIND_NEXT_SCAN(0, - (char *) find_next_masked((U8 *) s, (U8 *) 
strend, - (U8) ARG(c), FLAGS(c))); + case ANYOFM_tb_pb: /* ARG() is the base byte; FLAGS() the mask byte */ + case ANYOFM_tb_p8: + REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( + (char *) find_next_masked((U8 *) s, (U8 *) strend, + (U8) ARG(c), FLAGS(c))); break; - case NANYOFM: - REXEC_FBC_FIND_NEXT_SCAN(0, - (char *) find_span_end_mask((U8 *) s, (U8 *) strend, - (U8) ARG(c), FLAGS(c))); + case ANYOFM_t8_pb: + case ANYOFM_t8_p8: + /* UTF-8ness doesn't matter because only matches UTF-8 invariants. But + * we do anyway for performance reasons, as otherwise we would have to + * examine all the continuation characters */ + REXEC_FBC_UTF8_FIND_NEXT_SCAN( + (char *) find_next_masked((U8 *) s, (U8 *) strend, + (U8) ARG(c), FLAGS(c))); break; - case ANYOFH: - if (utf8_target) { /* Can't possibly match a non-UTF-8 target */ + case NANYOFM_tb_pb: + case NANYOFM_tb_p8: + REXEC_FBC_NON_UTF8_FIND_NEXT_SCAN( + (char *) find_span_end_mask((U8 *) s, (U8 *) strend, + (U8) ARG(c), FLAGS(c))); + break; + + case NANYOFM_t8_pb: + case NANYOFM_t8_p8: /* UTF-8ness does matter because can match UTF-8 + variants. */ + REXEC_FBC_UTF8_FIND_NEXT_SCAN( + (char *) find_span_end_mask((U8 *) s, (U8 *) strend, + (U8) ARG(c), FLAGS(c))); + break; + + /* These nodes all require at least one code point to be in UTF-8 to + * match */ + case ANYOFH_tb_pb: + case ANYOFH_tb_p8: + case ANYOFHb_tb_pb: + case ANYOFHb_tb_p8: + case ANYOFHr_tb_pb: + case ANYOFHr_tb_p8: + case ANYOFHs_tb_pb: + case ANYOFHs_tb_p8: + case EXACTFLU8_tb_pb: + case EXACTFLU8_tb_p8: + case EXACTFU_REQ8_tb_pb: + case EXACTFU_REQ8_tb_p8: + break; + + case ANYOFH_t8_pb: + case ANYOFH_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN( + ( (U8) NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */))); + break; + + case ANYOFHb_t8_pb: + case ANYOFHb_t8_p8: + { + /* We know what the first byte of any matched string should be. 
*/ U8 first_byte = FLAGS(c); - if (first_byte) { /* We know what the first byte of any matched - string should be */ - REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, - reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); - } - else { - REXEC_FBC_CLASS_SCAN(TRUE, - reginclass(prog, c, (U8*)s, (U8*) strend, utf8_target)); - } + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, + reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */)); } break; - case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ - assert(! is_utf8_pat); - /* FALLTHROUGH */ - case EXACTFAA: - if (is_utf8_pat) { - utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII - |FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE; - goto do_exactf_utf8; - } - else if (utf8_target) { + case ANYOFHr_t8_pb: + case ANYOFHr_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN( + ( inRANGE(NATIVE_UTF8_TO_I8(*s), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(c))) + && reginclass(prog, c, (U8*)s, (U8*) strend, + 1 /* is utf8 */))); + break; - /* Here, and elsewhere in this file, the reason we can't consider a - * non-UTF-8 pattern already folded in the presence of a UTF-8 - * target is because any MICRO SIGN in the pattern won't be folded. 
- * Since the fold of the MICRO SIGN requires UTF-8 to represent, we - * can consider a non-UTF-8 pattern folded when matching a - * non-UTF-8 target */ - utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; - goto do_exactf_utf8; + case ANYOFHs_t8_pb: + case ANYOFHs_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN( + ( strend -s >= FLAGS(c) + && memEQ(s, ((struct regnode_anyofhs *) c)->string, FLAGS(c)) + && reginclass(prog, c, (U8*)s, (U8*) strend, 1 /* is utf8 */))); + break; + + case ANYOFR_tb_pb: + case ANYOFR_tb_p8: + REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, + ANYOFRbase(c), ANYOFRdelta(c))); + break; + + case ANYOFR_t8_pb: + case ANYOFR_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN( + ( NATIVE_UTF8_TO_I8(*s) >= ANYOF_FLAGS(c) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c)))); + break; + + case ANYOFRb_tb_pb: + case ANYOFRb_tb_p8: + REXEC_FBC_NON_UTF8_CLASS_SCAN(withinCOUNT((U8) *s, + ANYOFRbase(c), ANYOFRdelta(c))); + break; + + case ANYOFRb_t8_pb: + case ANYOFRb_t8_p8: + { /* We know what the first byte of any matched string should be */ + U8 first_byte = FLAGS(c); + + REXEC_FBC_FIND_NEXT_UTF8_BYTE_SCAN(first_byte, + withinCOUNT(utf8_to_uvchr_buf((U8 *) s, + (U8 *) strend, + NULL), + ANYOFRbase(c), ANYOFRdelta(c))); } + break; + + case EXACTFAA_tb_pb: /* Latin1 folds are not affected by /a, except it excludes the sharp s, * which these functions don't handle anyway */ @@ -2211,80 +2363,50 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, folder = foldEQ_latin1_s2_folded; goto do_exactf_non_utf8; - case EXACTF: /* This node only generated for non-utf8 patterns */ - assert(! 
is_utf8_pat); - if (utf8_target) { - goto do_exactf_utf8; - } + case EXACTF_tb_pb: fold_array = PL_fold; folder = foldEQ; goto do_exactf_non_utf8; - case EXACTFL: + case EXACTFL_tb_pb: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) { + + if (IN_UTF8_CTYPE_LOCALE) { utf8_fold_flags = FOLDEQ_LOCALE; goto do_exactf_utf8; } + fold_array = PL_fold_locale; folder = foldEQ_locale; goto do_exactf_non_utf8; - case EXACTFUP: /* Problematic even though pattern isn't UTF-8. Use - full functionality normally not done except for - UTF-8 */ - assert(! is_utf8_pat); - goto do_exactf_utf8; - - case EXACTFLU8: - if (! utf8_target) { /* All code points in this node require - UTF-8 to express. */ - break; - } - utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED - | FOLDEQ_S2_FOLDS_SANE; - goto do_exactf_utf8; - - case EXACTFU_ONLY8: - if (! utf8_target) { - break; - } - assert(is_utf8_pat); - utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; - goto do_exactf_utf8; - - case EXACTFU: - if (is_utf8_pat || utf8_target) { - utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; - goto do_exactf_utf8; - } - - /* Any 'ss' in the pattern should have been replaced by regcomp, - * so we don't have to worry here about this single special case - * in the Latin1 range */ + case EXACTFU_tb_pb: + /* Any 'ss' in the pattern should have been replaced by regcomp, so we + * don't have to worry here about this single special case in the + * Latin1 range */ fold_array = PL_fold_latin1; folder = foldEQ_latin1_s2_folded; /* FALLTHROUGH */ - do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there - are no glitches with fold-length differences - between the target string and pattern */ + do_exactf_non_utf8: /* Neither pattern nor string are UTF8, and there + are no glitches with fold-length differences + between the target string and pattern */ - /* The idea in the non-utf8 EXACTF* cases is to first find the - * first character of the EXACTF* node and then, 
if necessary, + /* The idea in the non-utf8 EXACTF* cases is to first find the first + * character of the EXACTF* node and then, if necessary, * case-insensitively compare the full text of the node. c1 is the * first character. c2 is its fold. This logic will not work for - * Unicode semantics and the german sharp ss, which hence should - * not be compiled into a node that gets here. */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ - - /* We know that we have to match at least 'ln' bytes (which is the - * same as characters, since not utf8). If we have to match 3 - * characters, and there are only 2 availabe, we know without - * trying that it will fail; so don't start a match past the - * required minimum number from the far end */ + * Unicode semantics and the german sharp ss, which hence should not be + * compiled into a node that gets here. */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ + + /* We know that we have to match at least 'ln' bytes (which is the same + * as characters, since not utf8). If we have to match 3 characters, + * and there are only 2 availabe, we know without trying that it will + * fail; so don't start a match past the required minimum number from + * the far end */ e = HOP3c(strend, -((SSize_t)ln), s); if (e < s) break; @@ -2346,486 +2468,647 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } break; - do_exactf_utf8: - { - unsigned expansion; - - /* If one of the operands is in utf8, we can't use the simpler folding - * above, due to the fact that many different characters can have the - * same fold, or portion of a fold, or different- length fold */ - pat_string = STRING(c); - ln = STR_LEN(c); /* length to match in octets/bytes */ - pat_end = pat_string + ln; - lnc = is_utf8_pat /* length to match in characters */ - ? 
utf8_length((U8 *) pat_string, (U8 *) pat_end) - : ln; - - /* We have 'lnc' characters to match in the pattern, but because of - * multi-character folding, each character in the target can match - * up to 3 characters (Unicode guarantees it will never exceed - * this) if it is utf8-encoded; and up to 2 if not (based on the - * fact that the Latin 1 folds are already determined, and the - * only multi-char fold in that range is the sharp-s folding to - * 'ss'. Thus, a pattern character can match as little as 1/3 of a - * string character. Adjust lnc accordingly, rounding up, so that - * if we need to match at least 4+1/3 chars, that really is 5. */ - expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; - lnc = (lnc + expansion - 1) / expansion; - - /* As in the non-UTF8 case, if we have to match 3 characters, and - * only 2 are left, it's guaranteed to fail, so don't start a - * match that would require us to go beyond the end of the string - */ - e = HOP3c(strend, -((SSize_t)lnc), s); - - /* XXX Note that we could recalculate e to stop the loop earlier, - * as the worst case expansion above will rarely be met, and as we - * go along we would usually find that e moves further to the left. - * This would happen only after we reached the point in the loop - * where if there were no expansion we should fail. Unclear if - * worth the expense */ - - while (s <= e) { - char *my_strend= (char *)strend; - if (foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, - pat_string, NULL, ln, is_utf8_pat, utf8_fold_flags) - && (reginfo->intuit || regtry(reginfo, &s)) ) - { - goto got_it; - } - s += (utf8_target) ? UTF8SKIP(s) : 1; - } - break; - } + case EXACTFAA_tb_p8: + case EXACTFAA_t8_p8: + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII + |FOLDEQ_S2_ALREADY_FOLDED + |FOLDEQ_S2_FOLDS_SANE; + goto do_exactf_utf8; - case BOUNDL: - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (FLAGS(c) != TRADITIONAL_BOUND) { - if (! 
IN_UTF8_CTYPE_LOCALE) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), - B_ON_NON_UTF8_LOCALE_IS_WRONG); - } - goto do_boundu; - } + case EXACTFAA_NO_TRIE_tb_pb: + case EXACTFAA_NO_TRIE_t8_pb: + case EXACTFAA_t8_pb: - FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe); - break; + /* Here, and elsewhere in this file, the reason we can't consider a + * non-UTF-8 pattern already folded in the presence of a UTF-8 target + * is because any MICRO SIGN in the pattern won't be folded. Since the + * fold of the MICRO SIGN requires UTF-8 to represent, we can consider + * a non-UTF-8 pattern folded when matching a non-UTF-8 target */ + utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; + goto do_exactf_utf8; - case NBOUNDL: + case EXACTFL_tb_p8: + case EXACTFL_t8_pb: + case EXACTFL_t8_p8: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if (FLAGS(c) != TRADITIONAL_BOUND) { - if (! IN_UTF8_CTYPE_LOCALE) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), - B_ON_NON_UTF8_LOCALE_IS_WRONG); + utf8_fold_flags = FOLDEQ_LOCALE; + goto do_exactf_utf8; + + case EXACTFLU8_t8_pb: + case EXACTFLU8_t8_p8: + utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED + | FOLDEQ_S2_FOLDS_SANE; + goto do_exactf_utf8; + + case EXACTFU_REQ8_t8_p8: + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + goto do_exactf_utf8; + + case EXACTFU_tb_p8: + case EXACTFU_t8_pb: + case EXACTFU_t8_p8: + utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED; + goto do_exactf_utf8; + + /* The following are problematic even though pattern isn't UTF-8. Use + * full functionality normally not done except for UTF-8. 
*/ + case EXACTF_t8_pb: + case EXACTFUP_tb_pb: + case EXACTFUP_t8_pb: + + do_exactf_utf8: + { + unsigned expansion; + + /* If one of the operands is in utf8, we can't use the simpler + * folding above, due to the fact that many different characters + * can have the same fold, or portion of a fold, or different- + * length fold */ + pat_string = STRINGs(c); + ln = STR_LENs(c); /* length to match in octets/bytes */ + pat_end = pat_string + ln; + lnc = is_utf8_pat /* length to match in characters */ + ? utf8_length((U8 *) pat_string, (U8 *) pat_end) + : ln; + + /* We have 'lnc' characters to match in the pattern, but because of + * multi-character folding, each character in the target can match + * up to 3 characters (Unicode guarantees it will never exceed + * this) if it is utf8-encoded; and up to 2 if not (based on the + * fact that the Latin 1 folds are already determined, and the only + * multi-char fold in that range is the sharp-s folding to 'ss'. + * Thus, a pattern character can match as little as 1/3 of a string + * character. Adjust lnc accordingly, rounding up, so that if we + * need to match at least 4+1/3 chars, that really is 5. */ + expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2; + lnc = (lnc + expansion - 1) / expansion; + + /* As in the non-UTF8 case, if we have to match 3 characters, and + * only 2 are left, it's guaranteed to fail, so don't start a match + * that would require us to go beyond the end of the string */ + e = HOP3c(strend, -((SSize_t)lnc), s); + + /* XXX Note that we could recalculate e to stop the loop earlier, + * as the worst case expansion above will rarely be met, and as we + * go along we would usually find that e moves further to the left. + * This would happen only after we reached the point in the loop + * where if there were no expansion we should fail. 
Unclear if + * worth the expense */ + + while (s <= e) { + char *my_strend= (char *)strend; + if ( foldEQ_utf8_flags(s, &my_strend, 0, utf8_target, + pat_string, NULL, ln, is_utf8_pat, + utf8_fold_flags) + && (reginfo->intuit || regtry(reginfo, &s)) ) + { + goto got_it; + } + s += (utf8_target) ? UTF8_SAFE_SKIP(s, reginfo->strend) : 1; } - goto do_nboundu; } - - FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8_safe); break; - case BOUND: /* regcomp.c makes sure that this only has the traditional \b - meaning */ + case BOUNDA_tb_pb: + case BOUNDA_tb_p8: + case BOUND_tb_pb: /* /d without utf8 target is /a */ + case BOUND_tb_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. */ assert(FLAGS(c) == TRADITIONAL_BOUND); - FBC_BOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe); + FBC_BOUND_A_NON_UTF8(isWORDCHAR_A); break; - case BOUNDA: /* regcomp.c makes sure that this only has the traditional \b - meaning */ + case BOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ + case BOUNDA_t8_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. */ assert(FLAGS(c) == TRADITIONAL_BOUND); - FBC_BOUND_A(isWORDCHAR_A); + FBC_BOUND_A_UTF8(isWORDCHAR_A); break; - case NBOUND: /* regcomp.c makes sure that this only has the traditional \b - meaning */ + case NBOUNDA_tb_pb: + case NBOUNDA_tb_p8: + case NBOUND_tb_pb: /* /d without utf8 target is /a */ + case NBOUND_tb_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. */ assert(FLAGS(c) == TRADITIONAL_BOUND); - FBC_NBOUND(isWORDCHAR, isWORDCHAR_uni, isWORDCHAR_utf8_safe); + FBC_NBOUND_A_NON_UTF8(isWORDCHAR_A); break; - case NBOUNDA: /* regcomp.c makes sure that this only has the traditional \b - meaning */ + case NBOUNDA_t8_pb: /* What /a matches is same under UTF-8 */ + case NBOUNDA_t8_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. 
*/ assert(FLAGS(c) == TRADITIONAL_BOUND); - FBC_NBOUND_A(isWORDCHAR_A); + FBC_NBOUND_A_UTF8(isWORDCHAR_A); break; - case NBOUNDU: + case NBOUNDU_tb_pb: + case NBOUNDU_tb_p8: if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { - FBC_NBOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); + FBC_NBOUND_NON_UTF8(isWORDCHAR_L1); break; } - do_nboundu: + to_complement = 1; + goto do_boundu_non_utf8; + + case NBOUNDL_tb_pb: + case NBOUNDL_tb_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) == TRADITIONAL_BOUND) { + FBC_NBOUND_NON_UTF8(isWORDCHAR_LC); + break; + } + + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; to_complement = 1; - /* FALLTHROUGH */ + goto do_boundu_non_utf8; - case BOUNDU: - do_boundu: - switch((bound_type) FLAGS(c)) { - case TRADITIONAL_BOUND: - FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); - break; - case GCB_BOUND: - if (s == reginfo->strbeg) { - if (reginfo->intuit || regtry(reginfo, &s)) - { - goto got_it; - } + case BOUNDL_tb_pb: + case BOUNDL_tb_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) == TRADITIONAL_BOUND) { + FBC_BOUND_NON_UTF8(isWORDCHAR_LC); + break; + } - /* Didn't match. Try at the next position (if there is one) */ - s += (utf8_target) ? UTF8SKIP(s) : 1; - if (UNLIKELY(s >= reginfo->strend)) { - break; - } - } + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; - if (utf8_target) { - GCB_enum before = getGCB_VAL_UTF8( - reghop3((U8*)s, -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); - while (s < strend) { - GCB_enum after = getGCB_VAL_UTF8((U8*) s, - (U8*) reginfo->strend); - if ( (to_complement ^ isGCB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - utf8_target)) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - before = after; - s += UTF8SKIP(s); - } - } - else { /* Not utf8. 
Everything is a GCB except between CR and - LF */ - while (s < strend) { - if ((to_complement ^ ( UCHARAT(s - 1) != '\r' - || UCHARAT(s) != '\n')) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - s++; - } - } + goto do_boundu_non_utf8; + + case BOUNDU_tb_pb: + case BOUNDU_tb_p8: + if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { + FBC_BOUND_NON_UTF8(isWORDCHAR_L1); + break; + } + + do_boundu_non_utf8: + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) + { + goto got_it; + } + + /* Didn't match. Try at the next position (if there is one) */ + s++; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } + } + + switch((bound_type) FLAGS(c)) { + case TRADITIONAL_BOUND: /* Should have already been handled */ + assert(0); + break; - /* And, since this is a bound, it can match after the final - * character in the string */ - if ((reginfo->intuit || regtry(reginfo, &s))) { + case GCB_BOUND: + /* Not utf8. Everything is a GCB except between CR and LF */ + while (s < strend) { + if ((to_complement ^ ( UCHARAT(s - 1) != '\r' + || UCHARAT(s) != '\n')) + && (reginfo->intuit || regtry(reginfo, &s))) + { goto got_it; } - break; + s++; + } - case LB_BOUND: - if (s == reginfo->strbeg) { - if (reginfo->intuit || regtry(reginfo, &s)) { + break; + + case LB_BOUND: + { + LB_enum before = getLB_VAL_CP((U8) *(s -1)); + while (s < strend) { + LB_enum after = getLB_VAL_CP((U8) *s); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 0 /* target not utf8 */ ) + && (reginfo->intuit || regtry(reginfo, &s))) + { goto got_it; } - s += (utf8_target) ? 
UTF8SKIP(s) : 1; - if (UNLIKELY(s >= reginfo->strend)) { - break; - } + before = after; + s++; } + } - if (utf8_target) { - LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); - while (s < strend) { - LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isLB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - before = after; - s += UTF8SKIP(s); + break; + + case SB_BOUND: + { + SB_enum before = getSB_VAL_CP((U8) *(s -1)); + while (s < strend) { + SB_enum after = getSB_VAL_CP((U8) *s); + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 0 /* target not utf8 */ )) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + before = after; + s++; } - else { /* Not utf8. */ - LB_enum before = getLB_VAL_CP((U8) *(s -1)); - while (s < strend) { - LB_enum after = getLB_VAL_CP((U8) *s); - if (to_complement ^ isLB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - before = after; - s++; + } + + break; + + case WB_BOUND: + { + WB_enum previous = WB_UNKNOWN; + WB_enum before = getWB_VAL_CP((U8) *(s -1)); + while (s < strend) { + WB_enum after = getWB_VAL_CP((U8) *s); + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 0 /* target not utf8 */ )) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + previous = before; + before = after; + s++; } + } + } - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } + /* Here are at the final position in the target string, which is a + * boundary by definition, so matches, depending on other constraints. 
+ * */ + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { + goto got_it; + } + + break; + + case BOUNDL_t8_pb: + case BOUNDL_t8_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) == TRADITIONAL_BOUND) { + FBC_BOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, + isWORDCHAR_LC_utf8_safe); + break; + } + + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; + + to_complement = 1; + goto do_boundu_utf8; + + case NBOUNDL_t8_pb: + case NBOUNDL_t8_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + if (FLAGS(c) == TRADITIONAL_BOUND) { + FBC_NBOUND_UTF8(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, + isWORDCHAR_LC_utf8_safe); + break; + } + + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; + + to_complement = 1; + goto do_boundu_utf8; + + case NBOUND_t8_pb: + case NBOUND_t8_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + + /* FALLTHROUGH */ + + case NBOUNDU_t8_pb: + case NBOUNDU_t8_p8: + if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { + FBC_NBOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, + isWORDCHAR_utf8_safe); + break; + } + + to_complement = 1; + goto do_boundu_utf8; + + case BOUND_t8_pb: + case BOUND_t8_p8: + /* regcomp.c makes sure that these only have the traditional \b + * meaning. */ + assert(FLAGS(c) == TRADITIONAL_BOUND); + /* FALLTHROUGH */ + + case BOUNDU_t8_pb: + case BOUNDU_t8_p8: + if ((bound_type) FLAGS(c) == TRADITIONAL_BOUND) { + FBC_BOUND_UTF8(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8_safe); + break; + } + + do_boundu_utf8: + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) + { + goto got_it; + } + + /* Didn't match. Try at the next position (if there is one) */ + s += UTF8_SAFE_SKIP(s, reginfo->strend); + if (UNLIKELY(s >= reginfo->strend)) { break; + } + } - case SB_BOUND: - if (s == reginfo->strbeg) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - s += (utf8_target) ? 
UTF8SKIP(s) : 1; - if (UNLIKELY(s >= reginfo->strend)) { - break; - } - } + switch((bound_type) FLAGS(c)) { + case TRADITIONAL_BOUND: /* Should have already been handled */ + assert(0); + break; - if (utf8_target) { - SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); - while (s < strend) { - SB_enum after = getSB_VAL_UTF8((U8*) s, - (U8*) reginfo->strend); - if ((to_complement ^ isSB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - before = after; - s += UTF8SKIP(s); - } - } - else { /* Not utf8. */ - SB_enum before = getSB_VAL_CP((U8) *(s -1)); - while (s < strend) { - SB_enum after = getSB_VAL_CP((U8) *s); - if ((to_complement ^ isSB(before, + case GCB_BOUND: + { + GCB_enum before = getGCB_VAL_UTF8( + reghop3((U8*)s, -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + GCB_enum after = getGCB_VAL_UTF8((U8*) s, + (U8*) reginfo->strend); + if ( (to_complement ^ isGCB(before, after, (U8*) reginfo->strbeg, (U8*) s, - (U8*) reginfo->strend, - utf8_target)) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - before = after; - s++; + 1 /* target is utf8 */ )) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + before = after; + s += UTF8_SAFE_SKIP(s, reginfo->strend); } + } + break; - /* Here are at the final position in the target string. 
The SB - * value is always true here, so matches, depending on other - * constraints */ - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; + case LB_BOUND: + { + LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + LB_enum after = getLB_VAL_UTF8((U8*) s, + (U8*) reginfo->strend); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 1 /* target is utf8 */ ) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; + } + before = after; + s += UTF8_SAFE_SKIP(s, reginfo->strend); } + } - break; + break; - case WB_BOUND: - if (s == reginfo->strbeg) { - if (reginfo->intuit || regtry(reginfo, &s)) { + case SB_BOUND: + { + SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + SB_enum after = getSB_VAL_UTF8((U8*) s, + (U8*) reginfo->strend); + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 1 /* target is utf8 */ )) + && (reginfo->intuit || regtry(reginfo, &s))) + { goto got_it; } - s += (utf8_target) ? UTF8SKIP(s) : 1; - if (UNLIKELY(s >= reginfo->strend)) { - break; - } + before = after; + s += UTF8_SAFE_SKIP(s, reginfo->strend); } + } - if (utf8_target) { - /* We are at a boundary between char_sub_0 and char_sub_1. - * We also keep track of the value for char_sub_-1 as we - * loop through the line. 
Context may be needed to make a - * determination, and if so, this can save having to - * recalculate it */ - WB_enum previous = WB_UNKNOWN; - WB_enum before = getWB_VAL_UTF8( - reghop3((U8*)s, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); - while (s < strend) { - WB_enum after = getWB_VAL_UTF8((U8*) s, - (U8*) reginfo->strend); - if ((to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - previous = before; - before = after; - s += UTF8SKIP(s); - } - } - else { /* Not utf8. */ - WB_enum previous = WB_UNKNOWN; - WB_enum before = getWB_VAL_CP((U8) *(s -1)); - while (s < strend) { - WB_enum after = getWB_VAL_CP((U8) *s); - if ((to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) - && (reginfo->intuit || regtry(reginfo, &s))) - { - goto got_it; - } - previous = before; - before = after; - s++; + break; + + case WB_BOUND: + { + /* We are at a boundary between char_sub_0 and char_sub_1. + * We also keep track of the value for char_sub_-1 as we + * loop through the line. 
Context may be needed to make a + * determination, and if so, this can save having to + * recalculate it */ + WB_enum previous = WB_UNKNOWN; + WB_enum before = getWB_VAL_UTF8( + reghop3((U8*)s, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + WB_enum after = getWB_VAL_UTF8((U8*) s, + (U8*) reginfo->strend); + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + 1 /* target is utf8 */ )) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + previous = before; + before = after; + s += UTF8_SAFE_SKIP(s, reginfo->strend); } + } + } - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } + /* Here are at the final position in the target string, which is a + * boundary by definition, so matches, depending on other constraints. + * */ + + if ( reginfo->intuit + || (s <= reginfo->strend && regtry(reginfo, &s))) + { + goto got_it; } break; - case LNBREAK: - REXEC_FBC_CSCAN(is_LNBREAK_utf8_safe(s, strend), - is_LNBREAK_latin1_safe(s, strend) - ); + case LNBREAK_t8_pb: + case LNBREAK_t8_p8: + REXEC_FBC_UTF8_CLASS_SCAN(is_LNBREAK_utf8_safe(s, strend)); break; - /* The argument to all the POSIX node types is the class number to pass to - * _generic_isCC() to build a mask for searching in PL_charclass[] */ - - case NPOSIXL: - to_complement = 1; - /* FALLTHROUGH */ - - case POSIXL: - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, (U8 *) strend)), - to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + case LNBREAK_tb_pb: + case LNBREAK_tb_p8: + REXEC_FBC_NON_UTF8_CLASS_SCAN(is_LNBREAK_latin1_safe(s, strend)); break; - case NPOSIXD: - to_complement = 1; - /* FALLTHROUGH */ - - case POSIXD: - if (utf8_target) { - goto posix_utf8; - } - goto posixa; - - case NPOSIXA: - if (utf8_target) { - /* The complement of something that matches only ASCII matches all - * non-ASCII, plus everything in ASCII that 
isn't in the class. */ - REXEC_FBC_CLASS_SCAN(1, ! isASCII_utf8_safe(s, strend) - || ! _generic_isCC_A(*s, FLAGS(c))); - break; - } - + /* The argument to all the POSIX node types is the class number to pass + * to _generic_isCC() to build a mask for searching in PL_charclass[] */ + + case NPOSIXL_t8_pb: + case NPOSIXL_t8_p8: to_complement = 1; - goto posixa; + /* FALLTHROUGH */ - case POSIXA: + case POSIXL_t8_pb: + case POSIXL_t8_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s, + (U8 *) strend))); + break; + + case NPOSIXL_tb_pb: + case NPOSIXL_tb_p8: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXL_tb_pb: + case POSIXL_tb_p8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + REXEC_FBC_NON_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s))); + break; + + case NPOSIXA_t8_pb: + case NPOSIXA_t8_p8: + /* The complement of something that matches only ASCII matches all + * non-ASCII, plus everything in ASCII that isn't in the class. */ + REXEC_FBC_UTF8_CLASS_SCAN( ! isASCII_utf8_safe(s, strend) + || ! _generic_isCC_A(*s, FLAGS(c))); + break; + + case POSIXA_t8_pb: + case POSIXA_t8_p8: /* Don't need to worry about utf8, as it can match only a single * byte invariant character. 
But we do anyway for performance reasons, * as otherwise we would have to examine all the continuation * characters */ - if (utf8_target) { - REXEC_FBC_CLASS_SCAN(1, _generic_isCC_A(*s, FLAGS(c))); - break; - } + REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c))); + break; + + case NPOSIXD_tb_pb: + case NPOSIXD_tb_p8: + case NPOSIXA_tb_pb: + case NPOSIXA_tb_p8: + to_complement = 1; + /* FALLTHROUGH */ - posixa: - REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ + case POSIXD_tb_pb: + case POSIXD_tb_p8: + case POSIXA_tb_pb: + case POSIXA_tb_p8: + REXEC_FBC_NON_UTF8_CLASS_SCAN( to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); break; - case NPOSIXU: + case NPOSIXU_tb_pb: + case NPOSIXU_tb_p8: to_complement = 1; /* FALLTHROUGH */ - case POSIXU: - if (! utf8_target) { - REXEC_FBC_CLASS_SCAN(0, /* 0=>not-utf8 */ + case POSIXU_tb_pb: + case POSIXU_tb_p8: + REXEC_FBC_NON_UTF8_CLASS_SCAN( to_complement ^ cBOOL(_generic_isCC(*s, FLAGS(c)))); - } - else { + break; - posix_utf8: - classnum = (_char_class_number) FLAGS(c); - switch (classnum) { - default: - REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ + case NPOSIXD_t8_pb: + case NPOSIXD_t8_p8: + case NPOSIXU_t8_pb: + case NPOSIXU_t8_p8: + to_complement = 1; + /* FALLTHROUGH */ + + case POSIXD_t8_pb: + case POSIXD_t8_p8: + case POSIXU_t8_pb: + case POSIXU_t8_p8: + classnum = (_char_class_number) FLAGS(c); + switch (classnum) { + default: + REXEC_FBC_UTF8_CLASS_SCAN( to_complement ^ cBOOL(_invlist_contains_cp( - PL_XPosix_ptrs[classnum], - utf8_to_uvchr_buf((U8 *) s, + PL_XPosix_ptrs[classnum], + utf8_to_uvchr_buf((U8 *) s, (U8 *) strend, NULL)))); - break; - case _CC_ENUM_SPACE: - REXEC_FBC_CLASS_SCAN(1, /* 1=>is-utf8 */ + break; + + case _CC_ENUM_SPACE: + REXEC_FBC_UTF8_CLASS_SCAN( to_complement ^ cBOOL(isSPACE_utf8_safe(s, strend))); - break; + break; - case _CC_ENUM_BLANK: - REXEC_FBC_CLASS_SCAN(1, + case _CC_ENUM_BLANK: + REXEC_FBC_UTF8_CLASS_SCAN( to_complement ^ cBOOL(isBLANK_utf8_safe(s, strend))); - break; + break; - 
case _CC_ENUM_XDIGIT: - REXEC_FBC_CLASS_SCAN(1, - to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); - break; + case _CC_ENUM_XDIGIT: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isXDIGIT_utf8_safe(s, strend))); + break; - case _CC_ENUM_VERTSPACE: - REXEC_FBC_CLASS_SCAN(1, - to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); - break; + case _CC_ENUM_VERTSPACE: + REXEC_FBC_UTF8_CLASS_SCAN( + to_complement ^ cBOOL(isVERTWS_utf8_safe(s, strend))); + break; - case _CC_ENUM_CNTRL: - REXEC_FBC_CLASS_SCAN(1, + case _CC_ENUM_CNTRL: + REXEC_FBC_UTF8_CLASS_SCAN( to_complement ^ cBOOL(isCNTRL_utf8_safe(s, strend))); - break; - } + break; } break; - case AHOCORASICKC: - case AHOCORASICK: + case AHOCORASICKC_tb_pb: + case AHOCORASICKC_tb_p8: + case AHOCORASICKC_t8_pb: + case AHOCORASICKC_t8_p8: + case AHOCORASICK_tb_pb: + case AHOCORASICK_tb_p8: + case AHOCORASICK_t8_pb: + case AHOCORASICK_t8_p8: { DECL_TRIE_TYPE(c); /* what trie are we using right now */ reg_ac_data *aho = (reg_ac_data*)progi->data->data[ ARG( c ) ]; - reg_trie_data *trie = (reg_trie_data*)progi->data->data[ aho->trie ]; + reg_trie_data *trie = (reg_trie_data*)progi->data->data[aho->trie]; HV *widecharmap = MUTABLE_HV(progi->data->data[ aho->trie + 1 ]); const char *last_start = strend - trie->minlen; @@ -2843,7 +3126,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, U8 *bitmap=NULL; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; /* We can't just allocate points here. 
We need to wrap it in * an SV so it gets freed properly if there is a croak while @@ -2902,19 +3185,26 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if( state==1 ) { if ( bitmap ) { DEBUG_TRIE_EXECUTE_r( - if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - dump_exec_pos( (char *)uc, c, strend, real_start, + if ( uc <= (U8*)last_start + && !BITMAP_TEST(bitmap,*uc) ) + { + dump_exec_pos( (char *)uc, c, strend, + real_start, (char *)uc, utf8_target, 0 ); Perl_re_printf( aTHX_ " Scanning for legal start char...\n"); } ); if (utf8_target) { - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + while ( uc <= (U8*)last_start + && !BITMAP_TEST(bitmap,*uc) ) + { uc += UTF8SKIP(uc); } } else { - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + while ( uc <= (U8*)last_start + && ! BITMAP_TEST(bitmap,*uc) ) + { uc++; } } @@ -2924,7 +3214,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } if ( word ) { - U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) % maxlen ]; + U8 *lpos= points[ (pointpos - trie->wordinfo[word].len) + % maxlen ]; if (!leftmost || lpos < leftmost) { DEBUG_r(accepted_word=word); leftmost= lpos; @@ -2960,7 +3251,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, DEBUG_TRIE_EXECUTE_r({ if (failed) - dump_exec_pos( (char *)uc, c, strend, real_start, + dump_exec_pos((char *)uc, c, strend, real_start, s, utf8_target, 0 ); Perl_re_printf( aTHX_ "%sState: %4" UVxf ", word=%" UVxf, @@ -3005,7 +3296,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } } if ( aho->states[ state ].wordnum ) { - U8 *lpos = points[ (pointpos - trie->wordinfo[aho->states[ state ].wordnum].len) % maxlen ]; + U8 *lpos = points[ (pointpos + - trie->wordinfo[aho->states[ state ] + .wordnum].len) % maxlen ]; if (!leftmost || lpos < leftmost) { DEBUG_r(accepted_word=aho->states[ state ].wordnum); leftmost = lpos; @@ -3014,7 +3307,9 @@ S_find_byclass(pTHX_ regexp * prog, const 
regnode *c, char *s, if (leftmost) { s = (char*)leftmost; DEBUG_TRIE_EXECUTE_r({ - Perl_re_printf( aTHX_ "Matches word #%" UVxf " at position %" IVdf ". Trying full pattern...\n", + Perl_re_printf( aTHX_ "Matches word #%" UVxf + " at position %" IVdf ". Trying full" + " pattern...\n", (UV)accepted_word, (IV)(s - real_start) ); }); @@ -3023,9 +3318,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LEAVE; goto got_it; } - s = HOPc(s,1); + if (s < reginfo->strend) { + s = HOPc(s,1); + } DEBUG_TRIE_EXECUTE_r({ - Perl_re_printf( aTHX_ "Pattern failed. Looking for new start point...\n"); + Perl_re_printf( aTHX_ + "Pattern failed. Looking for new start" + " point...\n"); }); } else { DEBUG_TRIE_EXECUTE_r( @@ -3037,10 +3336,23 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LEAVE; } break; - default: + + case EXACTFU_REQ8_t8_pb: + case EXACTFUP_tb_p8: + case EXACTFUP_t8_p8: + case EXACTF_tb_p8: + case EXACTF_t8_p8: /* This node only generated for non-utf8 patterns */ + case EXACTFAA_NO_TRIE_tb_p8: + case EXACTFAA_NO_TRIE_t8_p8: /* This node only generated for non-utf8 + patterns */ + assert(0); + + default: Perl_croak(aTHX_ "panic: unknown regstclass %d", (int)OP(c)); - } + } /* End of switch on node type */ + return 0; + got_it: return s; } @@ -3224,7 +3536,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, regmatch_info *const reginfo = ®info_buf; regexp_paren_pair *swap = NULL; I32 oldsave; - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGEXEC_FLAGS; PERL_UNUSED_ARG(data); @@ -3278,7 +3590,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, if (!startpos || ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) { - DEBUG_r(Perl_re_printf( aTHX_ + DEBUG_GPOS_r(Perl_re_printf( aTHX_ "fail: ganch-gofs before earliest possible start\n")); return 0; } @@ -3297,8 +3609,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char 
*strend, minlen = prog->minlen; if ((startpos + minlen) > strend || startpos < strbeg) { - DEBUG_r(Perl_re_printf( aTHX_ - "Regex match can't succeed, so not even tried\n")); + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Regex match can't succeed, so not even tried\n")); return 0; } @@ -3355,13 +3667,13 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } multiline = prog->extflags & RXf_PMf_MULTILINE; - + if (strend - s < (minlen+(prog->check_offset_min<0?prog->check_offset_min:0))) { DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "String too short [regexec_flags]...\n")); goto phooey; } - + /* Check validity of program. */ if (UCHARAT(progi->program) != REG_MAGIC) { Perl_croak(aTHX_ "corrupted regexp program"); @@ -3538,11 +3850,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, to_utf8_substr(prog); } ch = SvPVX_const(prog->anchored_utf8)[0]; - REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ + REXEC_FBC_UTF8_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; - s += UTF8SKIP(s); + s += UTF8_SAFE_SKIP(s, strend); while (s < strend && *s == ch) s += UTF8SKIP(s); } @@ -3556,7 +3868,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, } } ch = SvPVX_const(prog->anchored_substr)[0]; - REXEC_FBC_SCAN(0, /* 0=>not-utf8 */ + REXEC_FBC_NON_UTF8_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); if (regtry(reginfo, &s)) goto got_it; @@ -3617,7 +3929,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, back_max = prog->float_max_offset; back_min = prog->float_min_offset; } - + if (back_min<0) { last = strend; } else { @@ -3676,7 +3988,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, ((must == prog->anchored_substr || must == prog->anchored_utf8) ? 
"anchored" : "floating"), quoted, RE_SV_TAIL(must)); - }); + }); goto phooey; } else if ( (c = progi->regstclass) ) { @@ -3844,7 +4156,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, LEAVE_SCOPE(oldsave); - if (RXp_PAREN_NAMES(prog)) + if (RXp_PAREN_NAMES(prog)) (void)hv_iterinit(RXp_PAREN_NAMES(prog)); /* make sure $`, $&, $', and $digit will work later */ @@ -3910,7 +4222,7 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) U32 depth = 0; /* used by REGCP_SET */ #endif RXi_GET_DECL(prog,progi); - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; PERL_ARGS_ASSERT_REGTRY; @@ -4012,33 +4324,33 @@ S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, PERL_ARGS_ASSERT_DEBUG_START_MATCH; - if (!PL_colorset) - reginitcolors(); + if (!PL_colorset) + reginitcolors(); { - RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); - + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), start, end - start, PL_dump_re_max_len); - + Perl_re_printf( aTHX_ - "%s%s REx%s %s against %s\n", - PL_colors[4], blurb, PL_colors[5], s0, s1); - + "%s%s REx%s %s against %s\n", + PL_colors[4], blurb, PL_colors[5], s0, s1); + if (utf8_target||utf8_pat) Perl_re_printf( aTHX_ "UTF-8 %s%s%s...\n", utf8_pat ? "pattern" : "", utf8_pat && utf8_target ? " and " : "", utf8_target ? 
"string" : "" - ); + ); } } STATIC void -S_dump_exec_pos(pTHX_ const char *locinput, - const regnode *scan, - const char *loc_regeol, - const char *loc_bostr, +S_dump_exec_pos(pTHX_ const char *locinput, + const regnode *scan, + const char *loc_regeol, + const char *loc_bostr, const char *loc_reg_starttry, const bool utf8_target, const U32 depth @@ -4076,11 +4388,11 @@ S_dump_exec_pos(pTHX_ const char *locinput, RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); - + RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), (locinput - pref_len + pref0_len), pref_len - pref0_len, PL_dump_re_max_len, 2, 3); - + RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), locinput, loc_regeol - locinput, 10, 0, 1); @@ -4101,7 +4413,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, #endif /* reg_check_named_buff_matched() - * Checks to see if a named buffer has matched. The data array of + * Checks to see if a named buffer has matched. The data array of * buffer numbers corresponding to the buffer is expected to reside * in the regexp->data->data array in the slot stored in the ARG() of * node involved. 
Note that this routine doesn't actually care about the @@ -4129,326 +4441,616 @@ S_reg_check_named_buff_matched(const regexp *rex, const regnode *scan) return 0; } -#define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ -#define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ -#define CHRTEST_NOT_A_CP_1 -999 -#define CHRTEST_NOT_A_CP_2 -998 - static bool -S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, - U8* c1_utf8, int *c2p, U8* c2_utf8, regmatch_info *reginfo) +S_setup_EXACTISH_ST(pTHX_ const regnode * const text_node, + struct next_matchable_info * m, + regmatch_info *reginfo) { - /* This function determines if there are zero, one, two, or more characters - * that match the first character of the passed-in EXACTish node - * , and if there are one or two, it returns them in the - * passed-in pointers. + /* This function determines various characteristics about every possible + * initial match of the passed-in EXACTish , and stores them in + * <*m>. * - * If it determines that no possible character in the target string can - * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if - * the first character in requires UTF-8 to represent, and the - * target string isn't in UTF-8.) + * That includes a match string and a parallel mask, such that if you AND + * the target string with the mask and compare with the match string, + * you'll have a pretty good idea, perhaps even perfect, if that portion of + * the target matches or not. * - * If there are more than two characters that could match the beginning of - * , or if more context is required to determine a match or not, - * it sets both * and * to CHRTEST_VOID. + * The motivation behind this function is to allow the caller to set up + * tight loops for matching. Consider patterns like '.*B' or '.*?B' where + * B is an arbitrary EXACTish node. 
To find the end of .*, we look for the + * beginning of B, which is the passed in <text_node>. That's where this + * function comes in. The values it returns can quickly be used to rule + * out many, or all, cases of possible matches not actually being the + * beginning of B, <text_node>. It is also used in regrepeat() where we + * have 'A*', for arbitrary 'A'. This sets up criteria to more efficiently + * determine where the span of 'A's stop. * - * The motiviation behind this function is to allow the caller to set up - * tight loops for matching. If <text_node> is of type EXACT, there is - * only one possible character that can match its first character, and so - * the situation is quite simple. But things get much more complicated if - * folding is involved. It may be that the first character of an EXACTFish - * node doesn't participate in any possible fold, e.g., punctuation, so it - * can be matched only by itself. The vast majority of characters that are - * in folds match just two things, their lower and upper-case equivalents. + * If <text_node> is of type EXACT, there is only one possible character + * that can match its first character, and so the situation is quite + * simple. But things can get much more complicated if folding is + * involved. It may be that the first character of an EXACTFish node + * doesn't participate in any possible fold, e.g., punctuation, so it can + * be matched only by itself. The vast majority of characters that are in + * folds match just two things, their lower and upper-case equivalents. * But not all are like that; some have multiple possible matches, or match * sequences of more than one character. This function sorts all that out. * - * Consider the patterns A*B or A*?B where A and B are arbitrary. In a - * loop of trying to match A*, we know we can't exit where the thing - * following it isn't a B. And something can't be a B unless it is the - * beginning of B. 
By putting a quick test for that beginning in a tight - * loop, we can rule out things that can't possibly be B without having to - * break out of the loop, thus avoiding work. Similarly, if A is a single - * character, we can make a tight loop matching A*, using the outputs of - * this function. + * It returns information about all possibilities of what the first + * character(s) of could look like. Again, if is a + * plain EXACT node, that's just the actual first bytes of the first + * character; but otherwise it is the bytes, that when masked, match all + * possible combinations of all the initial bytes of all the characters + * that could match, folded. (Actually, this is a slight over promise. It + * handles only up to the initial 5 bytes, which is enough for all Unicode + * characters, but not for all non-Unicode ones.) + * + * Here's an example to clarify. Suppose the first character of + * is the letter 'C', and we are under /i matching. That means + * 'c' also matches. The representations of these two characters differ in + * just one bit, so the mask would be a zero in that position and ones in + * the other 7. And the returned string would be the AND of these two + * characters, and would be one byte long, since these characters are each + * a single byte. ANDing the target with this mask will yield + * the returned string if and only if begins with one of these + * two characters. So, the function would also return that the definitive + * length matched is 1 byte. + * + * Now, suppose instead of the letter 'C', begins with the + * letter 'F'. The situation is much more complicated because there are + * various ligatures such as LATIN SMALL LIGATURE FF, whose fold also + * begins with 'f', and hence could match. We add these into the returned + * string and mask, but the result isn't definitive; the caller has to + * check further if its AND and compare pass. But the failure of that + * compare will quickly rule out most possible inputs. 
* - * If the target string to match isn't in UTF-8, and there aren't - * complications which require CHRTEST_VOID, * and * are set to - * the one or two possible octets (which are characters in this situation) - * that can match. In all cases, if there is only one character that can - * match, * and * will be identical. + * Much of this could be done in regcomp.c at compile time, except for + * locale-dependent, and UTF-8 target dependent data. Extra data fields + * could be used for one or the other eventualities. * - * If the target string is in UTF-8, the buffers pointed to by - * and will contain the one or two UTF-8 sequences of bytes that - * can match the beginning of . They should be declared with at - * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is - * undefined what these contain.) If one or both of the buffers are - * invariant under UTF-8, *, and * will also be set to the - * corresponding invariant. If variant, the corresponding * and/or - * * will be set to a negative number(s) that shouldn't match any code - * point (unless inappropriately coerced to unsigned). * will equal - * * if and only if and are the same. */ + * If this function determines that no possible character in the target + * string can match, it returns FALSE; otherwise TRUE. (The FALSE + * situation occurs if the first character in requires UTF-8 to + * represent, and the target string isn't in UTF-8.) 
+ * + * Some analysis is in GH #18414, located at the time of this writing at: + * https://github.com/Perl/perl5/issues/18414 + */ const bool utf8_target = reginfo->is_utf8_target; + bool utf8_pat = reginfo->is_utf8_pat; - UV c1 = (UV)CHRTEST_NOT_A_CP_1; - UV c2 = (UV)CHRTEST_NOT_A_CP_2; - bool use_chrtest_void = FALSE; - const bool is_utf8_pat = reginfo->is_utf8_pat; + PERL_UINT_FAST8_T i; - /* Used when we have both utf8 input and utf8 output, to avoid converting - * to/from code points */ - bool utf8_has_been_setup = FALSE; + /* Here and below, '15' is the value of UTF8_MAXBYTES_CASE, which requires at least :e + */ + U8 matches[MAX_MATCHES][UTF8_MAXBYTES_CASE + 1] = { { 0 } }; + U8 lengths[MAX_MATCHES] = { 0 }; - dVAR; + U8 index_of_longest = 0; U8 *pat = (U8*)STRING(text_node); - U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' }; + Size_t pat_len = STR_LEN(text_node); + U8 op = OP(text_node); - if ( OP(text_node) == EXACT - || OP(text_node) == EXACT_ONLY8 - || OP(text_node) == EXACTL) - { + U8 byte_mask[5] = {0}; + U8 byte_anded[5] = {0}; - /* In an exact node, only one thing can be matched, that first - * character. If both the pat and the target are UTF-8, we can just - * copy the input to the output, avoiding finding the code point of - * that character */ - if (!is_utf8_pat) { - assert(OP(text_node) != EXACT_ONLY8); - c2 = c1 = *pat; - } - else if (utf8_target) { - Copy(pat, c1_utf8, UTF8SKIP(pat), U8); - Copy(pat, c2_utf8, UTF8SKIP(pat), U8); - utf8_has_been_setup = TRUE; - } - else if (OP(text_node) == EXACT_ONLY8) { - return FALSE; /* Can only match UTF-8 target */ + /* There are some folds in Unicode to multiple characters. 
This will hold + * such characters that could fold to the beginning of 'text_node' */ + UV multi_fold_from = 0; + + /* We may have to create a modified copy of the pattern */ + U8 mod_pat[UTF8_MAXBYTES_CASE + 1] = { '\0' }; + + m->max_length = 0; + m->min_length = 255; + m->count = 0; + + /* Even if the first character in the node can match something in Latin1, + * if there is anything in the node that can't, the match must fail */ + if (! utf8_target && isEXACT_REQ8(op)) { + return FALSE; + } + +/* Define a temporary op for use in this function, using an existing one that + * should never be a real op during execution */ +#define TURKISH PSEUDO + + /* What to do about these two nodes had to be deferred to runtime (which is + * now). If the extra information we now have so indicates, turn them into + * EXACTFU nodes */ + if ( (op == EXACTF && utf8_target) + || (op == EXACTFL && IN_UTF8_CTYPE_LOCALE)) + { + if (op == EXACTFL && PL_in_utf8_turkic_locale) { + op = TURKISH; } else { - c2 = c1 = valid_utf8_to_uvchr(pat, NULL); - } - } - else { /* an EXACTFish node */ - U8 *pat_end = pat + STR_LEN(text_node); - - /* An EXACTFL node has at least some characters unfolded, because what - * they match is not known until now. So, now is the time to fold - * the first few of them, as many as are needed to determine 'c1' and - * 'c2' later in the routine. If the pattern isn't UTF-8, we only need - * to fold if in a UTF-8 locale, and then only the Sharp S; everything - * else is 1-1 and isn't assumed to be folded. In a UTF-8 pattern, we - * need to fold as many characters as a single character can fold to, - * so that later we can check if the first ones are such a multi-char - * fold. But, in such a pattern only locale-problematic characters - * aren't folded, so we can skip this completely if the first character - * in the node isn't one of the tricky ones */ - if (OP(text_node) == EXACTFL) { - - if (! 
is_utf8_pat) { - if (IN_UTF8_CTYPE_LOCALE && *pat == LATIN_SMALL_LETTER_SHARP_S) - { - folded[0] = folded[1] = 's'; - pat = folded; - pat_end = folded + 2; + op = EXACTFU; + } + + /* And certain situations are better handled if we create a modified + * version of the pattern */ + if (utf8_pat) { /* Here, must have been EXACTFL, so look at the + specific problematic characters */ + if (is_PROBLEMATIC_LOCALE_FOLD_utf8(pat)) { + + /* The node could start with characters that are the first ones + * of a multi-character fold. */ + multi_fold_from + = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); + if (multi_fold_from) { + + /* Here, they do form a sequence that matches the fold of a + * single character. That single character then is a + * possible match. Below we will look again at this, but + * the code below is expecting every character in the + * pattern to be folded, which the input isn't required to + * be in this case. So, just fold the single character, + * and the result will be in the expected form. */ + _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, + FOLD_FLAGS_FULL); + pat = mod_pat; } - } - else if (is_PROBLEMATIC_LOCALE_FOLDEDS_START_utf8(pat)) { - U8 *s = pat; - U8 *d = folded; - int i; - - for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < pat_end; i++) { - if (isASCII(*s) && LIKELY(! PL_in_utf8_turkic_locale)) { - *(d++) = (U8) toFOLD_LC(*s); - s++; + /* Turkish has a couple extra possibilities. */ + else if ( UNLIKELY(op == TURKISH) + && pat_len >= 3 + && isALPHA_FOLD_EQ(pat[0], 'f') + && ( memBEGINs(pat + 1, pat_len - 1, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) + || ( pat_len >= 4 + && isALPHA_FOLD_EQ(pat[1], 'f') + && memBEGINs(pat + 2, pat_len - 2, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE_UTF8) + ))) { + /* The macros for finding a multi-char fold don't include + * the Turkish possibilities, in which U+130 folds to 'i'. + * Hard-code these. It's very unlikely that Unicode will + * ever add any others. 
*/ + if (pat[1] == 'f') { + pat_len = 3; + Copy("ffi", mod_pat, pat_len, U8); } else { - STRLEN len; - _toFOLD_utf8_flags(s, - pat_end, - d, - &len, - FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); - d += len; - s += UTF8SKIP(s); + pat_len = 2; + Copy("fi", mod_pat, pat_len, U8); } + pat = mod_pat; + } + else if ( UTF8_IS_DOWNGRADEABLE_START(*pat) + && LIKELY(memNEs(pat, pat_len, MICRO_SIGN_UTF8)) + && LIKELY(memNEs(pat, pat_len, + LATIN_SMALL_LETTER_SHARP_S_UTF8)) + && (LIKELY(op != TURKISH || *pat != 'I'))) + { + /* For all cases of things between 0-255, except the ones + * in the conditional above, the fold is just the lower + * case, which is faster than the more general case. */ + mod_pat[0] = toLOWER_L1(EIGHT_BIT_UTF8_TO_NATIVE(pat[0], + pat[1])); + pat_len = 1; + pat = mod_pat; + utf8_pat = FALSE; + } + else { /* Code point above 255, or needs special handling */ + _to_utf8_fold_flags(pat, pat + pat_len, + mod_pat, &pat_len, + FOLD_FLAGS_FULL|FOLD_FLAGS_LOCALE); + pat = mod_pat; } - - pat = folded; - pat_end = d; } } + else if /* Below is not a UTF-8 pattern; there's a somewhat different + set of problematic characters */ + ((multi_fold_from + = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len))) + { + /* We may have to canonicalize a multi-char fold, as in the UTF-8 + * case */ + _to_uni_fold_flags(multi_fold_from, mod_pat, &pat_len, + FOLD_FLAGS_FULL); + pat = mod_pat; + } + else if (UNLIKELY(*pat == LATIN_SMALL_LETTER_SHARP_S)) { + mod_pat[0] = mod_pat[1] = 's'; + pat_len = 2; + utf8_pat = utf8_target; /* UTF-8ness immaterial for invariant + chars, and speeds copying */ + pat = mod_pat; + } + else if (LIKELY(op != TURKISH || *pat != 'I')) { + mod_pat[0] = toLOWER_L1(*pat); + pat_len = 1; + pat = mod_pat; + } + } + else if /* Below isn't a node that we convert to UTF-8 */ + ( utf8_target + && ! utf8_pat + && op == EXACTFAA_NO_TRIE + && *pat == LATIN_SMALL_LETTER_SHARP_S) + { + /* A very special case. Folding U+DF goes to U+17F under /iaa. 
We + * did this at compile time when the pattern was UTF-8 , but otherwise + * we couldn't do it earlier, because it requires a UTF-8 target for + * this match to be legal. */ + pat_len = 2 * (sizeof(LATIN_SMALL_LETTER_LONG_S_UTF8) - 1); + Copy(LATIN_SMALL_LETTER_LONG_S_UTF8 + LATIN_SMALL_LETTER_LONG_S_UTF8, mod_pat, pat_len, U8); + pat = mod_pat; + utf8_pat = TRUE; + } + + /* Here, we have taken care of the initial work for a few very problematic + * situations, possibly creating a modified pattern. + * + * Now ready for the general case. We build up all the possible things + * that could match the first character of the pattern into the elements of + * 'matches[]' + * + * Everything generally matches at least itself. But if there is a + * UTF8ness mismatch, we have to convert to that of the target string. */ + if (UTF8_IS_INVARIANT(*pat)) { /* Immaterial if either is in UTF-8 */ + matches[0][0] = pat[0]; + lengths[0] = 1; + m->count++; + } + else if (utf8_target) { + if (utf8_pat) { + lengths[0] = UTF8SKIP(pat); + Copy(pat, matches[0], lengths[0], U8); + m->count++; + } + else { /* target is UTF-8, pattern isn't */ + matches[0][0] = UTF8_EIGHT_BIT_HI(pat[0]); + matches[0][1] = UTF8_EIGHT_BIT_LO(pat[0]); + lengths[0] = 2; + m->count++; + } + } + else if (! utf8_pat) { /* Neither is UTF-8 */ + matches[0][0] = pat[0]; + lengths[0] = 1; + m->count++; + } + else /* target isn't UTF-8; pattern is. No match possible unless the + pattern's first character can fit in a byte */ + if (UTF8_IS_DOWNGRADEABLE_START(*pat)) + { + matches[0][0] = EIGHT_BIT_UTF8_TO_NATIVE(pat[0], pat[1]); + lengths[0] = 1; + m->count++; + } + + /* Here we have taken care of any necessary node-type changes */ + + if (m->count) { + m->max_length = lengths[0]; + m->min_length = lengths[0]; + } + + /* For non-folding nodes, there are no other possible candidate matches, + * but for foldable ones, we have to look further. 
*/ + if (UNLIKELY(op == TURKISH) || isEXACTFish(op)) { /* A folding node */ + UV folded; /* The first character in the pattern, folded */ + U32 first_fold_from; /* A character that folds to it */ + const U32 * remaining_fold_froms; /* The remaining characters that + fold to it, if any */ + Size_t folds_to_count; /* The total number of characters that fold to + 'folded' */ + + /* If the node begins with a sequence of more than one character that + * together form the fold of a single character, it is called a + * 'multi-character fold', and the normal functions don't handle this + * case. We set 'multi_fold_from' to the single folded-from character, + * which is handled in an extra iteration below */ + if (utf8_pat) { + folded = valid_utf8_to_uvchr(pat, NULL); + multi_fold_from + = what_MULTI_CHAR_FOLD_utf8_safe(pat, pat + pat_len); + } + else { + folded = *pat; + + /* This may generate illegal combinations for things like EXACTF, + * but rather than repeat the logic and exclude them here, all such + * illegalities are checked for and skipped below in the loop */ + multi_fold_from + = what_MULTI_CHAR_FOLD_latin1_safe(pat, pat + pat_len); + } + + /* Everything matches at least itself; initialize to that because the + * only branches below that set it are the ones where the number + * isn't 1. */ + folds_to_count = 1; - if ( ( is_utf8_pat && is_MULTI_CHAR_FOLD_utf8_safe(pat, pat_end)) - || (!is_utf8_pat && is_MULTI_CHAR_FOLD_latin1_safe(pat, pat_end))) + /* There are a few special cases for locale-dependent nodes, where the + * run-time context was needed before we could know what matched */ + if (UNLIKELY(op == EXACTFL) && folded < 256) { + first_fold_from = PL_fold_locale[folded]; + } + else if ( op == EXACTFL && utf8_target && utf8_pat + && memBEGINs(pat, pat_len, LATIN_SMALL_LETTER_LONG_S_UTF8 + LATIN_SMALL_LETTER_LONG_S_UTF8)) { - /* Multi-character folds require more context to sort out. 
Also - * PL_utf8_foldclosures used below doesn't handle them, so have to - * be handled outside this routine */ - use_chrtest_void = TRUE; - } - else { /* an EXACTFish node which doesn't begin with a multi-char fold */ - c1 = is_utf8_pat ? valid_utf8_to_uvchr(pat, NULL) : *pat; - - if ( UNLIKELY(PL_in_utf8_turkic_locale) - && OP(text_node) == EXACTFL - && UNLIKELY( c1 == 'i' || c1 == 'I' - || c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE - || c1 == LATIN_SMALL_LETTER_DOTLESS_I)) - { /* Hard-coded Turkish locale rules for these 4 characters - override normal rules */ - if (c1 == 'i') { - c2 = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; - } - else if (c1 == 'I') { - c2 = LATIN_SMALL_LETTER_DOTLESS_I; - } - else if (c1 == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) { - c2 = 'i'; - } - else if (c1 == LATIN_SMALL_LETTER_DOTLESS_I) { - c2 = 'I'; - } + first_fold_from = LATIN_CAPITAL_LETTER_SHARP_S; + } + else if (UNLIKELY( op == TURKISH + && ( isALPHA_FOLD_EQ(folded, 'i') + || inRANGE(folded, + LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE, + LATIN_SMALL_LETTER_DOTLESS_I)))) + { /* Turkish folding requires special handling */ + if (folded == 'i') + first_fold_from = LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + else if (folded == 'I') + first_fold_from = LATIN_SMALL_LETTER_DOTLESS_I; + else if (folded == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) + first_fold_from = 'i'; + else first_fold_from = 'I'; + } + else { + /* Here, isn't a special case: use the generic function to + * calculate what folds to this */ + redo_multi: + /* Look up what code points (besides itself) fold to 'folded'; + * e.g., [ 'K', KELVIN_SIGN ] both fold to 'k'. */ + folds_to_count = _inverse_folds(folded, &first_fold_from, + &remaining_fold_froms); + } + + /* Add each character that folds to 'folded' to the list of them, + * subject to limitations based on the node type and target UTF8ness. + * If there was a character that folded to multiple characters, do an + * extra iteration for it. 
(Note the extra iteration if there is a + * multi-character fold) */ + for (i = 0; i < folds_to_count + + UNLIKELY(multi_fold_from != 0); i++) + { + UV fold_from = 0; + + if (i >= folds_to_count) { /* Final iteration: handle the + multi-char */ + fold_from = multi_fold_from; } - else if (c1 > 255) { - const unsigned int * remaining_folds; - unsigned int first_fold; - - /* Look up what code points (besides c1) fold to c1; e.g., - * [ 'K', KELVIN_SIGN ] both fold to 'k'. */ - Size_t folds_count = _inverse_folds(c1, &first_fold, - &remaining_folds); - if (folds_count == 0) { - c2 = c1; /* there is only a single character that could - match */ - } - else if (folds_count != 1) { - /* If there aren't exactly two folds to this (itself and - * another), it is outside the scope of this function */ - use_chrtest_void = TRUE; - } - else { /* There are two. We already have one, get the other */ - c2 = first_fold; - - /* Folds that cross the 255/256 boundary are forbidden if - * EXACTFL (and isnt a UTF8 locale), or EXACTFAA and one is - * ASCIII. The only other match to c1 is c2, and since c1 - * is above 255, c2 better be as well under these - * circumstances. If it isn't, it means the only legal - * match of c1 is itself. */ - if ( c2 < 256 - && ( ( OP(text_node) == EXACTFL - && ! IN_UTF8_CTYPE_LOCALE) - || (( OP(text_node) == EXACTFAA - || OP(text_node) == EXACTFAA_NO_TRIE) - && (isASCII(c1) || isASCII(c2))))) - { - c2 = c1; - } - } + else if (i == 0) { + fold_from = first_fold_from; + } + else if (i < folds_to_count) { + fold_from = remaining_fold_froms[i-1]; + } + + if (folded == fold_from) { /* We already added the character + itself */ + continue; + } + + /* EXACTF doesn't have any non-ascii folds */ + if (op == EXACTF && (! isASCII(folded) || ! 
isASCII(fold_from))) { + continue; + } + + /* In /iaa nodes, neither or both must be ASCII to be a legal fold + * */ + if ( isASCII(folded) != isASCII(fold_from) + && inRANGE(op, EXACTFAA, EXACTFAA_NO_TRIE)) + + { + continue; } - else /* Here, c1 is <= 255 */ - if ( utf8_target - && HAS_NONLATIN1_FOLD_CLOSURE(c1) - && ( ! (OP(text_node) == EXACTFL && ! IN_UTF8_CTYPE_LOCALE)) - && ( ( OP(text_node) != EXACTFAA - && OP(text_node) != EXACTFAA_NO_TRIE) - || ! isASCII(c1))) + + /* In /il nodes, can't cross 255/256 boundary (unless in a UTF-8 + * locale, but those have been converted to EXACTFU above) */ + if ( op == EXACTFL + && (folded < 256) != (fold_from < 256)) { - /* Here, there could be something above Latin1 in the target - * which folds to this character in the pattern. All such - * cases except LATIN SMALL LETTER Y WITH DIAERESIS have more - * than two characters involved in their folds, so are outside - * the scope of this function */ - if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { - c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; - } - else { - use_chrtest_void = TRUE; + continue; + } + + /* If this triggers, it likely is because of the unlikely case + * where a new Unicode standard has changed what MAX_MATCHES should + * be set to */ + assert(m->count < MAX_MATCHES); + + /* Add this character to the list of possible matches */ + if (utf8_target) { + uvchr_to_utf8(matches[m->count], fold_from); + lengths[m->count] = UVCHR_SKIP(fold_from); + m->count++; + } + else { /* Non-UTF8 target: no code point above 255 can appear in it + */ + if (fold_from > 255) { + continue; } + + matches[m->count][0] = fold_from; + lengths[m->count] = 1; + m->count++; } - else { /* Here nothing above Latin1 can fold to the pattern - character */ - switch (OP(text_node)) { - case EXACTFL: /* /l rules */ - c2 = PL_fold_locale[c1]; - break; + /* Update min and max lengths */ + if (m->min_length > lengths[m->count-1]) { + m->min_length = lengths[m->count-1]; + } - case EXACTF: /* 
This node only generated for non-utf8 - patterns */ - assert(! is_utf8_pat); - if (! utf8_target) { /* /d rules */ - c2 = PL_fold[c1]; - break; - } - /* FALLTHROUGH */ - /* /u rules for all these. This happens to work for - * EXACTFAA as nothing in Latin1 folds to ASCII */ - case EXACTFAA_NO_TRIE: /* This node only generated for - non-utf8 patterns */ - assert(! is_utf8_pat); - /* FALLTHROUGH */ - case EXACTFAA: - case EXACTFUP: - case EXACTFU: - c2 = PL_fold_latin1[c1]; - break; - case EXACTFU_ONLY8: - return FALSE; - NOT_REACHED; /* NOTREACHED */ + if (m->max_length < lengths[m->count-1]) { + index_of_longest = m->count - 1; + m->max_length = lengths[index_of_longest]; + } + } /* looped through each potential fold */ - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); - NOT_REACHED; /* NOTREACHED */ + /* If there is something that folded to an initial multi-character + * fold, repeat, using it. This catches some edge cases. An example + * of one is /ss/i when UTF-8 encoded. The function + * what_MULTI_CHAR_FOLD_utf8_safe('ss') gets called and returns U+DF + * (LATIN SMALL SHARP S). If it returned a list of characters, this + * code wouldn't be needed. But since it doesn't, we have to look what + * folds to the U+DF. In this case, U+1E9E does, and has to be added. + * */ + if (multi_fold_from) { + folded = multi_fold_from; + multi_fold_from = 0; + goto redo_multi; + } + } /* End of finding things that participate in this fold */ + + if (m->count == 0) { /* If nothing found, can't match */ + m->min_length = 0; + return FALSE; + } + + /* Have calculated all possible matches. Now calculate the mask and AND + * values */ + m->initial_exact = 0; + m->initial_definitive = 0; + + { + unsigned int mask_ones = 0; + unsigned int possible_ones = 0; + U8 j; + + /* For each byte that is in all possible matches ... 
*/ + for (j = 0; j < MIN(m->min_length, 5); j++) { + + /* Initialize the accumulator for this byte */ + byte_mask[j] = 0xFF; + byte_anded[j] = matches[0][j]; + + /* Then the rest of the rows (folds). The mask is based on, like, + * ~('A' ^ 'a') is a 1 in all bits where these are the same, and 0 + * where they differ. */ + for (i = 1; i < (PERL_UINT_FAST8_T) m->count; i++) { + byte_mask[j] &= ~ (byte_anded[j] ^ matches[i][j]); + byte_anded[j] &= matches[i][j]; + } + + /* Keep track of the number of initial mask bytes that are all one + * bits. The code calling this can use this number to know that + * a string that matches this number of bytes in the pattern is an + * exact match of that pattern for this number of bytes. But also + * counted are the number of initial bytes that in total have a + * single zero bit. If a string matches those, masked, it must be + * one of two possibilities, both of which this function has + * determined are legal. (But if that single 0 is one of the + * initial bits for masking a UTF-8 start byte, that could + * incorrectly lead to different length strings appearing to be + * equivalent, so only do this optimization when the matchables are + * all the same length. This was uncovered by testing + * /\x{029E}/i.) */ + if (m->min_length == m->max_length) { + mask_ones += PL_bitcount[byte_mask[j]]; + possible_ones += 8; + if (mask_ones + 1 >= possible_ones) { + m->initial_definitive++; + if (mask_ones >= possible_ones) { + m->initial_exact++; + } } } } } - /* Here have figured things out. 
Set up the returns */ - if (use_chrtest_void) { - *c2p = *c1p = CHRTEST_VOID; + /* The first byte is separate for speed */ + m->first_byte_mask = byte_mask[0]; + m->first_byte_anded = byte_anded[0]; + + /* Then pack up to the next 4 bytes into a word */ + m->mask32 = m->anded32 = 0; + for (i = 1; i < MIN(m->min_length, 5); i++) { + U8 which = i; + U8 shift = (which - 1) * 8; + m->mask32 |= (U32) byte_mask[i] << shift; + m->anded32 |= (U32) byte_anded[i] << shift; } - else if (utf8_target) { - if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ - uvchr_to_utf8(c1_utf8, c1); - uvchr_to_utf8(c2_utf8, c2); + + /* Finally, take the match strings and place them sequentially into a + * one-dimensional array. (This is done to save significant space in the + * structure.) Sort so the longest (presumably the least likely) is last. + * XXX When this gets moved to regcomp, may want to fully sort shortest + * first, but above we generally used the folded code point first, and + * those tend to be no longer than their upper case values, so this is + * already pretty well sorted by size. + * + * If the asserts fail, it's most likely because a new version of the + * Unicode standard requires more space; simply increase the declaration + * size. */ + { + U8 cur_pos = 0; + U8 output_index = 0; + + if (m->count > 1) { /* No need to sort a single entry */ + for (i = 0; i < (PERL_UINT_FAST8_T) m->count; i++) { + + /* Keep the same order for all but the longest. (If the + * asserts fail, it could be because m->matches is declared too + * short, either because of a new Unicode release, or an + * overlooked test case, or it could be a bug.) 
*/ + if (i != index_of_longest) { + assert(cur_pos + lengths[i] <= C_ARRAY_LENGTH(m->matches)); + Copy(matches[i], m->matches + cur_pos, lengths[i], U8); + cur_pos += lengths[i]; + m->lengths[output_index++] = lengths[i]; + } + } } - /* Invariants are stored in both the utf8 and byte outputs; Use - * negative numbers otherwise for the byte ones. Make sure that the - * byte ones are the same iff the utf8 ones are the same */ - *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; - *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) - ? *c2_utf8 - : (c1 == c2) - ? CHRTEST_NOT_A_CP_1 - : CHRTEST_NOT_A_CP_2; - } - else if (c1 > 255) { - if (c2 > 255) { /* both possibilities are above what a non-utf8 string - can represent */ - return FALSE; - } + assert(cur_pos + lengths[index_of_longest] <= C_ARRAY_LENGTH(m->matches)); + Copy(matches[index_of_longest], m->matches + cur_pos, + lengths[index_of_longest], U8); - *c1p = *c2p = c2; /* c2 is the only representable value */ - } - else { /* c1 is representable; see about c2 */ - *c1p = c1; - *c2p = (c2 < 256) ? 
c2 : c1; + /* Place the longest match last */ + m->lengths[output_index] = lengths[index_of_longest]; } + return TRUE; } +PERL_STATIC_FORCE_INLINE /* We want speed at the expense of size */ +bool +S_test_EXACTISH_ST(const char * loc, + struct next_matchable_info info) +{ + /* This function uses the data set up in setup_EXACTISH_ST() to see if the + * bytes starting at 'loc' can match based on 'next_matchable_info' */ + + U32 input32 = 0; + + /* Check the first byte */ + if (((U8) loc[0] & info.first_byte_mask) != info.first_byte_anded) + return FALSE; + + /* Pack the next up-to-4 bytes into a 32 bit word */ + switch (info.min_length) { + default: + input32 |= (U32) ((U8) loc[4]) << 3 * 8; + /* FALLTHROUGH */ + case 4: + input32 |= (U8) loc[3] << 2 * 8; + /* FALLTHROUGH */ + case 3: + input32 |= (U8) loc[2] << 1 * 8; + /* FALLTHROUGH */ + case 2: + input32 |= (U8) loc[1]; + break; + case 1: + return TRUE; /* We already tested and passed the 0th byte */ + case 0: + ASSUME(0); + } + + /* And AND that with the mask and compare that with the assembled ANDED + * values */ + return (input32 & info.mask32) == info.anded32; +} + STATIC bool S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strbeg, const U8 * const curpos, const bool utf8_target) { /* returns a boolean indicating if there is a Grapheme Cluster Boundary - * between the inputs. See http://www.unicode.org/reports/tr29/. */ + * between the inputs. See https://www.unicode.org/reports/tr29/. 
*/ PERL_ARGS_ASSERT_ISGCB; @@ -4510,7 +5112,7 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb } while (prev == GCB_Extend); - return prev != GCB_XPG_XX; + return prev != GCB_ExtPict_XX; } default: @@ -4528,7 +5130,6 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb STATIC GCB_enum S_backup_one_GCB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; GCB_enum gcb; PERL_ARGS_ASSERT_BACKUP_ONE_GCB; @@ -4806,7 +5407,6 @@ S_isLB(pTHX_ LB_enum before, STATIC LB_enum S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) { - dVAR; LB_enum lb; @@ -4837,7 +5437,6 @@ S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta STATIC LB_enum S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; LB_enum lb; PERL_ARGS_ASSERT_BACKUP_ONE_LB; @@ -4885,7 +5484,7 @@ S_isSB(pTHX_ SB_enum before, const bool utf8_target) { /* returns a boolean indicating if there is a Sentence Boundary Break - * between the inputs. See http://www.unicode.org/reports/tr29/ */ + * between the inputs. 
See https://www.unicode.org/reports/tr29/ */ U8 * lpos = (U8 *) curpos; bool has_para_sep = FALSE; @@ -5074,7 +5673,6 @@ S_isSB(pTHX_ SB_enum before, STATIC SB_enum S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) { - dVAR; SB_enum sb; PERL_ARGS_ASSERT_ADVANCE_ONE_SB; @@ -5108,7 +5706,6 @@ S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta STATIC SB_enum S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; SB_enum sb; PERL_ARGS_ASSERT_BACKUP_ONE_SB; @@ -5345,7 +5942,6 @@ S_advance_one_WB(pTHX_ U8 ** curpos, const bool utf8_target, const bool skip_Extend_Format) { - dVAR; WB_enum wb; PERL_ARGS_ASSERT_ADVANCE_ONE_WB; @@ -5383,7 +5979,6 @@ S_advance_one_WB(pTHX_ U8 ** curpos, STATIC WB_enum S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target) { - dVAR; WB_enum wb; PERL_ARGS_ASSERT_BACKUP_ONE_WB; @@ -5459,10 +6054,10 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, /* Macros for regmatch(), using its internal variables */ #define NEXTCHR_EOS -10 /* nextchr has fallen off the end */ -#define NEXTCHR_IS_EOS (nextchr < 0) +#define NEXTCHR_IS_EOS (nextbyte < 0) #define SET_nextchr \ - nextchr = ((locinput < reginfo->strend) ? UCHARAT(locinput) : NEXTCHR_EOS) + nextbyte = ((locinput < reginfo->strend) ? 
UCHARAT(locinput) : NEXTCHR_EOS) #define SET_locinput(p) \ locinput = (p); \ @@ -5503,18 +6098,20 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, /* push a new state then goto it */ -#define PUSH_STATE_GOTO(state, node, input, eol) \ +#define PUSH_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_state; /* push a new state with success backtracking, then goto it */ -#define PUSH_YES_STATE_GOTO(state, node, input, eol) \ +#define PUSH_YES_STATE_GOTO(state, node, input, eol, sr0) \ pushinput = input; \ pusheol = eol; \ + pushsr0 = sr0; \ scan = node; \ st->resume_state = state; \ goto push_yes_state; @@ -5570,7 +6167,7 @@ the subpattern to be matched possibly multiple times, while B is the entire rest of the pattern. Variable and state names reflect this convention. The states in the main switch are the union of ops and failure/success of -substates associated with with that op. For example, IFMATCH is the op +substates associated with that op. For example, IFMATCH is the op that does lookahead assertions /(?=A)B/ and so the IFMATCH state means 'execute IFMATCH'; while IFMATCH_A is a state saying that we have just successfully matched A and IFMATCH_A_fail is a state saying that we have @@ -5680,7 +6277,6 @@ bounds of our window into the string. 
STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { - dVAR; const bool utf8_target = reginfo->is_utf8_target; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; @@ -5698,7 +6294,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) char *loceol = reginfo->strend; char *pushinput; /* where to continue after a PUSH */ char *pusheol; /* where to stop matching (loceol) after a PUSH */ - I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ + U8 *pushsr0; /* save starting pos of script run */ + PERL_INT_FAST16_T nextbyte; /* is always set to UCHARAT(locinput), or -1 + at EOS */ bool result = 0; /* return value of S_regmatch */ U32 depth = 0; /* depth of backtrack stack */ @@ -5708,7 +6306,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) 3 * rex->nparens : MAX_RECURSE_EVAL_NOCHANGE_DEPTH; regmatch_state *yes_state = NULL; /* state to pop to on success of subpattern */ - /* mark_state piggy backs on the yes_state logic so that when we unwind + /* mark_state piggy backs on the yes_state logic so that when we unwind the stack on success we can update the mark_state as we go */ regmatch_state *mark_state = NULL; /* last mark state we have seen */ regmatch_state *cur_eval = NULL; /* most recent EVAL_AB state */ @@ -5719,7 +6317,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) char *startpoint = locinput; SV *popmark = NULL; /* are we looking for a mark? */ SV *sv_commit = NULL; /* last mark name seen in failure */ - SV *sv_yes_mark = NULL; /* last mark name we have seen + SV *sv_yes_mark = NULL; /* last mark name we have seen during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 
1 : 0; @@ -5759,7 +6357,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) #endif #ifdef DEBUGGING - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; #endif /* protect against undef(*^R) */ @@ -5773,7 +6371,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) st = PL_regmatch_state; - /* Note that nextchr is a byte even in UTF */ + /* Note that nextbyte is a byte even in UTF */ SET_nextchr; scan = prog; @@ -5809,7 +6407,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) to_complement = 0; SET_nextchr; - assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); + assert(nextbyte < 256 && (nextbyte >= 0 || nextbyte == NEXTCHR_EOS)); switch (state_num) { case SBOL: /* /^../ and /\A../ */ @@ -5834,7 +6432,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* update the startpoint */ st->u.keeper.val = rex->offs[0].start; rex->offs[0].start = locinput - reginfo->strbeg; - PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol); + PUSH_STATE_GOTO(KEEPS_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case KEEPS_next_fail: @@ -5844,12 +6443,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) NOT_REACHED; /* NOTREACHED */ case MEOL: /* /..$/m */ - if (!NEXTCHR_IS_EOS && nextchr != '\n') + if (!NEXTCHR_IS_EOS && nextbyte != '\n') sayNO; break; case SEOL: /* /..$/ */ - if (!NEXTCHR_IS_EOS && nextchr != '\n') + if (!NEXTCHR_IS_EOS && nextbyte != '\n') sayNO; if (reginfo->strend - locinput > 1) sayNO; @@ -5868,7 +6467,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case REG_ANY: /* /./ */ if ( NEXTCHR_IS_EOS || locinput >= loceol - || nextchr == '\n') + || nextbyte == '\n') { sayNO; } @@ -5879,11 +6478,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) #define ST st->u.trie case TRIEC: /* (ab|cd) with known charclass */ /* In this case the charclass 
data is available inline so - we can fail fast without a lot of extra overhead. + we can fail fast without a lot of extra overhead. */ if ( ! NEXTCHR_IS_EOS && locinput < loceol - && ! ANYOF_BITMAP_TEST(scan, nextchr)) + && ! ANYOF_BITMAP_TEST(scan, nextbyte)) { DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "%sTRIE: failed to match trie start class...%s\n", @@ -5952,7 +6551,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target && ! NEXTCHR_IS_EOS - && UTF8_IS_ABOVE_LATIN1(nextchr) + && UTF8_IS_ABOVE_LATIN1(nextbyte) && scan->flags == EXACTL) { /* We only output for EXACTL, as we let the folder @@ -5965,7 +6564,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if ( trie->bitmap && ( NEXTCHR_IS_EOS || locinput >= loceol - || ! TRIE_BITMAP_TEST(trie, nextchr))) + || ! TRIE_BITMAP_TEST(trie, nextbyte))) { if (trie->states[ state ].wordnum) { DEBUG_EXECUTE_r( @@ -5983,7 +6582,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } } - { + { U8 *uc = ( U8* )locinput; STRLEN len = 0; @@ -6210,7 +6809,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) }); if ( ST.accepted > 1 || has_cutgroup || ST.jump ) { - PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol); + PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } /* only one choice left - just continue */ @@ -6227,7 +6827,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, PL_colors[0], PL_colors[1], (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII - ) + ) : "not compiled under -Dr", PL_colors[5] ); }); @@ -6238,6 +6838,20 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } #undef ST + case LEXACT_REQ8: + if (! 
utf8_target) { + sayNO; + } + /* FALLTHROUGH */ + + case LEXACT: + { + char *s; + + s = STRINGl(scan); + ln = STR_LENl(scan); + goto join_short_long_exact; + case EXACTL: /* /abc/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; @@ -6251,16 +6865,18 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); } goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: if (! utf8_target) { sayNO; } /* FALLTHROUGH */ - case EXACT: { /* /abc/ */ - char *s; + + case EXACT: /* /abc/ */ do_exact: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); + + join_short_long_exact: if (utf8_target != is_utf8_pat) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; @@ -6328,7 +6944,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* The target and the pattern have the same utf8ness. */ /* Inline the first character, for speed. */ if ( loceol - locinput < ln - || UCHARAT(s) != nextchr + || UCHARAT(s) != nextbyte || (ln > 1 && memNE(s, locinput, ln))) { sayNO; @@ -6363,7 +6979,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_array = PL_fold_latin1; goto do_exactf; - case EXACTFU_ONLY8: /* /abc/iu with something in /abc/ > 255 */ + case EXACTFU_REQ8: /* /abc/iu with something in /abc/ > 255 */ if (! 
utf8_target) { sayNO; } @@ -6412,8 +7028,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) fold_utf8_flags = 0; do_exactf: - s = STRING(scan); - ln = STR_LEN(scan); + s = STRINGs(scan); + ln = STR_LENs(scan); if ( utf8_target || is_utf8_pat @@ -6435,9 +7051,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } /* Neither the target nor the pattern are utf8 */ - if (UCHARAT(s) != nextchr + if (UCHARAT(s) != nextbyte && !NEXTCHR_IS_EOS - && UCHARAT(s) != fold_array[nextchr]) + && UCHARAT(s) != fold_array[nextbyte]) { sayNO; } @@ -6459,10 +7075,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (FLAGS(scan) != TRADITIONAL_BOUND) { - if (! IN_UTF8_CTYPE_LOCALE) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), - B_ON_NON_UTF8_LOCALE_IS_WRONG); - } + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_BOUND; goto boundu; } @@ -6470,9 +7083,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (locinput == reginfo->strbeg) b1 = isWORDCHAR_LC('\n'); else { - b1 = isWORDCHAR_LC_utf8_safe(reghop3((U8*)locinput, -1, - (U8*)(reginfo->strbeg)), - (U8*)(reginfo->strend)); + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + b1 = isWORDCHAR_LC_utf8_safe(p, (U8*)(reginfo->strend)); } b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') @@ -6485,7 +7098,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) : isWORDCHAR_LC(UCHARAT(locinput - 1)); b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') - : isWORDCHAR_LC(nextchr); + : isWORDCHAR_LC(nextbyte); } if (to_complement ^ (b1 == b2)) { sayNO; @@ -6526,7 +7139,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) : isWORDCHAR_A(UCHARAT(locinput - 1)); b2 = (NEXTCHR_IS_EOS) ? 
isWORDCHAR_A('\n') - : isWORDCHAR_A(nextchr); + : isWORDCHAR_A(nextbyte); if (to_complement ^ (b1 == b2)) { sayNO; } @@ -6549,13 +7162,15 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case TRADITIONAL_BOUND: { bool b1, b2; - b1 = (locinput == reginfo->strbeg) - ? 0 /* isWORDCHAR_L1('\n') */ - : isWORDCHAR_utf8_safe( - reghop3((U8*)locinput, - -1, - (U8*)(reginfo->strbeg)), - (U8*) reginfo->strend); + if (locinput == reginfo->strbeg) { + b1 = 0 /* isWORDCHAR_L1('\n') */; + } + else { + U8 *p = reghop3((U8*)locinput, -1, + (U8*)(reginfo->strbeg)); + + b1 = isWORDCHAR_utf8_safe(p, (U8*) reginfo->strend); + } b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_utf8_safe((U8*)locinput, @@ -6656,7 +7271,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) : isWORDCHAR_L1(UCHARAT(locinput - 1)); b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ - : isWORDCHAR_L1(nextchr); + : isWORDCHAR_L1(nextbyte); match = cBOOL(b1 != b2); break; } @@ -6729,11 +7344,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFPOSIXL: case ANYOFL: /* /[abc]/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(scan); - if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE) - { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); - } /* FALLTHROUGH */ case ANYOFD: /* /[abc]/d */ case ANYOF: /* /[abc]/ */ @@ -6780,8 +7392,46 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFH: if ( ! utf8_target || NEXTCHR_IS_EOS - || ( ANYOF_FLAGS(scan) != 0 - && ANYOF_FLAGS(scan) != (U8) *locinput) + || ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHb: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ANYOF_FLAGS(scan) != (U8) *locinput + || ! 
reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHr: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || ! inRANGE((U8) NATIVE_UTF8_TO_I8(*locinput), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(scan))) + || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, + utf8_target)) + { + sayNO; + } + goto increment_locinput; + break; + + case ANYOFHs: + if ( ! utf8_target + || NEXTCHR_IS_EOS + || loceol - locinput < FLAGS(scan) + || memNE(locinput, ((struct regnode_anyofhs *) scan)->string, FLAGS(scan)) || ! reginclass(rex, scan, (U8*)locinput, (U8*) loceol, utf8_target)) { @@ -6790,6 +7440,56 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto increment_locinput; break; + case ANYOFR: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) > NATIVE_UTF8_TO_I8(*locinput) + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + + case ANYOFRb: + if (NEXTCHR_IS_EOS) { + sayNO; + } + + if (utf8_target) { + if ( ANYOF_FLAGS(scan) != (U8) *locinput + || ! withinCOUNT(utf8_to_uvchr_buf((U8 *) locinput, + (U8 *) reginfo->strend, + NULL), + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + else { + if (! withinCOUNT((U8) *locinput, + ANYOFRbase(scan), ANYOFRdelta(scan))) + { + sayNO; + } + } + goto increment_locinput; + break; + /* The argument (FLAGS) to all the POSIX node types is the class number * */ @@ -6805,8 +7505,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Use isFOO_lc() for characters within Latin1. 
(Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ - if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { - if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextchr)))) { + if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { + if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), (U8) nextbyte)))) { sayNO; } @@ -6824,7 +7524,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Here is a UTF-8 variant code point below 256 and the target is * UTF-8 */ if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - EIGHT_BIT_UTF8_TO_NATIVE(nextchr, + EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, *(locinput + 1)))))) { sayNO; @@ -6849,7 +7549,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } /* All UTF-8 variants match */ - if (! UTF8_IS_INVARIANT(nextchr)) { + if (! UTF8_IS_INVARIANT(nextbyte)) { goto increment_locinput; } @@ -6869,7 +7569,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) join_nposixa: - if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + if (! (to_complement ^ cBOOL(_generic_isCC_A(nextbyte, FLAGS(scan))))) { sayNO; @@ -6893,8 +7593,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Use _generic_isCC() for characters within Latin1. (Note that * UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else * wouldn't be invariant) */ - if (UTF8_IS_INVARIANT(nextchr) || ! utf8_target) { - if (! (to_complement ^ cBOOL(_generic_isCC(nextchr, + if (UTF8_IS_INVARIANT(nextbyte) || ! utf8_target) { + if (! (to_complement ^ cBOOL(_generic_isCC(nextbyte, FLAGS(scan))))) { sayNO; @@ -6903,7 +7603,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { if (! 
(to_complement - ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr, + ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextbyte, *(locinput + 1)), FLAGS(scan))))) { @@ -6974,7 +7674,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Match either CR LF or '.', as all the other possibilities * require utf8 */ locinput++; /* Match the . or CR */ - if (nextchr == '\r' /* And if it was CR, and the next is LF, + if (nextbyte == '\r' /* And if it was CR, and the next is LF, match the LF */ && locinput < loceol && UCHARAT(locinput) == '\n') @@ -7010,8 +7710,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; - - case NREFFL: /* /\g{name}/il */ + + case REFFLN: /* /\g{name}/il */ { /* The capture buffer cases. The ones beginning with N for the named buffers just convert to the equivalent numbered and pretend they were called as the corresponding numbered buffer @@ -7031,28 +7731,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) utf8_fold_flags = FOLDEQ_LOCALE; goto do_nref; - case NREFFA: /* /\g{name}/iaa */ + case REFFAN: /* /\g{name}/iaa */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFA; utf8_fold_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_nref; - case NREFFU: /* /\g{name}/iu */ + case REFFUN: /* /\g{name}/iu */ folder = foldEQ_latin1; fold_array = PL_fold_latin1; type = REFFU; utf8_fold_flags = 0; goto do_nref; - case NREFF: /* /\g{name}/i */ + case REFFN: /* /\g{name}/i */ folder = foldEQ; fold_array = PL_fold; type = REFF; utf8_fold_flags = 0; goto do_nref; - case NREF: /* /\g{name}/ */ + case REFN: /* /\g{name}/ */ type = REF; folder = NULL; fold_array = NULL; @@ -7134,9 +7834,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* Not utf8: Inline the first character, for speed. */ if ( ! 
NEXTCHR_IS_EOS && locinput < loceol - && UCHARAT(s) != nextchr + && UCHARAT(s) != nextbyte && ( type == REF - || UCHARAT(s) != fold_array[nextchr])) + || UCHARAT(s) != fold_array[nextbyte])) { sayNO; } @@ -7173,7 +7873,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) arg= (U32)ARG(scan); if (cur_eval && cur_eval->locinput == locinput) { if ( ++nochange_depth > max_nochange_depth ) - Perl_croak(aTHX_ + Perl_croak(aTHX_ "Pattern subroutine nesting without pos change" " exceeded limit in regex"); } else { @@ -7201,7 +7901,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) rex->recurse_locinput[arg]= locinput; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_STACK_r({ Perl_re_exec_indentf( aTHX_ "entering GOSUB, prev_recurse_locinput=%p recurse_locinput[%d]=%p\n", @@ -7220,12 +7920,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* NOTREACHED */ case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ - if (cur_eval && cur_eval->locinput==locinput) { + if (logical == 2 && cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); } else { nochange_depth = 0; - } + } { /* execute the code in the {...} */ @@ -7448,7 +8148,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) PL_curpm = PL_reg_curpm; if (logical != 2) { - PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol); + PUSH_STATE_GOTO(EVAL_B, next, locinput, loceol, + script_run_begin); /* NOTREACHED */ } } @@ -7508,7 +8209,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) DEBUG_EXECUTE_r( debug_start_match(re_sv, utf8_target, locinput, reginfo->strend, "EVAL/GOSUB: Matching embedded"); - ); + ); startpoint = rei->program + 1; EVAL_CLOSE_PAREN_CLEAR(st); /* ST.close_paren = 0; * close_paren only for GOSUB */ @@ -7548,7 +8249,8 @@ S_regmatch(pTHX_ 
regmatch_info *reginfo, char *startpos, regnode *prog) ST.prev_eval = cur_eval; cur_eval = st; /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, loceol); + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput, + loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7619,7 +8321,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) is_utf8_pat = reginfo->is_utf8_pat = cBOOL(RX_UTF8(rex_sv)); SET_reg_curpm(rex_sv); rex = ReANY(rex_sv); - rexi = RXi_GET(rex); + rexi = RXi_GET(rex); REGCP_UNWIND(ST.lastcp); regcppop(rex, &maxopenparen); @@ -7682,8 +8384,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (ARG2L(scan)){ regnode *cursor; for (cursor=scan; - cursor && OP(cursor)!=END; - cursor=regnext(cursor)) + cursor && OP(cursor)!=END; + cursor=regnext(cursor)) { if ( OP(cursor)==CLOSE ){ n = ARG(cursor); @@ -7704,7 +8406,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sw = cBOOL(rex->lastparen >= n && rex->offs[n].end != -1); break; - case NGROUPP: /* (?()) */ + case GROUPPN: /* (?()) */ /* reg_check_named_buff_matched returns 0 for no match */ sw = cBOOL(0 < reg_check_named_buff_matched(rex,scan)); break; @@ -7802,19 +8504,19 @@ I and O refer to inner and outer, C and W refer to CURLYX and WHILEM: cur_ curlyx backtrack stack ------ --------------- -NULL +NULL CO -CI ai -CO ai bi +CI ai +CO ai bi NULL ai bi bo At this point the pattern succeeds, and we work back down the stack to clean up, restoring as we go: -CO ai bi -CI ai +CO ai bi +CI ai CO -NULL +NULL *******************************************************************/ @@ -7824,7 +8526,7 @@ NULL { /* No need to save/restore up to this paren */ I32 parenfloor = scan->flags; - + assert(next); /* keep Coverity happy */ if (OP(PREVOPER(next)) == NOTHING) /* LONGJMP */ next += ARG(next); @@ -7848,7 +8550,8 @@ NULL ST.count = -1; /* this will be updated by 
WHILEM */ ST.lastloc = NULL; /* this will be updated by WHILEM */ - PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol); + PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -7883,7 +8586,7 @@ NULL ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; ST.cache_offset = 0; ST.cache_mask = 0; - + DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "WHILEM: matched %ld out of %d..%d\n", depth, (long)n, min, max) @@ -7896,7 +8599,8 @@ NULL cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -8004,7 +8708,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ } @@ -8015,7 +8719,8 @@ NULL maxopenparen); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol); + PUSH_STATE_GOTO(WHILEM_A_max, A, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ } goto do_whilem_B_max; @@ -8067,7 +8772,7 @@ NULL ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ @@ -8098,7 +8803,7 @@ NULL REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, - locinput, loceol); + locinput, loceol, script_run_begin); NOT_REACHED; /* NOTREACHED */ #undef ST @@ -8120,9 +8825,11 @@ NULL /* Now go into the branch */ if (has_cutgroup) { - PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + 
PUSH_YES_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } else { - PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol); + PUSH_STATE_GOTO(BRANCH_next, scan, locinput, loceol, + script_run_begin); } NOT_REACHED; /* NOTREACHED */ @@ -8130,7 +8837,8 @@ NULL sv_yes_mark = st->u.mark.mark_name = scan->flags ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) : NULL; - PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol); + PUSH_STATE_GOTO(CUTGROUP_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CUTGROUP_next_fail: @@ -8138,7 +8846,7 @@ NULL no_final = 1; if (st->u.mark.mark_name) sv_commit = st->u.mark.mark_name; - sayNO; + sayNO; NOT_REACHED; /* NOTREACHED */ case BRANCH_next: @@ -8165,7 +8873,7 @@ NULL } continue; /* execute next BRANCH[J] op */ /* NOTREACHED */ - + case MINMOD: /* next op will be non-greedy, e.g. A*? */ minmod = 1; break; @@ -8200,14 +8908,15 @@ NULL ST.count = 0; ST.minmod = minmod; minmod = 0; - ST.c1 = CHRTEST_UNINIT; + ST.Binfo.count = -1; REGCP_SET(ST.cp); if (!(ST.minmod ? ARG1(ST.me) : ARG2(ST.me))) /* min/max */ goto curlym_do_B; curlym_do_A: /* execute the A in /A{m,n}B/ */ - PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol); /* match A */ + PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput, loceol, /* match A */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_A: /* we've just matched an A */ @@ -8234,7 +8943,7 @@ NULL if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) goto fake_end; - + { I32 max = (ST.minmod ? 
ARG1(ST.me) : ARG2(ST.me)); if ( max == REG_INFTY || ST.count < max ) @@ -8246,24 +8955,21 @@ NULL REGCP_UNWIND(ST.cp); - if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ + if (ST.minmod || ST.count < ARG1(ST.me) /* min*/ || EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) sayNO; curlym_do_B: /* execute the B in /A{m,n}B/ */ - if (ST.c1 == CHRTEST_UNINIT) { - /* calculate c1 and c2 for possible match of 1st char - * following curly */ - ST.c1 = ST.c2 = CHRTEST_VOID; + if (ST.Binfo.count < 0) { + /* calculate possible match of 1st char following curly */ assert(ST.B); if (HAS_TEXT(ST.B) || JUMPABLE(ST.B)) { regnode *text_node = ST.B; if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); if (PL_regkind[OP(text_node)] == EXACT) { - if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ - text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, - reginfo)) + if (! S_setup_EXACTISH_ST(aTHX_ text_node, + &ST.Binfo, reginfo)) { sayNO; } @@ -8274,37 +8980,21 @@ NULL DEBUG_EXECUTE_r( Perl_re_exec_indentf( aTHX_ "CURLYM trying tail with matches=%" IVdf "...\n", depth, (IV)ST.count) - ); - if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { - if (! UTF8_IS_INVARIANT(nextchr) && utf8_target) { - - /* (We can use memEQ and memNE in this file without - * having to worry about one being shorter than the - * other, since the first byte of each gives the - * length of the character) */ - if ( memNE(locinput, ST.c1_utf8, UTF8_SAFE_SKIP(locinput, - reginfo->strend)) - && memNE(locinput, ST.c2_utf8, UTF8_SAFE_SKIP(locinput, - reginfo->strend))) - { - /* simulate B failing */ - DEBUG_OPTIMISE_r( - Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%" UVXf " c1=0x%" UVXf " c2=0x%" UVXf "\n", - depth, - valid_utf8_to_uvchr((U8 *) locinput, NULL), - valid_utf8_to_uvchr(ST.c1_utf8, NULL), - valid_utf8_to_uvchr(ST.c2_utf8, NULL)) - ); - state_num = CURLYM_B_fail; - goto reenter_switch; - } - } - else if (nextchr != ST.c1 && nextchr != ST.c2) { - /* simulate B failing */ + ); + if (! 
NEXTCHR_IS_EOS && ST.Binfo.count >= 0) { + assert(ST.Binfo.count > 0); + + /* Do a quick test to hopefully rule out most non-matches */ + if ( locinput + ST.Binfo.min_length > loceol + || ! S_test_EXACTISH_ST(locinput, ST.Binfo)) + { DEBUG_OPTIMISE_r( - Perl_re_exec_indentf( aTHX_ "CURLYM Fast bail next target=0x%X c1=0x%X c2=0x%X\n", + Perl_re_exec_indentf( aTHX_ + "CURLYM Fast bail next target=0x%X anded==0x%X" + " mask=0x%X\n", depth, - (int) nextchr, ST.c1, ST.c2) + (int) nextbyte, ST.Binfo.first_byte_anded, + ST.Binfo.first_byte_mask) ); state_num = CURLYM_B_fail; goto reenter_switch; @@ -8324,14 +9014,15 @@ NULL if (EVAL_CLOSE_PAREN_IS_TRUE(cur_eval,(U32)ST.me->flags)) { - if (ST.count) + if (ST.count) goto fake_end; else sayNO; } } - - PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol); /* match B */ + + PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput, loceol, /* match B */ + script_run_begin); NOT_REACHED; /* NOTREACHED */ case CURLYM_B_fail: /* just failed to match a B */ @@ -8419,24 +9110,23 @@ NULL assert(ST.min <= ST.max); if (! HAS_TEXT(next) && ! JUMPABLE(next)) { - ST.c1 = ST.c2 = CHRTEST_VOID; + ST.Binfo.count = 0; } else { regnode *text_node = next; - if (! HAS_TEXT(text_node)) + if (! HAS_TEXT(text_node)) FIND_NEXT_IMPT(text_node); if (! HAS_TEXT(text_node)) - ST.c1 = ST.c2 = CHRTEST_VOID; + ST.Binfo.count = 0; else { if ( PL_regkind[OP(text_node)] != EXACT ) { - ST.c1 = ST.c2 = CHRTEST_VOID; + ST.Binfo.count = 0; } else { - if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ - text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8, - reginfo)) + if (! 
S_setup_EXACTISH_ST(aTHX_ text_node, + &ST.Binfo, reginfo)) { sayNO; } @@ -8456,13 +9146,15 @@ NULL SET_locinput(li); ST.count = ST.min; REGCP_SET(ST.cp); - if (ST.c1 == CHRTEST_VOID) - goto curly_try_B_min; + + if (ST.Binfo.count <= 0) + goto curly_try_B_min; ST.oldloc = locinput; /* set ST.maxpos to the furthest point along the - * string that could possibly match */ + * string that could possibly match, i.e., that a match could + * start at. */ if (ST.max == REG_INFTY) { ST.maxpos = loceol - 1; if (utf8_target) @@ -8509,15 +9201,14 @@ NULL NOT_REACHED; /* NOTREACHED */ case CURLY_B_min_fail: - /* failed to find B in a non-greedy match. - * Handles both cases where c1,c2 valid or not */ + /* failed to find B in a non-greedy match. */ REGCP_UNWIND(ST.cp); if (ST.paren) { UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); } - if (ST.c1 == CHRTEST_VOID) { + if (ST.Binfo.count == 0) { /* failed -- move forward one */ char *li = locinput; if (!regrepeat(rex, &li, ST.A, loceol, reginfo, 1)) { @@ -8543,84 +9234,78 @@ NULL curly_try_B_min_known: /* find the next place where 'B' could work, then call B */ - if (utf8_target) { - n = (ST.oldloc == locinput) ? 0 : 1; - if (ST.c1 == ST.c2) { - /* set n to utf8_distance(oldloc, locinput) */ - while ( locinput <= ST.maxpos - && locinput < loceol - && memNE(locinput, ST.c1_utf8, - UTF8_SAFE_SKIP(locinput, reginfo->strend))) - { - locinput += UTF8_SAFE_SKIP(locinput, - reginfo->strend); - n++; - } - } - else { - /* set n to utf8_distance(oldloc, locinput) */ - while ( locinput <= ST.maxpos - && locinput < loceol - && memNE(locinput, ST.c1_utf8, - UTF8_SAFE_SKIP(locinput, reginfo->strend)) - && memNE(locinput, ST.c2_utf8, - UTF8_SAFE_SKIP(locinput, reginfo->strend))) - { - locinput += UTF8_SAFE_SKIP(locinput, reginfo->strend); - n++; - } - } - } - else { /* Not utf8_target */ - if (ST.c1 == ST.c2) { - locinput = (char *) memchr(locinput, - ST.c1, - ST.maxpos + 1 - locinput); - if (! 
locinput) { - locinput = ST.maxpos + 1; + if (locinput + ST.Binfo.initial_exact < loceol) { + if (ST.Binfo.initial_exact >= ST.Binfo.max_length) { + + /* Here, the mask is all 1's for the entire length of + * any possible match. (That actually means that there + * is only one possible match.) Look for the next + * occurrence */ + locinput = ninstr(locinput, loceol, + (char *) ST.Binfo.matches, + (char *) ST.Binfo.matches + + ST.Binfo.initial_exact); + if (locinput == NULL) { + sayNO; } - } - else { - U8 c1_c2_bits_differing = ST.c1 ^ ST.c2; - - if (! isPOWER_OF_2(c1_c2_bits_differing)) { - while ( locinput <= ST.maxpos - && UCHARAT(locinput) != ST.c1 - && UCHARAT(locinput) != ST.c2) - { - locinput++; - } + } + else do { + /* If the first byte(s) of the mask are all ones, it + * means those bytes must match identically, so can use + * ninstr() to find the next possible matchpoint */ + if (ST.Binfo.initial_exact > 0) { + locinput = ninstr(locinput, loceol, + (char *) ST.Binfo.matches, + (char *) ST.Binfo.matches + + ST.Binfo.initial_exact); } - else { - /* If c1 and c2 only differ by a single bit, we can - * avoid a conditional each time through the loop, - * at the expense of a little preliminary setup and - * an extra mask each iteration. By masking out - * that bit, we match exactly two characters, c1 - * and c2, and so we don't have to test for both. - * On both ASCII and EBCDIC platforms, most of the - * ASCII-range and Latin1-range folded equivalents - * differ only in a single bit, so this is actually - * the most common case. (e.g. 'A' 0x41 vs 'a' - * 0x61). 
*/ - U8 c1_masked = ST.c1 &~ c1_c2_bits_differing; - U8 c1_c2_mask = ~ c1_c2_bits_differing; - while ( locinput <= ST.maxpos - && (UCHARAT(locinput) & c1_c2_mask) - != c1_masked) - { - locinput++; + else { /* Otherwise find the next byte that matches, + masked */ + locinput = (char *) find_next_masked( + (U8 *) locinput, (U8 *) loceol, + ST.Binfo.first_byte_anded, + ST.Binfo.first_byte_mask); + /* Advance to the end of a multi-byte character */ + if (utf8_target) { + while ( locinput < loceol + && UTF8_IS_CONTINUATION(*locinput)) + { + locinput++; + } } } - } - n = locinput - ST.oldloc; - } + if ( locinput == NULL + || locinput + ST.Binfo.min_length > loceol) + { + sayNO; + } + + /* Here, we have found a possible match point; if can't + * rule it out, quit the loop so can check fully */ + if (S_test_EXACTISH_ST(locinput, ST.Binfo)) { + break; + } + + locinput += (utf8_target) ? UTF8SKIP(locinput) : 1; + + } while (locinput <= ST.maxpos); + } + if (locinput > ST.maxpos) sayNO; + + n = (utf8_target) + ? utf8_length((U8 *) ST.oldloc, (U8 *) locinput) + : (STRLEN) (locinput - ST.oldloc); + + + /* Here is at the beginning of a character that meets the mask + * criteria. Need to make sure that some real possibility */ + if (n) { /* In /a{m,n}b/, ST.oldloc is at "a" x m, locinput is - * at b; check that everything between oldloc and - * locinput matches */ + * at what may be the beginning of b; check that everything + * between oldloc and locinput matches */ char *li = ST.oldloc; ST.count += n; if (regrepeat(rex, &li, ST.A, loceol, reginfo, n) < n) @@ -8631,37 +9316,23 @@ NULL curly_try_B_min: CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol); + PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ curly_try_B_max: /* a successful greedy match: now try to match B */ - { - bool could_match = locinput < loceol; - - /* If it could work, try it. 
*/ - if (ST.c1 != CHRTEST_VOID && could_match) { - if (! UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) - { - could_match = memEQ(locinput, ST.c1_utf8, - UTF8_SAFE_SKIP(locinput, - reginfo->strend)) - || memEQ(locinput, ST.c2_utf8, - UTF8_SAFE_SKIP(locinput, - reginfo->strend)); - } - else { - could_match = UCHARAT(locinput) == ST.c1 - || UCHARAT(locinput) == ST.c2; - } - } - if (ST.c1 == CHRTEST_VOID || could_match) { - CURLY_SETPAREN(ST.paren, ST.count); - PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol); - NOT_REACHED; /* NOTREACHED */ - } - } + if ( ST.Binfo.count <= 0 + || ( ST.Binfo.count > 0 + && locinput + ST.Binfo.min_length <= loceol + && S_test_EXACTISH_ST(locinput, ST.Binfo))) + { + CURLY_SETPAREN(ST.paren, ST.count); + PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput, loceol, + script_run_begin); + NOT_REACHED; /* NOTREACHED */ + } /* FALLTHROUGH */ case CURLY_B_max_fail: @@ -8713,8 +9384,9 @@ NULL SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); - PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B, - locinput, loceol); /* match B */ + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, /* match B */ + st->u.eval.prev_eval->u.eval.B, + locinput, loceol, script_run_begin); } if (locinput < reginfo->till) { @@ -8724,7 +9396,7 @@ NULL (long)(locinput - startpos), (long)(reginfo->till - startpos), PL_colors[5])); - + sayNO_SILENT; /* Cannot match: too short. */ } sayYES; /* Success! */ @@ -8743,7 +9415,7 @@ NULL ST.start = locinput; ST.end = loceol; ST.count = 1; - goto do_ifmatch; + goto do_ifmatch; case UNLESSM: /* -ve lookaround: (?!A), or with 'flags', (?...) 
*/ locinput = st->locinput; loceol = st->loceol; + script_run_begin = st->sr0; } scan = ST.me + ARG(ST.me); if (scan == ST.me) @@ -8857,12 +9531,13 @@ NULL case PRUNE: /* (*PRUNE) */ if (scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); - PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol); + PUSH_STATE_GOTO(COMMIT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case COMMIT_next_fail: - no_final = 1; - /* FALLTHROUGH */ + no_final = 1; + /* FALLTHROUGH */ sayNO; NOT_REACHED; /* NOTREACHED */ @@ -8883,11 +9558,12 @@ NULL #define ST st->u.mark case MARKPOINT: /* (*MARK:foo) */ ST.prev_mark = mark_state; - ST.mark_name = sv_commit = sv_yes_mark + ST.mark_name = sv_commit = sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); mark_state = st; ST.mark_loc = locinput; - PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol); + PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput, loceol, + script_run_begin); NOT_REACHED; /* NOTREACHED */ case MARKPOINT_next: @@ -8896,7 +9572,7 @@ NULL NOT_REACHED; /* NOTREACHED */ case MARKPOINT_next_fail: - if (popmark && sv_eq(ST.mark_name,popmark)) + if (popmark && sv_eq(ST.mark_name,popmark)) { if (ST.mark_loc > startpoint) reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); @@ -8910,7 +9586,7 @@ NULL }); } mark_state = ST.prev_mark; - sv_yes_mark = mark_state ? + sv_yes_mark = mark_state ? mark_state->u.mark.mark_name : NULL; sayNO; NOT_REACHED; /* NOTREACHED */ @@ -8920,43 +9596,45 @@ NULL /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; - PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol); + PUSH_STATE_GOTO(SKIP_next,next, locinput, loceol, + script_run_begin); } else { - /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, - otherwise do nothing. Meaning we need to scan + /* (*SKIP:NAME) : if there is a (*MARK:NAME) fail where it was, + otherwise do nothing. 
Meaning we need to scan */ regmatch_state *cur = mark_state; SV *find = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); - + while (cur) { - if ( sv_eq( cur->u.mark.mark_name, - find ) ) + if ( sv_eq( cur->u.mark.mark_name, + find ) ) { ST.mark_name = find; - PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol); + PUSH_STATE_GOTO( SKIP_next, next, locinput, loceol, + script_run_begin); } cur = cur->u.mark.prev_mark; } - } + } /* Didn't find our (*MARK:NAME) so ignore this (*SKIP:NAME) */ - break; + break; case SKIP_next_fail: if (ST.mark_name) { - /* (*CUT:NAME) - Set up to search for the name as we + /* (*CUT:NAME) - Set up to search for the name as we collapse the stack*/ - popmark = ST.mark_name; + popmark = ST.mark_name; } else { /* (*CUT) - No name, we cut here.*/ if (ST.mark_loc > startpoint) reginfo->cutpoint = HOPBACKc(ST.mark_loc, 1); /* but we set sv_commit to latest mark_name if there is one so they can test to see how things lead to this - cut */ - if (mark_state) - sv_commit=mark_state->u.mark.mark_name; - } - no_final = 1; + cut */ + if (mark_state) + sv_commit=mark_state->u.mark.mark_name; + } + no_final = 1; sayNO; NOT_REACHED; /* NOTREACHED */ #undef ST @@ -8978,7 +9656,7 @@ NULL increment_locinput: assert(!NEXTCHR_IS_EOS); if (utf8_target) { - locinput += PL_utf8skip[nextchr]; + locinput += PL_utf8skip[nextbyte]; /* locinput is allowed to go 1 char off the end (signifying * EOS), but not 2+ */ if (locinput > loceol) @@ -8987,8 +9665,8 @@ NULL else locinput++; break; - - } /* end switch */ + + } /* end switch */ /* switch break jumps here */ scan = next; /* prepare to execute the next op and ... 
*/ @@ -9004,8 +9682,10 @@ NULL /* push a new regex state, then continue at scan */ { regmatch_state *newst; + DECLARE_AND_GET_RE_DEBUG_FLAGS; - DEBUG_STACK_r({ + DEBUG_r( /* DEBUG_STACK_r */ + if (DEBUG_v_TEST || RE_DEBUG_FLAG(RE_DEBUG_EXTRA_STACK)) { regmatch_state *cur = st; regmatch_state *curyes = yes_state; U32 i; @@ -9024,19 +9704,21 @@ NULL if (curyes == cur) curyes = cur->u.yes.prev_yes_state; } - } else + } else { DEBUG_STATE_pp("push") - ); + }); depth++; st->locinput = locinput; st->loceol = loceol; - newst = st+1; + st->sr0 = script_run_begin; + newst = st+1; if (newst > SLAB_LAST(PL_regmatch_slab)) newst = S_push_slab(aTHX); PL_regmatch_state = newst; locinput = pushinput; loceol = pusheol; + script_run_begin = pushsr0; st = newst; continue; /* NOTREACHED */ @@ -9067,7 +9749,7 @@ NULL } DEBUG_STATE_r({ if (no_final) { - DEBUG_STATE_pp("pop (no final)"); + DEBUG_STATE_pp("pop (no final)"); } else { DEBUG_STATE_pp("pop (yes)"); } @@ -9088,10 +9770,11 @@ NULL st = yes_state; yes_state = st->u.yes.prev_yes_state; PL_regmatch_state = st; - + if (no_final) { locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; } state_num = st->resume_state + no_final; goto reenter_switch; @@ -9132,7 +9815,7 @@ NULL } else { goto final_exit; } - } + } if (depth) { /* there's a previous state to backtrack to */ st--; @@ -9143,6 +9826,7 @@ NULL PL_regmatch_state = st; locinput= st->locinput; loceol= st->loceol; + script_run_begin = st->sr0; DEBUG_STATE_pp("pop"); depth--; @@ -9161,10 +9845,10 @@ NULL SV *sv_mrk = get_sv("REGMARK", 1); if (result) { sv_commit = &PL_sv_no; - if (!sv_yes_mark) + if (!sv_yes_mark) sv_yes_mark = &PL_sv_yes; } else { - if (!sv_commit) + if (!sv_commit) sv_commit = &PL_sv_yes; sv_yes_mark = &PL_sv_no; } @@ -9208,14 +9892,12 @@ STATIC I32 S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, char * loceol, regmatch_info *const reginfo, I32 max _pDEPTH) { - dVAR; char *scan; /* Pointer to current position in target 
string */ I32 c; char *this_eol = loceol; /* potentially adjusted version. */ I32 hardcount = 0; /* How many matches so far */ bool utf8_target = reginfo->is_utf8_target; unsigned int to_complement = 0; /* Invert the result? */ - UV utf8_flags = 0; _char_class_number classnum; PERL_ARGS_ASSERT_REGREPEAT; @@ -9233,22 +9915,22 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, this_eol = scan + max; /* Here, for the case of a non-UTF-8 target we have adjusted <this_eol> down - * to the maximum of how far we should go in it (leaving it set to the real - * end, if the maximum permissible would take us beyond that). This allows - * us to make the loop exit condition that we haven't gone past <this_eol> to - * also mean that we haven't exceeded the max permissible count, saving a - * test each time through the loops. But it assumes that the OP matches a - * single byte, which is true for most of the OPs below when applied to a - * non-UTF-8 target. Those relatively few OPs that don't have this - * characteristic will have to compensate. + * to the maximum of how far we should go in it (but leaving it set to the + * real end if the maximum permissible would take us beyond that). This + * allows us to make the loop exit condition that we haven't gone past + * <this_eol> to also mean that we haven't exceeded the max permissible + * count, saving a test each time through the loop. But it assumes that + * the OP matches a single byte, which is true for most of the OPs below + * when applied to a non-UTF-8 target. Those relatively few OPs that don't + * have this characteristic have to compensate. * - * There is no adjustment for UTF-8 targets, as the number of bytes per - * character varies. OPs will have to test both that the count is less - * than the max permissible (using <hardcount> to keep track), and that we - * are still within the bounds of the string (using <this_eol>. A few OPs - * match a single byte no matter what the encoding.
They can omit the max - * test if, for the UTF-8 case, they do the adjustment that was skipped - * above. + * There is no such adjustment for UTF-8 targets, since the number of bytes + * per character can vary. OPs will have to test both that the count is + * less than the max permissible (using <hardcount> to keep track), and + * that we are still within the bounds of the string (using <this_eol>. A + * few OPs match a single byte no matter what the encoding. They can omit + * the max test if, for the UTF-8 case, they do the adjustment that was + * skipped above. * * Thus, the code above sets things up for the common case; and exceptional * cases need extra work; the common case is to make sure <scan> doesn't @@ -9279,207 +9961,179 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, else scan = this_eol; break; + case EXACTL: - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol); } + /* FALLTHROUGH */ + + case EXACTFL: + case EXACTFLU8: + _CHECK_AND_WARN_PROBLEMATIC_LOCALE; goto do_exact; - case EXACT_ONLY8: + case EXACT_REQ8: + case LEXACT_REQ8: + case EXACTFU_REQ8: if (! utf8_target) { break; } /* FALLTHROUGH */ + + case LEXACT: case EXACT: - do_exact: - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + case EXACTF: + case EXACTFAA_NO_TRIE: + case EXACTFAA: + case EXACTFU: + case EXACTFUP: - c = (U8)*STRING(p); + do_exact: { + struct next_matchable_info Binfo; + PERL_UINT_FAST8_T definitive_len; - /* Can use a simple find if the pattern char to match on is invariant - * under UTF-8, or both target and pattern aren't UTF-8. Note that we - * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's - * true iff it doesn't matter if the argument is in UTF-8 or not */ - if (UTF8_IS_INVARIANT(c) || (! utf8_target && !
reginfo->is_utf8_pat)) { - if (utf8_target && this_eol - scan > max) { - /* We didn't adjust because is UTF-8, but ok to do so, - * since here, to match at all, 1 char == 1 byte */ - this_eol = scan + max; - } - scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); - } - else if (reginfo->is_utf8_pat) { - if (utf8_target) { - STRLEN scan_char_len; - - /* When both target and pattern are UTF-8, we have to do - * string EQ */ - while (hardcount < max - && scan < this_eol - && (scan_char_len = UTF8SKIP(scan)) <= STR_LEN(p) - && memEQ(scan, STRING(p), scan_char_len)) - { - scan += scan_char_len; - hardcount++; - } - } - else if (! UTF8_IS_ABOVE_LATIN1(c)) { + assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); - /* Target isn't utf8; convert the character in the UTF-8 - * pattern to non-UTF8, and do a simple find */ - c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); - scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c); - } /* else pattern char is above Latin1, can't possibly match the - non-UTF-8 target */ + /* Set up termination info, and quit if we can rule out that we've + * gotten a match of the termination criteria */ + if ( ! S_setup_EXACTISH_ST(aTHX_ p, &Binfo, reginfo) + || scan + Binfo.min_length > this_eol + || ! S_test_EXACTISH_ST(scan, Binfo)) + { + break; } - else { - - /* Here, the string must be utf8; pattern isn't, and is - * different in utf8 than not, so can't compare them directly. - * Outside the loop, find the two utf8 bytes that represent c, and - * then look for those in sequence in the utf8 string */ - U8 high = UTF8_TWO_BYTE_HI(c); - U8 low = UTF8_TWO_BYTE_LO(c); - while (hardcount < max - && scan + 1 < this_eol - && UCHARAT(scan) == high - && UCHARAT(scan + 1) == low) - { - scan += 2; - hardcount++; - } - } - break; + definitive_len = Binfo.initial_definitive; - case EXACTFAA_NO_TRIE: /* This node only generated for non-utf8 patterns */ - assert(! 
reginfo->is_utf8_pat); - /* FALLTHROUGH */ - case EXACTFAA: - utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; - if (reginfo->is_utf8_pat || ! utf8_target) { + /* Here there are potential matches, and the first byte(s) matched our + * filter + * + * If we got a definitive match of some initial bytes, there is no + * possibility of false positives as far as it got */ + if (definitive_len > 0) { - /* The possible presence of a MICRO SIGN in the pattern forbids us - * to view a non-UTF-8 pattern as folded when there is a UTF-8 - * target. */ - utf8_flags |= FOLDEQ_S2_ALREADY_FOLDED|FOLDEQ_S2_FOLDS_SANE; - } - goto do_exactf; + /* If as far as it got is the maximum possible, there were no false + * positives at all. Since we have everything set up, see how many + * repeats there are. */ + if (definitive_len >= Binfo.max_length) { - case EXACTFL: - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - utf8_flags = FOLDEQ_LOCALE; - goto do_exactf; + /* We've already found one match */ + scan += definitive_len; + hardcount++; - case EXACTF: /* This node only generated for non-utf8 patterns */ - assert(! reginfo->is_utf8_pat); - goto do_exactf; + /* If want more than the one match, and there is room for more, + * see if there are any */ + if (hardcount < max && scan + definitive_len <= this_eol) { - case EXACTFLU8: - if (! utf8_target) { - break; - } - utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED - | FOLDEQ_S2_FOLDS_SANE; - goto do_exactf; + /* If the character is only a single byte long, just span + * all such bytes. */ + if (definitive_len == 1) { + const char * orig_scan = scan; - case EXACTFU_ONLY8: - if (! 
utf8_target) { - break; - } - assert(reginfo->is_utf8_pat); - utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; - goto do_exactf; + if (this_eol - (scan - hardcount) > max) { + this_eol = scan - hardcount + max; + } - case EXACTFU: - utf8_flags = FOLDEQ_S2_ALREADY_FOLDED; - /* FALLTHROUGH */ + /* Use different routines depending on whether it's an + * exact match or matches with a mask */ + if (Binfo.initial_exact == 1) { + scan = (char *) find_span_end((U8 *) scan, + (U8 *) this_eol, + Binfo.matches[0]); + } + else { + scan = (char *) find_span_end_mask( + (U8 *) scan, + (U8 *) this_eol, + Binfo.first_byte_anded, + Binfo.first_byte_mask); + } - case EXACTFUP: + hardcount += scan - orig_scan; + } + else { /* Here, the full character definitive match is more + than one byte */ + while ( hardcount < max + && scan + definitive_len <= this_eol + && S_test_EXACTISH_ST(scan, Binfo)) + { + scan += definitive_len; + hardcount++; + } + } + } - do_exactf: { - int c1, c2; - U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + break; + } /* End of a full character is definitively matched */ - assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1); + /* Here, an initial portion of the character matched definitively, + * and the rest matched as well, but could have false positives */ - if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8, - reginfo)) - { - if (c1 == CHRTEST_VOID) { - /* Use full Unicode fold matching */ - char *tmpeol = loceol; - STRLEN pat_len = reginfo->is_utf8_pat ? 
UTF8SKIP(STRING(p)) : 1; - while (hardcount < max - && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, pat_len, - reginfo->is_utf8_pat, utf8_flags)) - { - scan = tmpeol; - tmpeol = loceol; - hardcount++; - } - } - else if (utf8_target) { - if (c1 == c2) { - while (scan < this_eol - && hardcount < max - && memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, - loceol))) - { - scan += UTF8SKIP(c1_utf8); - hardcount++; - } - } - else { - while (scan < this_eol - && hardcount < max - && ( memEQ(scan, c1_utf8, UTF8_SAFE_SKIP(scan, - loceol)) - || memEQ(scan, c2_utf8, UTF8_SAFE_SKIP(scan, - loceol)))) + do { + PERL_INT_FAST8_T i; + U8 * matches = Binfo.matches; + + /* The first bytes were definitive. Look at the remaining */ + for (i = 0; i < Binfo.count; i++) { + if (memEQ(scan + definitive_len, + matches + definitive_len, + Binfo.lengths[i] - definitive_len)) { - scan += UTF8_SAFE_SKIP(scan, loceol); - hardcount++; + goto found_a_completion; } + + matches += Binfo.lengths[i]; } - } - else if (c1 == c2) { - scan = (char *) find_span_end((U8 *) scan, (U8 *) this_eol, (U8) c1); - } - else { - /* See comments in regmatch() CURLY_B_min_known_fail. We avoid - * a conditional each time through the loop if the characters - * differ only in a single bit, as is the usual situation */ - U8 c1_c2_bits_differing = c1 ^ c2; - if (isPOWER_OF_2(c1_c2_bits_differing)) { - U8 c1_c2_mask = ~ c1_c2_bits_differing; + /* Didn't find anything to complete our initial match. 
Stop + * here */ + break; - scan = (char *) find_span_end_mask((U8 *) scan, - (U8 *) this_eol, - c1 & c1_c2_mask, - c1_c2_mask); - } - else { - while ( scan < this_eol - && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) - { - scan++; - } + found_a_completion: + + /* Here, matched a full character, Include it in the result, + * and then look to see if the next char matches */ + hardcount++; + scan += Binfo.lengths[i]; + + } while ( hardcount < max + && scan + definitive_len < this_eol + && S_test_EXACTISH_ST(scan, Binfo)); + + /* Here, have advanced as far as possible */ + break; + } /* End of found some initial bytes that definitively matched */ + + /* Here, we can't rule out that we have found the beginning of 'B', but + * there were no initial bytes that could rule out anything + * definitively. Use brute force to examine all the possibilities */ + while (scan < this_eol && hardcount < max) { + PERL_INT_FAST8_T i; + U8 * matches = Binfo.matches; + + for (i = 0; i < Binfo.count; i++) { + if (memEQ(scan, matches, Binfo.lengths[i])) { + goto found1; } + + matches += Binfo.lengths[i]; } - } + + break; + + found1: + hardcount++; + scan += Binfo.lengths[i]; + } + break; - } + } case ANYOFPOSIXL: case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; + CHECK_AND_WARN_NON_UTF8_CTYPE_LOCALE_IN_SETS(p); - if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! 
IN_UTF8_CTYPE_LOCALE) { - Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); - } /* FALLTHROUGH */ case ANYOFD: case ANYOF: @@ -9532,27 +10186,110 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case ANYOFH: if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ - if (ANYOF_FLAGS(p)) { /* If we know the first byte of what - matches, we can avoid calling reginclass - */ - while ( hardcount < max - && scan < this_eol - && (U8) *scan == ANYOF_FLAGS(p) - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, - TRUE)) - { - scan += UTF8SKIP(scan); - hardcount++; - } + while ( hardcount < max + && scan < this_eol + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHb: + if (utf8_target) { /* ANYOFHb only can match UTF-8 targets */ + + /* we know the first byte must be the FLAGS field */ + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, + TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHr: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan < this_eol + && inRANGE(NATIVE_UTF8_TO_I8(*scan), + LOWEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p)), + HIGHEST_ANYOF_HRx_BYTE(ANYOF_FLAGS(p))) + && NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFHs: + if (utf8_target) { /* ANYOFH only can match UTF-8 targets */ + while ( hardcount < max + && scan + FLAGS(p) < this_eol + && memEQ(scan, ((struct regnode_anyofhs *) p)->string, FLAGS(p)) + && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + break; + + case ANYOFR: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + 
&& NATIVE_UTF8_TO_I8(*scan) >= ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; } - else while ( hardcount < max - && scan < this_eol - && reginclass(prog, p, (U8*)scan, (U8*) this_eol, TRUE)) + } + break; + + case ANYOFRb: + if (utf8_target) { + while ( hardcount < max + && scan < this_eol + && (U8) *scan == ANYOF_FLAGS(p) + && withinCOUNT(utf8_to_uvchr_buf((U8 *) scan, + (U8 *) this_eol, + NULL), + ANYOFRbase(p), ANYOFRdelta(p))) { scan += UTF8SKIP(scan); hardcount++; } } + else { + while ( hardcount < max + && scan < this_eol + && withinCOUNT((U8) *scan, ANYOFRbase(p), ANYOFRdelta(p))) + { + scan++; + hardcount++; + } + } break; /* The argument (FLAGS) to all the POSIX node types is the class number */ @@ -9734,25 +10471,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; - case BOUNDL: - case NBOUNDL: - _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - /* FALLTHROUGH */ - case BOUND: - case BOUNDA: - case BOUNDU: - case EOS: - case GPOS: - case KEEPS: - case NBOUND: - case NBOUNDA: - case NBOUNDU: - case OPFAIL: - case SBOL: - case SEOL: - /* These are all 0 width, so match right here or not at all. 
*/ - break; - default: Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); NOT_REACHED; /* NOTREACHED */ @@ -9766,7 +10484,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, *startposp = scan; DEBUG_r({ - GET_RE_DEBUG_FLAGS_DECL; + DECLARE_AND_GET_RE_DEBUG_FLAGS; DEBUG_EXECUTE_r({ SV * const prop = sv_newmortal(); regprop(prog, prop, p, reginfo, NULL); @@ -9780,7 +10498,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* - reginclass - determine if a character falls into a character class - + n is the ANYOF-type regnode p is the target string p_end points to one byte beyond the end of the target string @@ -9797,8 +10515,9 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, STATIC bool S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const U8* const p_end, const bool utf8_target) { - dVAR; - const char flags = (OP(n) == ANYOFH) ? 0 : ANYOF_FLAGS(n); + const char flags = (inRANGE(OP(n), ANYOFH, ANYOFHs)) + ? 0 + : ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; @@ -9825,7 +10544,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } /* If this character is potentially in the bitmap, check it */ - if (c < NUM_ANYOF_CODE_POINTS && OP(n) != ANYOFH) { + if (c < NUM_ANYOF_CODE_POINTS && ! 
inRANGE(OP(n), ANYOFH, ANYOFHb)) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if ((flags @@ -9838,7 +10557,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const } else if (flags & ANYOF_LOCALE_FLAGS) { if ( (flags & ANYOFL_FOLD) - && c < sizeof(PL_fold_locale) + && c < 256 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { match = TRUE; @@ -9926,8 +10645,14 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const && IN_UTF8_CTYPE_LOCALE))) { SV* only_utf8_locale = NULL; - SV * const definition = _get_regclass_nonbitmap_data(prog, n, TRUE, - 0, &only_utf8_locale, NULL); + SV * const definition = +#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) + get_regclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale, NULL); +#else + get_re_gclass_nonbitmap_data(prog, n, TRUE, 0, + &only_utf8_locale, NULL); +#endif if (definition) { U8 utf8_buffer[2]; U8 * utf8_p; @@ -10127,6 +10852,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) regmatch_info_aux_eval *eval_state = reginfo->info_aux_eval; eval_state->rex = rex; + eval_state->sv = reginfo->sv; if (reginfo->sv) { /* Make $_ available to executed code. */ @@ -10134,6 +10860,8 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) SAVE_DEFSV; DEFSV_set(reginfo->sv); } + /* will be dec'd by S_cleanup_regmatch_info_aux */ + SvREFCNT_inc_NN(reginfo->sv); if (!(mg = mg_find_mglob(reginfo->sv))) { /* prepare for quick setting of pos */ @@ -10159,7 +10887,7 @@ S_setup_eval_state(pTHX_ regmatch_info *const reginfo) /* this regexp is also owned by the new PL_reg_curpm, which will try to free it. 
*/ av_push(PL_regex_padav, repointer); - PL_reg_curpm->op_pmoffset = av_tindex(PL_regex_padav); + PL_reg_curpm->op_pmoffset = av_top_index(PL_regex_padav); PL_regex_pad = AvARRAY(PL_regex_padav); } #endif @@ -10225,6 +10953,7 @@ S_cleanup_regmatch_info_aux(pTHX_ void *arg) } PL_curpm = eval_state->curpm; + SvREFCNT_dec(eval_state->sv); } PL_regmatch_state = aux->old_regmatch_state; @@ -10295,6 +11024,7 @@ S_to_byte_substr(pTHX_ regexp *prog) && !prog->substrs->data[i].substr) { SV* sv = newSVsv(prog->substrs->data[i].utf8_substr); if (! sv_utf8_downgrade(sv, TRUE)) { + SvREFCNT_dec_NN(sv); return FALSE; } if (SvVALID(prog->substrs->data[i].utf8_substr)) { @@ -10318,23 +11048,22 @@ S_to_byte_substr(pTHX_ regexp *prog) #ifndef PERL_IN_XSUB_RE bool -Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) +Perl_is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) { /* Temporary helper function for toke.c. Verify that the code point 'cp' * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in * the larger string bounded by 'strbeg' and 'strend'. * - * 'cp' needs to be assigned (if not a future version of the Unicode + * 'cp' needs to be assigned (if not, a future version of the Unicode * Standard could make it something that combines with adjacent characters, * so code using it would then break), and there has to be a GCB break * before and after the character. 
*/ - dVAR; GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; const U8 * prev_cp_start; - PERL_ARGS_ASSERT__IS_GRAPHEME; + PERL_ARGS_ASSERT_IS_GRAPHEME; if ( UNLIKELY(UNICODE_IS_SUPER(cp)) || UNLIKELY(UNICODE_IS_NONCHAR(cp))) @@ -10382,7 +11111,7 @@ Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, cons } /* -=head1 Unicode Support +=for apidoc_section $unicode =for apidoc isSCRIPT_RUN @@ -10451,7 +11180,6 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) * characters for at least one language in the Unicode Common Locale Data * Repository [CLDR]. */ - dVAR; /* Things that match /\d/u */ SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT]; @@ -10540,10 +11268,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) /* If is within the range [+0 .. +9] of the script's zero, it also is a * digit in that script. We can skip the rest of this code for this * character. */ - if (UNLIKELY( zero_of_run - && cp >= zero_of_run - && cp - zero_of_run <= 9)) - { + if (UNLIKELY(zero_of_run && withinCOUNT(cp, zero_of_run, 9))) { continue; } @@ -10764,7 +11489,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) * several scripts, and the intersection is not empty. However, if the * character is a decimal digit, it could still mean failure if it is * from the wrong sequence of 10. So, we need to look at if it's a - * digit. We've already handled the 10 decimal digits, and the next + * digit. We've already handled the 10 digits [0-9], and the next * lowest one is this one: */ if (cp < FIRST_NON_ASCII_DECIMAL_DIGIT) { continue; /* Not a digit; this character is part of the run */ @@ -10776,9 +11501,7 @@ Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) if ( script_of_char >= 0 && (zero_of_char = script_zeros[script_of_char])) { - if ( cp < zero_of_char - || cp > zero_of_char + 9) - { + if (! 
withinCOUNT(cp, zero_of_char, 9)) { continue; /* Not a digit; this character is part of the run */ }