X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7f18ad16e2d0e9f1ba9b08fd0b11d9f69c5edcf7..3ed356df9354193bbcc5202f066f3c07ae84b443:/regexec.c?ds=sidebyside diff --git a/regexec.c b/regexec.c index 6b6dce1..4029f1e 100644 --- a/regexec.c +++ b/regexec.c @@ -95,14 +95,16 @@ const char* const non_utf8_target_but_utf8_required #define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) +#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i) + #ifndef STATIC #define STATIC static #endif -/* Valid for non-utf8 strings, non-ANYOFV nodes only: avoids the reginclass +/* Valid for non-utf8 strings: avoids the reginclass * call if there are no complications: i.e., if everything matchable is * straight forward in the bitmap */ -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0) \ : ANYOF_BITMAP_TEST(p,*(c))) /* @@ -658,8 +660,12 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "Not at start...\n")); goto fail; } - if (prog->check_offset_min == prog->check_offset_max && - !(prog->extflags & RXf_CANY_SEEN)) { + if (prog->check_offset_min == prog->check_offset_max + && !(prog->extflags & RXf_CANY_SEEN) + && ! multiline) /* /m can cause \n's to match that aren't + accounted for in the string max length. + See [perl #115242] */ + { /* Substring at constant offset from beg-of-str... */ I32 slen; @@ -1450,12 +1456,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. 
*/ switch (OP(c)) { - case ANYOFV: case ANYOF: - if (utf8_target || OP(c) == ANYOFV) { - STRLEN inclasslen = strend - s; + if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN( - reginclass(prog, c, (U8*)s, &inclasslen, utf8_target)); + reginclass(prog, c, (U8*)s, utf8_target)); } else { REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s)); @@ -2935,6 +2939,8 @@ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) #define CHRTEST_UNINIT -1001 /* c1/c2 haven't been calculated yet */ #define CHRTEST_VOID -1000 /* the c1/c2 "next char" test should be skipped */ +#define CHRTEST_NOT_A_CP_1 -999 +#define CHRTEST_NOT_A_CP_2 -998 #define SLAB_FIRST(s) (&(s)->states[0]) #define SLAB_LAST(s) (&(s)->states[PERL_REGMATCH_SLAB_SLOTS-1]) @@ -3271,7 +3277,260 @@ S_clear_backtrack_stack(pTHX_ void *p) Safefree(osl); } } +static bool +S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, U8* c1_utf8, int *c2p, U8* c2_utf8) +{ + /* This function determines if there are one or two characters that match + * the first character of the passed-in EXACTish node text_node, and if + * so, returns them in the passed-in pointers. + * + * If it determines that no possible character in the target string can + * match, it returns FALSE; otherwise TRUE. (The FALSE situation occurs if + * the first character in text_node requires UTF-8 to represent, and the + * target string isn't in UTF-8.) + * + * If there are more than two characters that could match the beginning of + * text_node, or if more context is required to determine a match or not, + * it sets both *c1p and *c2p to CHRTEST_VOID. + * + * The motivation behind this function is to allow the caller to set up + * tight loops for matching. If text_node is of type EXACT, there is + * only one possible character that can match its first character, and so + * the situation is quite simple. But things get much more complicated if + * folding is involved. 
It may be that the first character of an EXACTFish + * node doesn't participate in any possible fold, e.g., punctuation, so it + * can be matched only by itself. The vast majority of characters that are + * in folds match just two things, their lower and upper-case equivalents. + * But not all are like that; some have multiple possible matches, or match + * sequences of more than one character. This function sorts all that out. + * + * Consider the patterns A*B or A*?B where A and B are arbitrary. In a + * loop of trying to match A*, we know we can't exit where the thing + * following it isn't a B. And something can't be a B unless it is the + * beginning of B. By putting a quick test for that beginning in a tight + * loop, we can rule out things that can't possibly be B without having to + * break out of the loop, thus avoiding work. Similarly, if A is a single + * character, we can make a tight loop matching A*, using the outputs of + * this function. + * + * If the target string to match isn't in UTF-8, and there aren't + * complications which require CHRTEST_VOID, *c1p and *c2p are set to + * the one or two possible octets (which are characters in this situation) + * that can match. In all cases, if there is only one character that can + * match, *c1p and *c2p will be identical. + * + * If the target string is in UTF-8, the buffers pointed to by c1_utf8 + * and c2_utf8 will contain the one or two UTF-8 sequences of bytes that + * can match the beginning of text_node. They should be declared with at + * least length UTF8_MAXBYTES+1. (If the target string isn't in UTF-8, it is + * undefined what these contain.) If one or both of the buffers are + * invariant under UTF-8, *c1p, and *c2p will also be set to the + * corresponding invariant. If variant, the corresponding *c1p and/or + * *c2p will be set to a negative number(s) that shouldn't match any code + * point (unless inappropriately coerced to unsigned). *c1p will equal + * *c2p if and only if c1_utf8 and c2_utf8 are the same. 
*/ + + const bool utf8_target = PL_reg_match_utf8; + + UV c1 = CHRTEST_NOT_A_CP_1; + UV c2 = CHRTEST_NOT_A_CP_2; + bool use_chrtest_void = FALSE; + + /* Used when we have both utf8 input and utf8 output, to avoid converting + * to/from code points */ + bool utf8_has_been_setup = FALSE; + + dVAR; + + U8 *pat = (U8*)STRING(text_node); + + if (OP(text_node) == EXACT) { + + /* In an exact node, only one thing can be matched, that first + * character. If both the pat and the target are UTF-8, we can just + * copy the input to the output, avoiding finding the code point of + * that character */ + if (! UTF_PATTERN) { + c2 = c1 = *pat; + } + else if (utf8_target) { + Copy(pat, c1_utf8, UTF8SKIP(pat), U8); + Copy(pat, c2_utf8, UTF8SKIP(pat), U8); + utf8_has_been_setup = TRUE; + } + else { + c2 = c1 = valid_utf8_to_uvchr(pat, NULL); + } + } + else /* an EXACTFish node */ + if ((UTF_PATTERN + && is_MULTI_CHAR_FOLD_utf8_safe(pat, + pat + STR_LEN(text_node))) + || (! UTF_PATTERN + && is_MULTI_CHAR_FOLD_latin1_safe(pat, + pat + STR_LEN(text_node)))) + { + /* Multi-character folds require more context to sort out. Also + * PL_utf8_foldclosures used below doesn't handle them, so have to be + * handled outside this routine */ + use_chrtest_void = TRUE; + } + else { /* an EXACTFish node which doesn't begin with a multi-char fold */ + c1 = (UTF_PATTERN) ? valid_utf8_to_uvchr(pat, NULL) : *pat; + /* Above Latin1 (> 255): consult the fold closures hash. 256 itself + * must take this path, as the else branches below assume c1 <= 255 */ + if (c1 > 255) { + /* Load the folds hash, if not already done */ + SV** listp; + if (! PL_utf8_foldclosures) { + if (! 
PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + + /* Force loading this by folding an above-Latin1 char */ + to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL); + assert(PL_utf8_tofold); /* Verify that worked */ + } + PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold); + } + + /* The fold closures data structure is a hash with the keys being + * the UTF-8 of every character that is folded to, like 'k', and + * the values each an array of all code points that fold to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ]. Multi-character folds are + * not included */ + if ((! (listp = hv_fetch(PL_utf8_foldclosures, + (char *) pat, + UTF8SKIP(pat), + FALSE)))) + { + /* Not found in the hash, therefore there are no folds + * containing it, so there is only a single character that + * could match */ + c2 = c1; + } + else { /* Does participate in folds */ + AV* list = (AV*) *listp; + if (av_len(list) != 1) { + + /* If there aren't exactly two folds to this, it is outside + * the scope of this function */ + use_chrtest_void = TRUE; + } + else { /* There are two. Get them */ + SV** c_p = av_fetch(list, 0, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c1 = SvUV(*c_p); + + c_p = av_fetch(list, 1, FALSE); + if (c_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + c2 = SvUV(*c_p); + + /* Folds that cross the 255/256 boundary are forbidden if + * EXACTFL, or EXACTFA and one is ASCII. Since the + * pattern character is above 255, and its only other match + * is below 256, the only legal match will be to itself. 
+ * We have thrown away the original, so have to compute + * which is the one above 255 */ + if ((c1 < 256) != (c2 < 256)) { + if (OP(text_node) == EXACTFL + || (OP(text_node) == EXACTFA + && (isASCII(c1) || isASCII(c2)))) + { + if (c1 < 256) { + c1 = c2; + } + else { + c2 = c1; + } + } + } + } + } + } + else /* Here, c1 is <= 255 */ + if (utf8_target + && HAS_NONLATIN1_FOLD_CLOSURE(c1) + && OP(text_node) != EXACTFL + && (OP(text_node) != EXACTFA || ! isASCII(c1))) + { + /* Here, there could be something above Latin1 in the target which + * folds to this character in the pattern. All such cases except + * LATIN SMALL LETTER Y WITH DIAERESIS have more than two characters + * involved in their folds, so are outside the scope of this + * function */ + if (UNLIKELY(c1 == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) { + c2 = LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + } + else { + use_chrtest_void = TRUE; + } + } + else { /* Here nothing above Latin1 can fold to the pattern character */ + switch (OP(text_node)) { + + case EXACTFL: /* /l rules */ + c2 = PL_fold_locale[c1]; + break; + + case EXACTF: + if (! utf8_target) { /* /d rules */ + c2 = PL_fold[c1]; + break; + } + /* FALLTHROUGH */ + /* /u rules for all these. This happens to work for + * EXACTFA as nothing in Latin1 folds to ASCII */ + case EXACTFA: + case EXACTFU_TRICKYFOLD: + case EXACTFU_SS: + case EXACTFU: + c2 = PL_fold_latin1[c1]; + break; + + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node)); + assert(0); /* NOTREACHED */ + } + } + } + + /* Here have figured things out. Set up the returns */ + if (use_chrtest_void) { + *c2p = *c1p = CHRTEST_VOID; + } + else if (utf8_target) { + if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */ + uvchr_to_utf8(c1_utf8, c1); + uvchr_to_utf8(c2_utf8, c2); + } + + /* Invariants are stored in both the utf8 and byte outputs; Use + * negative numbers otherwise for the byte ones. 
Make sure that the + * byte ones are the same iff the utf8 ones are the same */ + *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1; + *c2p = (UTF8_IS_INVARIANT(*c2_utf8)) + ? *c2_utf8 + : (c1 == c2) + ? CHRTEST_NOT_A_CP_1 + : CHRTEST_NOT_A_CP_2; + } + else if (c1 > 255) { + if (c2 > 255) { /* both possibilities are above what a non-utf8 string + can represent */ + return FALSE; + } + + *c1p = *c2p = c2; /* c2 is the only representable value */ + } + else { /* c1 is representable; see about c2 */ + *c1p = c1; + *c2p = (c2 < 256) ? c2 : c1; + } + return TRUE; +} /* returns -1 on failure, $+[0] on success */ STATIC I32 @@ -3405,6 +3664,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) reenter_switch: SET_nextchr; + assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS)); switch (state_num) { case BOL: /* /^../ */ @@ -3438,12 +3698,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) st->u.keeper.val = rex->offs[0].start; rex->offs[0].start = locinput - PL_bostr; PUSH_STATE_GOTO(KEEPS_next, next, locinput); - /*NOT-REACHED*/ + assert(0); /*NOTREACHED*/ case KEEPS_next_fail: /* rollback the start point change */ rex->offs[0].start = st->u.keeper.val; sayNO_SILENT; - /*NOT-REACHED*/ + assert(0); /*NOTREACHED*/ case EOL: /* /..$/ */ goto seol; @@ -3952,8 +4212,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } /* Neither the target nor the pattern are utf8 */ - if (UCHARAT(s) != nextchr && - UCHARAT(s) != fold_array[nextchr]) + if (UCHARAT(s) != nextchr + && !NEXTCHR_IS_EOS + && UCHARAT(s) != fold_array[nextchr]) { sayNO; } @@ -4049,15 +4310,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayNO; break; - case ANYOFV: /* /[abx{df}]/i */ case ANYOF: /* /[abc]/ */ if (NEXTCHR_IS_EOS) sayNO; - if (utf8_target || state_num == ANYOFV) { - STRLEN inclasslen = PL_regeol - locinput; - if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, 
utf8_target)) + if (utf8_target) { + if (!reginclass(rex, scan, (U8*)locinput, utf8_target)) sayNO; - locinput += inclasslen; + locinput += UTF8SKIP(locinput); break; } else { @@ -4396,7 +4655,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* This call case insensitively compares the entire buffer * at s, with the current input starting at locinput, but * not going off the end given by PL_regeol, and returns in - * limit upon success, how much of the current input was + * upon success, how much of the current input was * matched */ if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) @@ -5400,19 +5659,12 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ - if (PL_regkind[OP(text_node)] == EXACT) - { - - ST.c1 = (U8)*STRING(text_node); - switch (OP(text_node)) { - case EXACTF: ST.c2 = PL_fold[ST.c1]; break; - case EXACTFA: - case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: - case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; - case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; - default: ST.c2 = ST.c1; - } + if (PL_regkind[OP(text_node)] == EXACT) { + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8)) + { + sayNO; + } } } } @@ -5423,21 +5675,36 @@ NULL (int)(REPORT_CODE_OFF+(depth*2)), "", (IV)ST.count) ); - if ( !NEXTCHR_IS_EOS - && ST.c1 != CHRTEST_VOID - && nextchr != ST.c1 - && nextchr != ST.c2) - { - /* simulate B failing */ - DEBUG_OPTIMISE_r( - PerlIO_printf(Perl_debug_log, - "%*s CURLYM Fast bail c1=%"IVdf" c2=%"IVdf"\n", - (int)(REPORT_CODE_OFF+(depth*2)),"", - (IV)ST.c1,(IV)ST.c2 - )); - state_num = CURLYM_B_fail; - goto reenter_switch; - } + if (! NEXTCHR_IS_EOS && ST.c1 != CHRTEST_VOID) { + if (! 
UTF8_IS_INVARIANT(nextchr) && utf8_target) { + if (memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=U+%"UVXf" c1=U+%"UVXf" c2=U+%"UVXf"\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + valid_utf8_to_uvchr((U8 *) locinput, NULL), + valid_utf8_to_uvchr(ST.c1_utf8, NULL), + valid_utf8_to_uvchr(ST.c2_utf8, NULL)) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } + else if (nextchr != ST.c1 && nextchr != ST.c2) { + /* simulate B failing */ + DEBUG_OPTIMISE_r( + PerlIO_printf(Perl_debug_log, + "%*s CURLYM Fast bail next target=U+%X c1=U+%X c2=U+%X\n", + (int)(REPORT_CODE_OFF+(depth*2)),"", + (int) nextchr, ST.c1, ST.c2) + ); + state_num = CURLYM_B_fail; + goto reenter_switch; + } + } if (ST.me->flags) { /* emulate CLOSE: mark current A as captured */ @@ -5500,23 +5767,23 @@ NULL } \ } - case STAR: /* /A*B/ where A is width 1 */ + case STAR: /* /A*B/ where A is width 1 char */ ST.paren = 0; ST.min = 0; ST.max = REG_INFTY; scan = NEXTOPER(scan); goto repeat; - case PLUS: /* /A+B/ where A is width 1 */ + case PLUS: /* /A+B/ where A is width 1 char */ ST.paren = 0; ST.min = 1; ST.max = REG_INFTY; scan = NEXTOPER(scan); goto repeat; - case CURLYN: /* /(A){m,n}B/ where A is width 1 */ - ST.paren = scan->flags; /* Which paren to set */ - ST.lastparen = rex->lastparen; + case CURLYN: /* /(A){m,n}B/ where A is width 1 char */ + ST.paren = scan->flags; /* Which paren to set */ + ST.lastparen = rex->lastparen; ST.lastcloseparen = rex->lastcloseparen; if (ST.paren > PL_regsize) PL_regsize = ST.paren; @@ -5530,7 +5797,7 @@ NULL scan = regnext(NEXTOPER(scan) + NODE_STEP_REGNODE); goto repeat; - case CURLY: /* /A{m,n}B/ where A is width 1 */ + case CURLY: /* /A{m,n}B/ where A is width 1 char */ ST.paren = 0; ST.min = ARG1(scan); /* min to match */ ST.max = ARG2(scan); /* max to match */ @@ -5545,10 
+5812,11 @@ NULL * of the quantifier and the EXACT-like node. -- japhy */ - if (ST.min > ST.max) /* XXX make this a compile-time check? */ - sayNO; - if (HAS_TEXT(next) || JUMPABLE(next)) { - U8 *s; + assert(ST.min <= ST.max); + if (! HAS_TEXT(next) && ! JUMPABLE(next)) { + ST.c1 = ST.c2 = CHRTEST_VOID; + } + else { regnode *text_node = next; if (! HAS_TEXT(text_node)) @@ -5559,10 +5827,8 @@ NULL else { if ( PL_regkind[OP(text_node)] != EXACT ) { ST.c1 = ST.c2 = CHRTEST_VOID; - goto assume_ok_easy; } - else - s = (U8*)STRING(text_node); + else { /* Currently we only get here when @@ -5570,37 +5836,14 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ - if (!UTF_PATTERN) { - ST.c1 = *s; - switch (OP(text_node)) { - case EXACTF: ST.c2 = PL_fold[ST.c1]; break; - case EXACTFA: - case EXACTFU_SS: - case EXACTFU_TRICKYFOLD: - case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; - case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; - default: ST.c2 = ST.c1; break; - } - } - else { /* UTF_PATTERN */ - if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) { - STRLEN ulen; - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - - to_utf8_fold((U8*)s, tmpbuf, &ulen); - ST.c1 = ST.c2 = utf8n_to_uvchr(tmpbuf, UTF8_MAXLEN, 0, - uniflags); - } - else { - ST.c2 = ST.c1 = utf8n_to_uvchr(s, UTF8_MAXBYTES, 0, - uniflags); - } - } + if (! S_setup_EXACTISH_ST_c1_c2(aTHX_ + text_node, &ST.c1, ST.c1_utf8, &ST.c2, ST.c2_utf8)) + { + sayNO; + } + } } } - else - ST.c1 = ST.c2 = CHRTEST_VOID; - assume_ok_easy: ST.A = scan; ST.B = next; @@ -5686,31 +5929,26 @@ NULL if (utf8_target) { n = (ST.oldloc == locinput) ? 
0 : 1; if (ST.c1 == ST.c2) { - STRLEN len; /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos && - utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, &len, - uniflags) != (UV)ST.c1) { - locinput += len; + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); n++; } } else { /* set n to utf8_distance(oldloc, locinput) */ - while (locinput <= ST.maxpos) { - STRLEN len; - const UV c = utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, &len, - uniflags); - if (c == (UV)ST.c1 || c == (UV)ST.c2) - break; - locinput += len; + while (locinput <= ST.maxpos + && memNE(locinput, ST.c1_utf8, UTF8SKIP(locinput)) + && memNE(locinput, ST.c2_utf8, UTF8SKIP(locinput))) + { + locinput += UTF8SKIP(locinput); n++; } } } - else { + else { /* Not utf8_target */ if (ST.c1 == ST.c2) { while (locinput <= ST.maxpos && UCHARAT(locinput) != ST.c1) @@ -5775,6 +6013,7 @@ NULL PUSH_STATE_GOTO(CURLY_B_min, ST.B, locinput); } } + sayNO; assert(0); /* NOTREACHED */ @@ -5785,16 +6024,25 @@ NULL goto fake_end; } { - UV c = 0; - if (ST.c1 != CHRTEST_VOID && locinput < PL_regeol) - c = utf8_target ? utf8n_to_uvchr((U8*)locinput, - UTF8_MAXBYTES, 0, uniflags) - : (UV) UCHARAT(locinput); + bool could_match = locinput < PL_regeol; + /* If it could work, try it. */ - if (ST.c1 == CHRTEST_VOID - || (locinput < PL_regeol && - (c == (UV)ST.c1 || c == (UV)ST.c2))) - { + if (ST.c1 != CHRTEST_VOID && could_match) { + if (! 
UTF8_IS_INVARIANT(UCHARAT(locinput)) && utf8_target) + { + could_match = memEQ(locinput, + ST.c1_utf8, + UTF8SKIP(locinput)) + || memEQ(locinput, + ST.c2_utf8, + UTF8SKIP(locinput)); + } + else { + could_match = UCHARAT(locinput) == ST.c1 + || UCHARAT(locinput) == ST.c2; + } + } + if (ST.c1 == CHRTEST_VOID || could_match) { CURLY_SETPAREN(ST.paren, ST.count); PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput); assert(0); /* NOTREACHED */ @@ -6085,6 +6333,7 @@ NULL /* this is a point to jump to in order to increment * locinput by one character */ increment_locinput: + assert(!NEXTCHR_IS_EOS); if (utf8_target) { locinput += PL_utf8skip[nextchr]; /* locinput is allowed to go 1 char off the end, but not 2+ */ @@ -6282,21 +6531,24 @@ no_silent: /* - regrepeat - repeatedly match something simple, report how many * + * What 'simple' means is a node which can be the operand of a quantifier like + * '+', or {1,3} + * * startposp - pointer a pointer to the start position. This is updated * to point to the byte following the highest successful * match. * p - the regnode to be repeatedly matched against. - * max - maximum number of characters to match. + * max - maximum number of things to match. * depth - (for debugging) backtracking depth. */ STATIC I32 S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 max, int depth) { dVAR; - char *scan; + char *scan; /* Pointer to current position in target string */ I32 c; - char *loceol = PL_regeol; - I32 hardcount = 0; + char *loceol = PL_regeol; /* local version */ + I32 hardcount = 0; /* How many matches so far */ bool utf8_target = PL_reg_match_utf8; UV utf8_flags; #ifndef DEBUGGING @@ -6308,12 +6560,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma scan = *startposp; if (max == REG_INFTY) max = I32_MAX; - else if (max < loceol - scan) + else if (! 
utf8_target && scan + max < loceol) loceol = scan + max; + + /* Here, for the case of a non-UTF-8 target we have adjusted loceol down + * to the maximum of how far we should go in it (leaving it set to the real + * end, if the maximum permissible would take us beyond that). This allows + * us to make the loop exit condition that we haven't gone past loceol to + * also mean that we haven't exceeded the max permissible count, saving a + * test each time through the loop. But it assumes that the OP matches a + * single byte, which is true for most of the OPs below when applied to a + * non-UTF-8 target. Those relatively few OPs that don't have this + * characteristic will have to compensate. + * + * There is no adjustment for UTF-8 targets, as the number of bytes per + * character varies. OPs will have to test both that the count is less + * than the max permissible (using hardcount to keep track), and that we + * are still within the bounds of the string (using loceol). A few OPs + * match a single byte no matter what the encoding. They can omit the max + * test if, for the UTF-8 case, they do the adjustment that was skipped + * above. 
+ * + * Thus, the code above sets things up for the common case; and exceptional + * cases need extra work; the common case is to make sure doesn't + * go past , and for UTF-8 to also use to make sure the + * count doesn't exceed the maximum permissible */ + switch (OP(p)) { case REG_ANY: if (utf8_target) { - loceol = PL_regeol; while (scan < loceol && hardcount < max && *scan != '\n') { scan += UTF8SKIP(scan); hardcount++; @@ -6325,7 +6600,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case SANY: if (utf8_target) { - loceol = PL_regeol; while (scan < loceol && hardcount < max) { scan += UTF8SKIP(scan); hardcount++; @@ -6334,35 +6608,69 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma else scan = loceol; break; - case CANY: - scan = loceol; + case CANY: /* Move forward bytes, unless goes off end */ + if (utf8_target && scan + max < loceol) { + + /* hadn't been adjusted in the UTF-8 case */ + scan += max; + } + else { + scan = loceol; + } break; case EXACT: - /* To get here, EXACTish nodes must have *byte* length == 1. That - * means they match only characters in the string that can be expressed - * as a single byte. For non-utf8 strings, that means a simple match. - * For utf8 strings, the character matched must be an invariant, or - * downgradable to a single byte. The pattern's utf8ness is - * irrelevant, as since it's a single byte, it either isn't utf8, or if - * it is, it's an invariant */ + assert(STR_LEN(p) == (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1); c = (U8)*STRING(p); - assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - if (! utf8_target || UNI_IS_INVARIANT(c)) { + /* Can use a simple loop if the pattern char to match on is invariant + * under UTF-8, or both target and pattern aren't UTF-8. Note that we + * can use UTF8_IS_INVARIANT() even if the pattern isn't UTF-8, as it's + * true iff it doesn't matter if the argument is in UTF-8 or not */ + if (UTF8_IS_INVARIANT(c) || (! 
utf8_target && ! UTF_PATTERN)) { + if (utf8_target && scan + max < loceol) { + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match at all, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && UCHARAT(scan) == c) { scan++; } } - else { + else if (UTF_PATTERN) { + if (utf8_target) { + STRLEN scan_char_len; + + /* When both target and pattern are UTF-8, we have to do + * string EQ */ + while (hardcount < max + && scan + (scan_char_len = UTF8SKIP(scan)) <= loceol + && scan_char_len <= STR_LEN(p) + && memEQ(scan, STRING(p), scan_char_len)) + { + scan += scan_char_len; + hardcount++; + } + } + else if (! UTF8_IS_ABOVE_LATIN1(c)) { - /* Here, the string is utf8, and the pattern char is different - * in utf8 than not, so can't compare them directly. Outside the - * loop, find the two utf8 bytes that represent c, and then - * look for those in sequence in the utf8 string */ + /* Target isn't utf8; convert the character in the UTF-8 + * pattern to non-UTF8, and do a simple loop */ + c = TWO_BYTE_UTF8_TO_UNI(c, *(STRING(p) + 1)); + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } /* else pattern char is above Latin1, can't possibly match the + non-UTF-8 target */ + } + else { + + /* Here, the string must be utf8; pattern isn't, and is + * different in utf8 than not, so can't compare them directly. + * Outside the loop, find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ U8 high = UTF8_TWO_BYTE_HI(c); U8 low = UTF8_TWO_BYTE_LO(c); - loceol = PL_regeol; while (hardcount < max && scan + 1 < loceol @@ -6374,6 +6682,7 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } } break; + case EXACTFA: utf8_flags = FOLDEQ_UTF8_NOMIX_ASCII; goto do_exactf; @@ -6392,65 +6701,69 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case EXACTFU: utf8_flags = (UTF_PATTERN) ? 
FOLDEQ_S2_ALREADY_FOLDED : 0; - /* The comments for the EXACT case above apply as well to these fold - * ones */ - - do_exactf: - c = (U8)*STRING(p); - assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - - if (utf8_target || OP(p) == EXACTFU_SS) { /* Use full Unicode fold matching */ - char *tmpeol = loceol; - while (hardcount < max - && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, 1, cBOOL(UTF_PATTERN), utf8_flags)) - { - scan = tmpeol; - tmpeol = loceol; - hardcount++; - } - - /* XXX Note that the above handles properly the German sharp s in - * the pattern matching ss in the string. But it doesn't handle - * properly cases where the string contains say 'LIGATURE ff' and - * the pattern is 'f+'. This would require, say, a new function or - * revised interface to foldEQ_utf8(), in which the maximum number - * of characters to match could be passed and it would return how - * many actually did. This is just one of many cases where - * multi-char folds don't work properly, and so the fix is being - * deferred */ - } - else { - U8 folded; - - /* Here, the string isn't utf8 and c is a single byte; and either - * the pattern isn't utf8 or c is an invariant, so its utf8ness - * doesn't affect c. Can just do simple comparisons for exact or - * fold matching. */ - switch (OP(p)) { - case EXACTF: folded = PL_fold[c]; break; - case EXACTFA: - case EXACTFU_TRICKYFOLD: - case EXACTFU: folded = PL_fold_latin1[c]; break; - case EXACTFL: folded = PL_fold_locale[c]; break; - default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); - } - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == folded)) - { - scan++; - } + do_exactf: { + int c1, c2; + U8 c1_utf8[UTF8_MAXBYTES+1], c2_utf8[UTF8_MAXBYTES+1]; + + assert(STR_LEN(p) == (UTF_PATTERN) ? 
UTF8SKIP(STRING(p)) : 1); + + if (S_setup_EXACTISH_ST_c1_c2(aTHX_ p, &c1, c1_utf8, &c2, c2_utf8)) { + if (c1 == CHRTEST_VOID) { + /* Use full Unicode fold matching */ + char *tmpeol = PL_regeol; + STRLEN pat_len = (UTF_PATTERN) ? UTF8SKIP(STRING(p)) : 1; + while (hardcount < max + && foldEQ_utf8_flags(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, pat_len, + cBOOL(UTF_PATTERN), utf8_flags)) + { + scan = tmpeol; + tmpeol = PL_regeol; + hardcount++; + } + } + else if (utf8_target) { + if (c1 == c2) { + while (scan < loceol + && hardcount < max + && memEQ(scan, c1_utf8, UTF8SKIP(scan))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + while (scan < loceol + && hardcount < max + && (memEQ(scan, c1_utf8, UTF8SKIP(scan)) + || memEQ(scan, c2_utf8, UTF8SKIP(scan)))) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + } + else if (c1 == c2) { + while (scan < loceol && UCHARAT(scan) == c1) { + scan++; + } + } + else { + while (scan < loceol && + (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } + } } break; - case ANYOFV: + } case ANYOF: - if (utf8_target || OP(p) == ANYOFV) { + if (utf8_target) { STRLEN inclasslen; - loceol = PL_regeol; - inclasslen = loceol - scan; while (hardcount < max - && ((inclasslen = loceol - scan) > 0) - && reginclass(prog, p, (U8*)scan, &inclasslen, utf8_target)) + && scan + (inclasslen = UTF8SKIP(scan)) <= loceol + && reginclass(prog, p, (U8*)scan, utf8_target)) { scan += inclasslen; hardcount++; @@ -6463,7 +6776,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case ALNUMU: if (utf8_target) { utf8_wordchar: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) @@ -6485,6 +6797,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case ALNUMA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust because is UTF-8, but ok to do so, 
+ * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isWORDCHAR_A((U8) *scan)) { scan++; } @@ -6492,7 +6810,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case ALNUML: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6508,7 +6825,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_Nwordchar: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) @@ -6531,14 +6847,23 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case POSIXA: - while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } + while (scan < loceol && _generic_isCC_A((U8) *scan, FLAGS(p))) { scan++; } break; case NPOSIXA: if (utf8_target) { - while (scan < loceol && ! _generic_isCC_A((U8) *scan, FLAGS(p))) { + while (scan < loceol && hardcount < max + && ! _generic_isCC_A((U8) *scan, FLAGS(p))) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6549,8 +6874,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NALNUMA: if (utf8_target) { - while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { + while (scan < loceol && hardcount < max + && ! 
isWORDCHAR_A((U8) *scan)) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6562,7 +6890,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NALNUML: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6578,7 +6905,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_space: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || @@ -6604,6 +6930,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case SPACEA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust because is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isSPACE_A((U8) *scan)) { scan++; } @@ -6611,7 +6943,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case SPACEL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6627,7 +6958,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma utf8_Nspace: - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && ! (*scan == ' ' || @@ -6654,8 +6984,11 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NSPACEA: if (utf8_target) { - while (scan < loceol && ! isSPACE_A((U8) *scan)) { + while (hardcount < max && scan < loceol + && ! 
isSPACE_A((U8) *scan)) + { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6667,7 +7000,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NSPACEL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6680,7 +7012,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case DIGIT: if (utf8_target) { - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { @@ -6693,6 +7024,12 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; case DIGITA: + if (utf8_target && scan + max < loceol) { + + /* We didn't adjust <loceol> because the target is UTF-8, but ok to do so, + * since here, to match, 1 char == 1 byte */ + loceol = scan + max; + } while (scan < loceol && isDIGIT_A((U8) *scan)) { scan++; } @@ -6700,7 +7037,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case DIGITL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && isDIGIT_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6713,7 +7049,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NDIGIT: if (utf8_target) { - loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { @@ -6727,8 +7062,10 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NDIGITA: if (utf8_target) { - while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + while (hardcount < max && scan < loceol + && !
isDIGIT_A((U8) *scan)) { scan += UTF8SKIP(scan); + hardcount++; } } else { @@ -6740,7 +7077,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma case NDIGITL: PL_reg_flags |= RF_tainted; if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !isDIGIT_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); @@ -6753,27 +7089,24 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case LNBREAK: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8_safe(scan, loceol))) { scan += c; hardcount++; } } else { - /* - LNBREAK can match two latin chars, which is ok, - because we have a null terminated string, but we - have to use hardcount in this situation - */ + /* LNBREAK can match one or two latin chars, which is ok, but we + * have to use hardcount in this situation, and throw away the + * adjustment to <loceol> done before the switch statement */ + loceol = PL_regeol; while (scan < loceol && (c=is_LNBREAK_latin1_safe(scan, loceol))) { scan+=c; hardcount++; } - } + } break; case HORIZWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8_safe(scan, loceol))) { @@ -6787,7 +7120,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NHORIZWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_HORIZWS_utf8_safe(scan, loceol)) { @@ -6802,7 +7134,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case VERTWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8_safe(scan, loceol))) { @@ -6817,7 +7148,6 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma break; case NVERTWS: if (utf8_target) { - loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_VERTWS_utf8_safe(scan, loceol)) { @@
-6831,8 +7161,27 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma } break; - default: /* Called on something of 0 width. */ - break; /* So match right here or not at all. */ + case BOUND: + case BOUNDA: + case BOUNDL: + case BOUNDU: + case EOS: + case GPOS: + case KEEPS: + case NBOUND: + case NBOUNDA: + case NBOUNDL: + case NBOUNDU: + case OPFAIL: + case SBOL: + case SEOL: + /* These are all 0 width, so match right here or not at all. */ + break; + + default: + Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]); + assert(0); /* NOTREACHED */ + } if (hardcount) @@ -6859,32 +7208,35 @@ S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p, I32 ma #if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) /* - regclass_swash - prepare the utf8 swash. Wraps the shared core version to -create a copy so that changes the caller makes won't change the shared one +create a copy so that changes the caller makes won't change the shared one. +If <altsvp> is non-null, will return NULL in it, for back-compat. */ SV * Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) { PERL_ARGS_ASSERT_REGCLASS_SWASH; - return newSVsv(core_regclass_swash(prog, node, doinit, listsvp, altsvp)); + + if (altsvp) { + *altsvp = NULL; + } + + return newSVsv(core_regclass_swash(prog, node, doinit, listsvp)); } #endif STATIC SV * -S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp, SV **altsvp) +S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool doinit, SV** listsvp) { /* Returns the swash for the input 'node' in the regex 'prog'. * If <doinit> is true, will attempt to create the swash if not already * done. * If <listsvp> is non-null, will return the swash initialization string in * it.
- * If <altsvp> is non-null, will return the alternates to the regular swash - * in it * Tied intimately to how regcomp.c sets up the data structure */ dVAR; SV *sw = NULL; SV *si = NULL; - SV *alt = NULL; SV* invlist = NULL; RXi_GET_DECL(prog,progi); @@ -6905,12 +7257,12 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo si = *ary; /* ary[0] = the string to initialize the swash with */ - /* Elements 3 and 4 are either both present or both absent. [3] is - * any inversion list generated at compile time; [4] indicates if + /* Elements 2 and 3 are either both present or both absent. [2] is + * any inversion list generated at compile time; [3] indicates if * that inversion list has any user-defined properties in it. */ - if (av_len(av) >= 3) { - invlist = ary[3]; - if (SvUV(ary[4])) { + if (av_len(av) >= 2) { + invlist = ary[2]; + if (SvUV(ary[3])) { swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY; } } @@ -6934,13 +7286,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo &swash_init_flags); (void)av_store(av, 1, sw); } - - /* Element [2] is for any multi-char folds. Note that is a - * fundamentally flawed design, because can't backtrack and try - * again. See [perl #89774] */ - if (SvTYPE(ary[2]) == SVt_PVAV) { - alt = ary[2]; - } } } @@ -6965,9 +7310,6 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo *listsvp = matches_string; } - if (altsvp) - *altsvp = alt; - return sw; } @@ -6976,15 +7318,9 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo n is the ANYOF regnode p is the target string - lenp is pointer to the maximum number of bytes of how far to go in p - (This is assumed wthout checking to always be at least the current - character's size) utf8_target tells whether p is in UTF-8. - Returns true if matched; false otherwise.
If lenp is not NULL, on return - from a successful match, the value it points to will be updated to how many - bytes in p were matched. If there was no match, the value is undefined, - possibly changed from the input. + Returns true if matched; false otherwise. Note that this can be a synthetic start class, a combination of various nodes, so things you think might be mutually exclusive, such as locale, @@ -6993,19 +7329,19 @@ S_core_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bo */ STATIC bool -S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target) +S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, register const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; - STRLEN c_len = 0; - STRLEN maxlen; PERL_ARGS_ASSERT_REGINCLASS; - /* If c is not already the code point, get it */ - if (utf8_target && !UTF8_IS_INVARIANT(c)) { + /* If c is not already the code point, get it. Note that + * UTF8_IS_INVARIANT() works even if not in UTF-8 */ + if (! UTF8_IS_INVARIANT(c) && utf8_target) { + STRLEN c_len = 0; c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); @@ -7014,21 +7350,6 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); } - else { - c_len = 1; - } - - /* Use passed in max length, or one character if none passed in or less - * than one character. And assume will match just one character. This is - * overwritten later if matched more. */ - if (lenp) { - maxlen = (*lenp > c_len) ? 
*lenp : c_len; - *lenp = c_len; - - } - else { - maxlen = c_len; - } /* If this character is potentially in the bitmap, check it */ if (c < 256) { @@ -7040,11 +7361,10 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, { match = TRUE; } - else if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; - if ((flags & ANYOF_LOC_NONBITMAP_FOLD) + if ((flags & ANYOF_LOC_FOLD) && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { match = TRUE; @@ -7088,7 +7408,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, } /* If the bitmap didn't (or couldn't) match, and something outside the - * bitmap could match, try that. Locale nodes specifiy completely the + * bitmap could match, try that. Locale nodes specify completely the * behavior of code points in the bit map (otherwise, a utf8 target would * cause them to be treated as Unicode and not locale), except in * the very unlikely event when this node is a synthetic start class, which @@ -7107,167 +7427,19 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, || (! (flags & ANYOF_LOCALE)) || (flags & ANYOF_IS_SYNTHETIC))))) { - AV *av; - SV * const sw = core_regclass_swash(prog, n, TRUE, 0, (SV**)&av); - + SV * const sw = core_regclass_swash(prog, n, TRUE, 0); if (sw) { U8 * utf8_p; if (utf8_target) { utf8_p = (U8 *) p; - } else { - - /* Not utf8. Convert as much of the string as available up - * to the limit of how far the (single) character in the - * pattern can possibly match (no need to go further). If - * the node is a straight ANYOF or not folding, it can't - * match more than one. Otherwise, It can match up to how - * far a single char can fold to. Since not utf8, each - * character is a single byte, so the max it can be in - * bytes is the same as the max it can be in characters */ - STRLEN len = (OP(n) == ANYOF - || ! (flags & ANYOF_LOC_NONBITMAP_FOLD)) - ? 1 - : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND) - ? 
maxlen - : UTF8_MAX_FOLD_CHAR_EXPAND; + } else { /* Convert to utf8 */ + STRLEN len = 1; utf8_p = bytes_to_utf8(p, &len); } - if (swash_fetch(sw, utf8_p, TRUE)) + if (swash_fetch(sw, utf8_p, TRUE)) { match = TRUE; - else if (flags & ANYOF_LOC_NONBITMAP_FOLD) { - - /* Here, we need to test if the fold of the target string - * matches. The non-multi char folds have all been moved to - * the compilation phase, and the multi-char folds have - * been stored by regcomp into 'av'; we linearly check to - * see if any match the target string (folded). We know - * that the originals were each one character, but we don't - * currently know how many characters/bytes each folded to, - * except we do know that there are small limits imposed by - * Unicode. XXX A performance enhancement would be to have - * regcomp.c store the max number of chars/bytes that are - * in an av entry, as, say the 0th element. Even better - * would be to have a hash of the few characters that can - * start a multi-char fold to the max number of chars of - * those folds. - * - * If there is a match, we will need to advance (if lenp is - * specified) the match pointer in the target string. But - * what we are comparing here isn't that string directly, - * but its fold, whose length may differ from the original. - * As we go along in constructing the fold, therefore, we - * create a map so that we know how many bytes in the - * source to advance given that we have matched a certain - * number of bytes in the fold. This map is stored in - * 'map_fold_len_back'. Let n mean the number of bytes in - * the fold of the first character that we are folding. - * Then map_fold_len_back[n] is set to the number of bytes - * in that first character. Similarly let m be the - * corresponding number for the second character to be - * folded. Then map_fold_len_back[n+m] is set to the - * number of bytes occupied by the first two source - * characters. ... 
*/ - U8 map_fold_len_back[UTF8_MAXBYTES_CASE+1] = { 0 }; - U8 folded[UTF8_MAXBYTES_CASE+1]; - STRLEN foldlen = 0; /* num bytes in fold of 1st char */ - STRLEN total_foldlen = 0; /* num bytes in fold of all - chars */ - - if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) { - - /* Here, only need to fold the first char of the target - * string. It the source wasn't utf8, is 1 byte long */ - to_utf8_fold(utf8_p, folded, &foldlen); - total_foldlen = foldlen; - map_fold_len_back[foldlen] = (utf8_target) - ? UTF8SKIP(utf8_p) - : 1; - } - else { - - /* Here, need to fold more than the first char. Do so - * up to the limits */ - U8* source_ptr = utf8_p; /* The source for the fold - is the regex target - string */ - U8* folded_ptr = folded; - U8* e = utf8_p + maxlen; /* Can't go beyond last - available byte in the - target string */ - U8 i; - for (i = 0; - i < UTF8_MAX_FOLD_CHAR_EXPAND && source_ptr < e; - i++) - { - - /* Fold the next character */ - U8 this_char_folded[UTF8_MAXBYTES_CASE+1]; - STRLEN this_char_foldlen; - to_utf8_fold(source_ptr, - this_char_folded, - &this_char_foldlen); - - /* Bail if it would exceed the byte limit for - * folding a single char. */ - if (this_char_foldlen + folded_ptr - folded > - UTF8_MAXBYTES_CASE) - { - break; - } - - /* Add the fold of this character */ - Copy(this_char_folded, - folded_ptr, - this_char_foldlen, - U8); - source_ptr += UTF8SKIP(source_ptr); - folded_ptr += this_char_foldlen; - total_foldlen = folded_ptr - folded; - - /* Create map from the number of bytes in the fold - * back to the number of bytes in the source. If - * the source isn't utf8, the byte count is just - * the number of characters so far */ - map_fold_len_back[total_foldlen] - = (utf8_target) - ? source_ptr - utf8_p - : i + 1; - } - *folded_ptr = '\0'; - } - - - /* Do the linear search to see if the fold is in the list - * of multi-char folds. 
*/ - if (av) { - I32 i; - for (i = 0; i <= av_len(av); i++) { - SV* const sv = *av_fetch(av, i, FALSE); - STRLEN len; - const char * const s = SvPV_const(sv, len); - - if (len <= total_foldlen - && memEQ(s, (char*)folded, len) - - /* If 0, means matched a partial char. See - * [perl #90536] */ - && map_fold_len_back[len]) - { - - /* Advance the target string ptr to account for - * this fold, but have to translate from the - * folded length to the corresponding source - * length. */ - if (lenp) { - *lenp = map_fold_len_back[len]; - } - match = TRUE; - break; - } - } - } - } + } /* If we allocated a string above, free it */ if (! utf8_target) Safefree(utf8_p); @@ -7453,15 +7625,15 @@ S_to_byte_substr(pTHX_ register regexp *prog) if (! sv_utf8_downgrade(sv, TRUE)) { return FALSE; } - if (SvVALID(prog->substrs->data[i].utf8_substr)) { - if (SvTAIL(prog->substrs->data[i].utf8_substr)) { - /* Trim the trailing \n that fbm_compile added last - time. */ - SvCUR_set(sv, SvCUR(sv) - 1); - fbm_compile(sv, FBMcf_TAIL); - } else - fbm_compile(sv, 0); - } + if (SvVALID(prog->substrs->data[i].utf8_substr)) { + if (SvTAIL(prog->substrs->data[i].utf8_substr)) { + /* Trim the trailing \n that fbm_compile added last + time. */ + SvCUR_set(sv, SvCUR(sv) - 1); + fbm_compile(sv, FBMcf_TAIL); + } else + fbm_compile(sv, 0); + } prog->substrs->data[i].substr = sv; if (prog->substrs->data[i].utf8_substr == prog->check_utf8) prog->check_substr = sv;