X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d1c771f5a95fddf225347623798f65884aa6eee7..df5fcde5a244ac321cdf54f61194eb713c04d040:/regexec.c diff --git a/regexec.c b/regexec.c index ec4c4b0..d866f1c 100644 --- a/regexec.c +++ b/regexec.c @@ -94,7 +94,11 @@ #define STATIC static #endif -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) +/* Valid for non-utf8 strings only: avoids the reginclass call if there are no + * complications: i.e., if everything matchable is straight forward in the + * bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) /* * Forwards. @@ -176,76 +180,106 @@ #endif -#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME: \ - if (!nextchr) \ - sayNO; \ - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if (!(OP(scan) == NAME \ +#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + /* FALL THROUGH */ \ + case NAME: \ + if (!nextchr) \ + sayNO; \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + if (!CAT2(PL_utf8_,CLASS)) { \ + bool ok; \ + ENTER; \ + save_re_context(); \ + ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ + assert(ok); \ + LEAVE; \ + } \ + if (!(OP(scan) == NAME \ ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ + : LCFUNC_utf8((U8*)locinput))) \ + { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } \ + /* Drops through to the macro that calls this one */ + +#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ + _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ break -#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME : \ - if (!nextchr && locinput >= PL_regeol) \ - sayNO; \ - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if ((OP(scan) == NAME \ +/* Almost identical to the above, but has a case for a node that matches chars + * between 128 and 255 using Unicode (latin1) semantics. */ +#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ + _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ + break + +#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + /* FALL THROUGH */ \ + case NAME : \ + if (!nextchr && locinput >= PL_regeol) \ + sayNO; \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + if (!CAT2(PL_utf8_,CLASS)) { \ + bool ok; \ + ENTER; \ + save_re_context(); \ + ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ + assert(ok); \ + LEAVE; \ + } \ + if ((OP(scan) == NAME \ ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ + : LCFUNC_utf8((U8*)locinput))) \ + { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } + +#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ + _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ break +#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ + _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \ + if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ + break /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ /* for use after a quantifier and before an EXACT-like node -- japhy */ -/* it would be nice to rework regcomp.sym to generate this stuff. sigh */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ #define JUMPABLE(rn) ( \ OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ @@ -253,7 +287,6 @@ OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ OP(rn) == PLUS || OP(rn) == MINMOD || \ OP(rn) == KEEPS || \ - /*(PL_regkind[OP(rn)] == VERB && OP(rn) != PRUNE && OP(rn) != COMMIT && OP(rn) != MARKPOINT && OP(rn) != SKIP && OP(rn) != CUTGROUP) || */\ (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -1326,7 +1359,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, switch (OP(c)) { case ANYOF: if (utf8_target) { - REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || + REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(prog, c, (U8*)s, 0, utf8_target) : REGINCLASS(prog, c, (U8*)s)); @@ -1499,12 +1532,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } ); } - else { + else { /* Not utf8 */ tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == BOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( if (tmp == - !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { + !((OP(c) == BOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; REXEC_FBC_TRYIT; } @@ -1537,12 +1577,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? - isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == NBOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( - if (tmp == - !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) + if (tmp == ! cBOOL( + (OP(c) == NBOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; + } else REXEC_FBC_TRYIT; ); } @@ -1553,7 +1600,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - isALNUM(*s) + (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s) ); case ALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1564,7 +1611,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - !isALNUM(*s) + ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)) ); case NALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1575,7 +1622,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), - isSPACE(*s) + isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)) ); case SPACEL: REXEC_FBC_CSCAN_TAINT( @@ -1586,7 +1633,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), - !isSPACE(*s) + !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))) ); case NSPACEL: REXEC_FBC_CSCAN_TAINT( @@ -1733,10 +1780,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, PerlIO_printf( Perl_debug_log, " Scanning for legal start char...\n"); } - ); - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - uc++; - } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } s= (char *)uc; } if (uc >(U8*)last_start) break; @@ -3170,7 +3223,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) "%*s %smatched empty string...%s\n", REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) ); - break; + if (!trie->jump) + break; } else { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3562,7 +3616,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) else { ln = (locinput != PL_bostr) ? UCHARAT(locinput - 1) : '\n'; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) & USE_UNI) { + + /* Here, can't be BOUNDL or NBOUNDL because they never set + * the flags to USE_UNI */ + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + } + else if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM(ln); n = isALNUM(nextchr); } @@ -3578,22 +3639,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case ANYOF: if (utf8_target) { STRLEN inclasslen = PL_regeol - locinput; + if (locinput >= PL_regeol) + sayNO; if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target)) goto anyof_fail; - if (locinput >= PL_regeol) - sayNO; - locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); + locinput += inclasslen; nextchr = UCHARAT(locinput); break; } else { if (nextchr < 0) nextchr = UCHARAT(locinput); - if (!REGINCLASS(rex, scan, (U8*)locinput)) - goto anyof_fail; if (!nextchr && locinput >= PL_regeol) sayNO; + if (!REGINCLASS(rex, scan, (U8*)locinput)) + goto anyof_fail; nextchr = UCHARAT(++locinput); break; } @@ -3609,11 +3670,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); + CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); + CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); - CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); + CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); + CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); @@ -3968,7 +4029,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) COP * const ocurcop = PL_curcop; PAD *old_comppad; char *saved_regeol = PL_regeol; - + struct re_save_state saved_state; + + /* To not corrupt the existing regex state while executing the + * eval we would normally put it on the save stack, like with + * save_re_context. However, re-evals have a weird scoping so we + * can't just add ENTER/LEAVE here. With that, things like + * + * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) + * + * would break, as they expect the localisation to be unwound + * only when the re-engine backtracks through the bit that + * localised it. + * + * What we do instead is just saving the state in a local c + * variable. + */ + Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + n = ARG(scan); PL_op = (OP_4tree*)rexi->data->data[n]; DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, @@ -3990,6 +4068,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PUTBACK; } + Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + PL_op = oop; PAD_RESTORE_LOCAL(old_comppad); PL_curcop = ocurcop; @@ -5681,23 +5761,103 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; - case EXACT: /* length of string is 1 */ - c = (U8)*STRING(p); - while (scan < loceol && UCHARAT(scan) == c) - scan++; - break; - case EXACTF: /* length of string is 1 */ + case EXACT: + /* To get here, EXACTish nodes must have *byte* length == 1. That + * means they match only characters in the string that can be expressed + * as a single byte. For non-utf8 strings, that means a simple match. + * For utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as since it's a single byte, it either isn't utf8, or if + * it is, it's an invariant */ + c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (! utf8_target || UNI_IS_INVARIANT(c)) { + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the pattern char is different + * in utf8 than not, so can't compare them directly. Outside the + * loop, find find the two utf8 bytes that represent c, and then + * look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } break; - case EXACTFL: /* length of string is 1 */ + case EXACTFL: PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case EXACTF: + + /* The comments for the EXACT case apply as well to these fold ones */ + c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (utf8_target) { /* Use full Unicode fold matching */ + + /* For the EXACTFL case, It doesn't really make sense to compare + * locale and utf8, but it is best we can do. The documents warn + * against mixing them */ + + char *tmpeol = loceol; + while (hardcount < max + && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, 1, UTF_PATTERN)) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; + } + + /* XXX Note that the above handles properly the German sharp s in + * the pattern matching ss in the string. But it doesn't handle + * properly cases where the string contains say 'LIGATURE ff' and + * the pattern is 'f+'. This would require, say, a new function or + * revised interface to foldEQ_utf8(), in which the maximum number + * of characters to match could be passed and it would return how + * many actually did. This is just one of many cases where + * multi-char folds don't work properly, and so the fix is being + * deferred */ + } + else { + + /* Here, the string isn't utf8; and either the pattern isn't utf8 + * or c is an invariant, so its utf8ness doesn't affect c. Can + * just do simple comparisons for exact or fold matching. */ + switch (OP(p)) { + case EXACTF: + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) + { + scan++; + } + break; + case EXACTFL: + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) + { + scan++; + } + break; + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + } + } break; case ANYOF: if (utf8_target) { @@ -5717,13 +5877,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) { + swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isALNUM(*scan)) - scan++; + while (scan < loceol && isALNUM((U8) *scan)) { + scan++; + } } break; case ALNUML: @@ -5745,13 +5911,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) { + !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isALNUM(*scan)) - scan++; + while (scan < loceol && ! isALNUM((U8) *scan)) { + scan++; + } } break; case NALNUML: @@ -5774,13 +5946,18 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isSPACE(*scan)) - scan++; + while (scan < loceol && isSPACE((U8) *scan)) + scan++; } break; case SPACEL: @@ -5803,13 +5980,19 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && !(*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isSPACE(*scan)) - scan++; + while (scan < loceol && ! isSPACE((U8) *scan)) { + scan++; + } } break; case NSPACEL: @@ -6005,91 +6188,60 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool /* - reginclass - determine if a character falls into a character class - The n is the ANYOF regnode, the p is the target string, lenp - is pointer to the maximum length of how far to go in the p - (if the lenp is zero, UTF8SKIP(p) is used), - utf8_target tells whether the target string is in UTF-8. + n is the ANYOF regnode + p is the target string + lenp is pointer to the maximum number of bytes of how far to go in p + (This is assumed wthout checking to always be at least the current + character's size) + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. If lenp is not NULL, on return + from a successful match, the value it points to will be updated to how many + bytes in p were matched. If there was no match, the value is undefined, + possibly changed from the input. */ STATIC bool -S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool utf8_target) +S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; - STRLEN len = 0; - STRLEN plen; + STRLEN c_len = 0; + STRLEN maxlen; PERL_ARGS_ASSERT_REGINCLASS; + /* If c is not already the code point, get it */ if (utf8_target && !UTF8_IS_INVARIANT(c)) { - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, + c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for * UTF8_ALLOW_FFFF */ - if (len == (STRLEN)-1) + if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); } + else { + c_len = 1; + } - plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); - if (utf8_target || (flags & ANYOF_UNICODE)) { - if (lenp) - *lenp = 0; - if (utf8_target && !ANYOF_RUNTIME(n)) { - if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) - match = TRUE; - } - if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) - match = TRUE; - if (!match) { - AV *av; - SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); - - if (sw) { - U8 * utf8_p; - if (utf8_target) { - utf8_p = (U8 *) p; - } else { - STRLEN len = 1; - utf8_p = bytes_to_utf8(p, &len); - } - if (swash_fetch(sw, utf8_p, 1)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - if (!match && lenp && av) { - I32 i; - for (i = 0; i <= av_len(av); i++) { - SV* const sv = *av_fetch(av, i, FALSE); - STRLEN len; - const char * const s = SvPV_const(sv, len); - if (len <= plen && memEQ(s, (char*)utf8_p, len)) { - *lenp = len; - match = TRUE; - break; - } - } - } - if (!match) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - - STRLEN tmplen; - to_utf8_fold(utf8_p, tmpbuf, &tmplen); - if (swash_fetch(sw, tmpbuf, 1)) - match = TRUE; - } - } + /* Use passed in max length, or one character if none passed in or less + * than one character. And assume will match just one character. This is + * overwritten later if matched more. */ + if (lenp) { + maxlen = (*lenp > c_len) ? *lenp : c_len; + *lenp = c_len; - /* If we allocated a string above, free it */ - if (! utf8_target) Safefree(utf8_p); - } - } - if (match && lenp && *lenp == 0) - *lenp = UNISKIP(NATIVE_TO_UNI(c)); } - if (!match && c < 256) { + else { + maxlen = c_len; + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if (flags & ANYOF_FOLD) { @@ -6105,7 +6257,7 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const match = TRUE; } - if (!match && (flags & ANYOF_CLASS)) { + if (!match && (flags & ANYOF_CLASS) && ANYOF_CLASS_TEST_ANY_SET(n)) { PL_reg_flags |= RF_tainted; if ( (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || @@ -6145,6 +6297,105 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const } } + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that */ + if (!match) { + if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + match = TRUE; + } + else if ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && flags & ANYOF_UTF8)) + { + AV *av; + SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); + + if (sw) { + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { + STRLEN len = 1; + utf8_p = bytes_to_utf8(p, &len); + } + if (swash_fetch(sw, utf8_p, 1)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + if (!match && lenp && av) { + I32 i; + for (i = 0; i <= av_len(av); i++) { + SV* const sv = *av_fetch(av, i, FALSE); + STRLEN len; + const char * const s = SvPV_const(sv, len); + if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) { + *lenp = len; + match = TRUE; + break; + } + } + } + if (!match) { + U8 folded[UTF8_MAXBYTES_CASE+1]; + + /* See if the folded version matches */ + STRLEN foldlen; + to_utf8_fold(utf8_p, folded, &foldlen); + if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */ + match = TRUE; + } + else { + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above would + * have just folded the 'k' to itself, and that + * isn't going to match 'K'. So we look through + * the closure of everything that folds to 'k'. + * That will find the 'K'. Initialize the list, if + * necessary */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a + * fold function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } + + /* The data structure is a hash with the keys every + * character that is folded to, like 'k', and the + * values each an array of everything that folds to + * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) folded, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV i; + for (i = 0; i <= av_len(list); i++) { + SV** try_p = av_fetch(list, i, FALSE); + if (try_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + /* Don't have to worry about embeded nulls + * since NULL isn't folded or foldable */ + if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) { + match = TRUE; + break; + } + } + } + } + } + } + + /* If we allocated a string above, free it */ + if (! utf8_target) Safefree(utf8_p); + } + } + } + return (flags & ANYOF_INVERT) ? !match : match; }