X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/7cdde5444c9ad8cccf237ec340ddb54f58ce3cf0..8f9aa6a33b8add4f811c7b17012eee9544cda48b:/regexec.c diff --git a/regexec.c b/regexec.c index 3ca8451..476a966 100644 --- a/regexec.c +++ b/regexec.c @@ -127,7 +127,7 @@ /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ - if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END + if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") @@ -179,94 +179,115 @@ #define RE_utf8_posix_digit PL_utf8_posix_digit #endif - -#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME: \ - if (!nextchr) \ - sayNO; \ - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if (!(OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - /* Drops through to the macro that calls this one */ - -#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ - if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - -/* Almost identical to the above, but has a case for a node that matches chars - * between 128 and 255 using Unicode (latin1) semantics. */ -#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ - _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ - if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - -#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME : \ - if (!nextchr && locinput >= PL_regeol) \ - sayNO; \ - if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if ((OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } - -#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ - if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - - -#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ - _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \ - if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - - +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ + +/* The actual code for CCC_TRY, which uses several variables from the routine + * it's callable from. It is designed to be the bulk of a case statement. + * FUNC is the macro or function to call on non-utf8 targets that indicate if + * nextchr matches the class. + * UTF8_TEST is the whole test string to use for utf8 targets + * LOAD is what to use to test, and if not present to load in the swash for the + * class + * POS_OR_NEG is either empty or ! to complement the results of FUNC or + * UTF8_TEST test. + * The logic is: Fail if we're at the end-of-string; otherwise if the target is + * utf8 and a variant, load the swash if necessary and test using the utf8 + * test. Advance to the next character if test is ok, otherwise fail; If not + * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it + * fails, or advance to the next character */ + +#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \ + if (locinput >= PL_regeol) { \ + sayNO; \ + } \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + LOAD_UTF8_CHARCLASS(CLASS, STR); \ + if (POS_OR_NEG (UTF8_TEST)) { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } \ + if (POS_OR_NEG (FUNC(nextchr))) { \ + sayNO; \ + } \ + nextchr = UCHARAT(++locinput); \ + break; + +/* Handle the non-locale cases for a character class and its complement. It + * calls _CCC_TRY_CODE with a ! to complement the test for the character class. + * This is because that code fails when the test succeeds, so we want to have + * the test fail so that the code succeeds. The swash is stored in a + * predictable PL_ place */ +#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \ + CLASS, STR) \ + case NAME: \ + _CCC_TRY_CODE( !, FUNC, \ + cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ + (U8*)locinput, TRUE)), \ + CLASS, STR) \ + case NNAME: \ + _CCC_TRY_CODE( PLACEHOLDER , FUNC, \ + cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ + (U8*)locinput, TRUE)), \ + CLASS, STR) \ + +/* Generate the case statements for both locale and non-locale character + * classes in regmatch for classes that don't have special unicode semantics. + * Locales don't use an immediate swash, but an intermediary special locale + * function that is called on the pointer to the current place in the input + * string. That function will resolve to needing the same swash. One might + * think that because we don't know what the locale will match, we shouldn't + * check with the swash loading function that it loaded properly; ie, that we + * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the + * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is + * irrelevant here */ +#define CCC_TRY(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \ + case NNAMEL: \ + PL_reg_flags |= RF_tainted; \ + _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \ + CLASS, STR) \ + case NAMEA: \ + if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \ + sayNO; \ + } \ + /* Matched a utf8-invariant, so don't have to worry about utf8 */ \ + nextchr = UCHARAT(++locinput); \ + break; \ + case NNAMEA: \ + if (locinput >= PL_regeol || FUNCA(nextchr)) { \ + sayNO; \ + } \ + if (utf8_target) { \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + } \ + else { \ + nextchr = UCHARAT(++locinput); \ + } \ + break; \ + /* Generate the non-locale cases */ \ + _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) + +/* This is like CCC_TRY, but has an extra set of parameters for generating case + * statements to handle separate Unicode semantics nodes */ +#define CCC_TRY_U(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEU, NNAMEU, FUNCU, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + CCC_TRY(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR) /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ @@ -297,12 +318,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU ) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -527,7 +549,7 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend, a) Anchored substring; b) Fixed substring; c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distingush positions; + d) First node (of those at offset 0) which may distinguish positions; We use a)b)d) and multiline-part of c), and try to find a position in the string which does not contradict any of them. */ @@ -1028,7 +1050,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, prog->float_substr = prog->float_utf8 = NULL; /* clear */ check = NULL; /* abort */ s = strpos; - /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag + /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag see http://bugs.activestate.com/show_bug.cgi?id=87173 */ if (prog->intflags & PREGf_IMPLICIT) prog->extflags &= ~RXf_ANCH_MBOL; @@ -1054,7 +1076,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, even for \b or \B. But (minlen? 1 : 0) below assumes that regstclass does not come from lookahead... */ /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF only, which is dealt with in find_byclass(). */ + This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? CHR_DIST(str+STR_LEN(progi->regstclass), str) @@ -1129,7 +1151,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, } if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ goto fail; - /* Check is floating subtring. */ + /* Check is floating substring. */ retry_floating_check: t = check_at - start_shift; DEBUG_EXECUTE_r( what = "floating" ); @@ -1244,12 +1266,18 @@ s += len #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ STMT_START { \ + re_fold_t folder; \ + switch (OP(c)) { \ + case EXACTFU: folder = foldEQ_latin1; break; \ + case EXACTFL: folder = foldEQ_locale; break; \ + case EXACTF: folder = foldEQ; break; \ + default: \ + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \ + } \ while (s <= e) { \ if ( (CoNd) \ - && (ln == 1 || (OP(c) == EXACTF \ - ? foldEQ(s, m, ln) \ - : foldEQ_locale(s, m, ln))) \ - && (!reginfo || regtry(reginfo, &s)) ) \ + && (ln == 1 || folder(s, m, ln)) \ + && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ s++; \ } \ @@ -1305,8 +1333,7 @@ if ((!reginfo || regtry(reginfo, &s))) \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ if (utf8_target) { \ @@ -1315,8 +1342,7 @@ if ((!reginfo || regtry(reginfo, &s))) \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \ PL_reg_flags |= RF_tainted; \ @@ -1325,12 +1351,91 @@ if ((!reginfo || regtry(reginfo, &s))) \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) + +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ + if (s == PL_bostr) { \ + tmp = '\n'; \ + } \ + else { \ + U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \ + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TeSt1_UtF8; \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( \ + if (tmp == ! (TeSt2_UtF8)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it in either the if or else clause, + * with the other one being empty */ +#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to + * be passed in completely with the variable name being tested, which isn't + * such a clean interface, but this is easier to read than it was before. We + * are looking for the boundary (or non-boundary between a word and non-word + * character. The utf8 and non-utf8 cases have the same logic, but the details + * must be different. Find the "wordness" of the character just prior to this + * one, and compare it with the wordness of this one. If they differ, we have + * a boundary. At the beginning of the string, pretend that the previous + * character was a new-line */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \ + goto got_it; + /* We know what class REx starts with. Try to find this position... */ /* if reginfo is NULL, its a dryrun */ /* annoyingly all the vars in this routine have different names from their counterparts @@ -1357,9 +1462,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { + case ANYOFV: case ANYOF: - if (utf8_target) { - REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || + if (utf8_target || OP(c) == ANYOFV) { + REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) || !UTF8_IS_INVARIANT((U8)s[0]) ? reginclass(prog, c, (U8*)s, 0, utf8_target) : REGINCLASS(prog, c, (U8*)s)); @@ -1392,6 +1498,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, tmp = doevery; ); break; + case EXACTFU: case EXACTF: m = STRING(c); ln = STR_LEN(c); /* length to match in octets/bytes */ @@ -1431,7 +1538,18 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { c1 = *(U8*)m; - c2 = PL_fold[c1]; + if (utf8_target || OP(c) == EXACTFU) { + + /* Micro sign folds to GREEK SMALL LETTER MU; + LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets + c2 to the first 's' of the pair, and the code below will + look for others */ + c2 = (c1 == MICRO_SIGN) + ? GREEK_SMALL_LETTER_MU + : (c1 == LATIN_SMALL_LETTER_SHARP_S) + ? 's' + : PL_fold_latin1[c1]; + } else c2 = PL_fold[c1]; } goto do_exactf; case EXACTFL: @@ -1510,183 +1628,215 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case BOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUND: - if (utf8_target) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = ((OP(c) == BOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(OP(c) == BOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : - isALNUM_LC_utf8((U8*)s))) - { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - else { /* Not utf8 */ - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = cBOOL((OP(c) == BOUNDL) - ? isALNUM_LC(tmp) - : (isWORDCHAR_L1(tmp) - && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); - REXEC_FBC_SCAN( - if (tmp == - !((OP(c) == BOUNDL) - ? isALNUM_LC(*s) - : (isWORDCHAR_L1((U8) *s) - && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) - { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_BOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); break; case NBOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ + FBC_NBOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case BOUNDA: + FBC_BOUND_NOLOAD(isWORDCHAR_A, + isWORDCHAR_A(tmp), + isWORDCHAR_A((U8*)s)); + break; case NBOUND: - if (utf8_target) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = ((OP(c) == NBOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(OP(c) == NBOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : - isALNUM_LC_utf8((U8*)s))) - tmp = !tmp; - else REXEC_FBC_TRYIT; - ); - } - else { - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = cBOOL((OP(c) == NBOUNDL) - ? isALNUM_LC(tmp) - : (isWORDCHAR_L1(tmp) - && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); - REXEC_FBC_SCAN( - if (tmp == ! cBOOL( - (OP(c) == NBOUNDL) - ? isALNUM_LC(*s) - : (isWORDCHAR_L1((U8) *s) - && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) - { - tmp = !tmp; - } - else REXEC_FBC_TRYIT; - ); - } - if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_NBOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case NBOUNDA: + FBC_NBOUND_NOLOAD(isWORDCHAR_A, + isWORDCHAR_A(tmp), + isWORDCHAR_A((U8*)s)); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); break; - case ALNUM: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s) - ); case ALNUML: REXEC_FBC_CSCAN_TAINT( isALNUM_LC_utf8((U8*)s), isALNUM_LC(*s) ); + break; + case ALNUMU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + isWORDCHAR_L1((U8) *s) + ); + break; + case ALNUM: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + isWORDCHAR((U8) *s) + ); + break; + case ALNUMA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s)); + break; + case NALNUMU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + ! isWORDCHAR_L1((U8) *s) + ); + break; case NALNUM: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), - ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)) + ! isALNUM(*s) + ); + break; + case NALNUMA: + REXEC_FBC_CSCAN( + !isWORDCHAR_A(*s), + !isWORDCHAR_A(*s) ); + break; case NALNUML: REXEC_FBC_CSCAN_TAINT( !isALNUM_LC_utf8((U8*)s), !isALNUM_LC(*s) ); + break; + case SPACEU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + isSPACE_L1((U8) *s) + ); + break; case SPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), - isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)) + isSPACE((U8) *s) ); + break; + case SPACEA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isSPACE_A(*s)); + break; case SPACEL: REXEC_FBC_CSCAN_TAINT( - *s == ' ' || isSPACE_LC_utf8((U8*)s), + isSPACE_LC_utf8((U8*)s), isSPACE_LC(*s) ); + break; + case NSPACEU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + ! isSPACE_L1((U8) *s) + ); + break; case NSPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), - !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))) + ! isSPACE((U8) *s) + ); + break; + case NSPACEA: + REXEC_FBC_CSCAN( + !isSPACE_A(*s), + !isSPACE_A(*s) ); + break; case NSPACEL: REXEC_FBC_CSCAN_TAINT( - !(*s == ' ' || isSPACE_LC_utf8((U8*)s)), + !isSPACE_LC_utf8((U8*)s), !isSPACE_LC(*s) ); + break; case DIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), isDIGIT(*s) ); + break; + case DIGITA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s)); + break; case DIGITL: REXEC_FBC_CSCAN_TAINT( isDIGIT_LC_utf8((U8*)s), isDIGIT_LC(*s) ); + break; case NDIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), !isDIGIT(*s) ); + break; + case NDIGITA: + REXEC_FBC_CSCAN( + !isDIGIT_A(*s), + !isDIGIT_A(*s) + ); + break; case NDIGITL: REXEC_FBC_CSCAN_TAINT( !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); + break; case LNBREAK: REXEC_FBC_CSCAN( is_LNBREAK_utf8(s), is_LNBREAK_latin1(s) ); + break; case VERTWS: REXEC_FBC_CSCAN( is_VERTWS_utf8(s), is_VERTWS_latin1(s) ); + break; case NVERTWS: REXEC_FBC_CSCAN( !is_VERTWS_utf8(s), !is_VERTWS_latin1(s) ); + break; case HORIZWS: REXEC_FBC_CSCAN( is_HORIZWS_utf8(s), is_HORIZWS_latin1(s) ); + break; case NHORIZWS: REXEC_FBC_CSCAN( !is_HORIZWS_utf8(s), !is_HORIZWS_latin1(s) ); + break; case AHOCORASICKC: case AHOCORASICK: { @@ -1780,10 +1930,16 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, PerlIO_printf( Perl_debug_log, " Scanning for legal start char...\n"); } - ); - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - uc++; - } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } s= (char *)uc; } if (uc >(U8*)last_start) break; @@ -2131,7 +2287,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { - /* the warning about reginfo.ganch being used without intialization + /* the warning about reginfo.ganch being used without initialization is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN and we only enter this block when the same bit is set. */ char *tmp_s = reginfo.ganch - prog->gofs; @@ -2758,7 +2914,7 @@ The only exceptions to this are lookahead/behind assertions and the cut, (?>A), which pop all the backtrack states associated with A before continuing. -Bascktrack state structs are allocated in slabs of about 4K in size. +Backtrack state structs are allocated in slabs of about 4K in size. PL_regmatch_state and st always point to the currently active state, and PL_regmatch_slab points to the slab currently containing PL_regmatch_state. The first time regmatch() is called, the first slab is @@ -2977,7 +3133,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) SV *popmark = NULL; /* are we looking for a mark? */ SV *sv_commit = NULL; /* last mark name seen in failure */ SV *sv_yes_mark = NULL; /* last mark name we have seen - during a successfull match */ + during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; SV* const oreplsv = GvSV(PL_replgv); @@ -3217,7 +3373,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) "%*s %smatched empty string...%s\n", REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) ); - break; + if (!trie->jump) + break; } else { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3467,7 +3624,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.nextword, tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII ) : "not compiled under -Dr", PL_colors[5] ); @@ -3531,11 +3688,27 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) nextchr = UCHARAT(locinput); break; } - case EXACTFL: + case EXACTFL: { + re_fold_t folder; + const U8 * fold_array; + const char * s; + PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case EXACTF: { - char * const s = STRING(scan); + folder = foldEQ_locale; + fold_array = PL_fold_locale; + goto do_exactf; + + case EXACTFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_exactf; + + case EXACTF: + folder = foldEQ; + fold_array = PL_fold; + + do_exactf: + s = STRING(scan); ln = STR_LEN(scan); if (utf8_target || UTF_PATTERN) { @@ -3568,27 +3741,34 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && - UCHARAT(s) != ((OP(scan) == EXACTF) - ? PL_fold : PL_fold_locale)[nextchr]) + UCHARAT(s) != fold_array[nextchr]) + { sayNO; + } if (PL_regeol - locinput < ln) sayNO; - if (ln > 1 && (OP(scan) == EXACTF - ? ! foldEQ(s, locinput, ln) - : ! foldEQ_locale(s, locinput, ln))) + if (ln > 1 && ! folder(s, locinput, ln)) sayNO; locinput += ln; nextchr = UCHARAT(locinput); break; - } + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ case BOUNDL: case NBOUNDL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: + case BOUNDU: + case BOUNDA: case NBOUND: + case NBOUNDU: + case NBOUNDA: /* was last char in word? */ - if (utf8_target) { + if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) { if (locinput == PL_bostr) ln = '\n'; else { @@ -3596,7 +3776,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); } - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isALNUM_uni(ln); LOAD_UTF8_CHARCLASS_ALNUM(); n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target); @@ -3607,36 +3787,56 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ ln = (locinput != PL_bostr) ? UCHARAT(locinput - 1) : '\n'; - if (FLAGS(scan) & USE_UNI) { - - /* Here, can't be BOUNDL or NBOUNDL because they never set - * the flags to USE_UNI */ - ln = isWORDCHAR_L1(ln); - n = isWORDCHAR_L1(nextchr); - } - else if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isALNUM(ln); + n = isALNUM(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + break; } } - if (((!ln) == (!n)) == (OP(scan) == BOUND || - OP(scan) == BOUNDL)) + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) sayNO; break; + case ANYOFV: case ANYOF: - if (utf8_target) { + if (utf8_target || state_num == ANYOFV) { STRLEN inclasslen = PL_regeol - locinput; if (locinput >= PL_regeol) sayNO; if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target)) - goto anyof_fail; + sayNO; locinput += inclasslen; nextchr = UCHARAT(locinput); break; @@ -3647,30 +3847,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (!nextchr && locinput >= PL_regeol) sayNO; if (!REGINCLASS(rex, scan, (U8*)locinput)) - goto anyof_fail; + sayNO; nextchr = UCHARAT(++locinput); break; } - anyof_fail: - /* If we might have the case of the German sharp s - * in a casefolding Unicode character class. */ - - if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { - locinput += SHARP_S_SKIP; - nextchr = UCHARAT(locinput); - } - else - sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); - CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); - - CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); - CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); - - CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); - CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); + CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR, + ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8, + ALNUMU, NALNUMU, isWORDCHAR_L1, + ALNUMA, NALNUMA, isWORDCHAR_A, + perl_word, "a"); + + CCC_TRY_U(SPACE, NSPACE, isSPACE, + SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8, + SPACEU, NSPACEU, isSPACE_L1, + SPACEA, NSPACEA, isSPACE_A, + perl_space, " "); + + CCC_TRY(DIGIT, NDIGIT, isDIGIT, + DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8, + DIGITA, NDIGITA, isDIGIT_A, + posix_digit, "0"); case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ @@ -3886,31 +4084,74 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; case NREFFL: - { + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these, it makes C++ unhappy */ char *s; char type; + re_fold_t folder; + const U8 *fold_array; + PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NREF: + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + goto do_nref; + + case NREFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + goto do_nref; + case NREFF: - type = OP(scan); + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + goto do_nref; + + case NREF: + type = REF; + folder = NULL; + fold_array = NULL; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ n = reg_check_named_buff_matched(rex,scan); - if ( n ) { - type = REF + ( type - NREF ); - goto do_ref; - } else { + if ( ! n ) { sayNO; - } - /* unreached */ + } + goto do_nref_ref_common; + case REFFL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + goto do_ref; + + case REFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_ref; + + case REFF: + folder = foldEQ; + fold_array = PL_fold; + goto do_ref; + case REF: - case REFF: - n = ARG(scan); /* which paren pair */ + folder = NULL; + fold_array = NULL; + + do_ref: type = OP(scan); - do_ref: + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: ln = PL_regoffs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) @@ -3919,49 +4160,40 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; s = PL_bostr + ln; - if (utf8_target && type != REF) { /* REF can do byte comparison */ - char *l = locinput; - const char *e = PL_bostr + PL_regoffs[n].end; - /* - * Note that we can't do the "other character" lookup trick as - * in the 8-bit case (no pun intended) because in Unicode we - * have to map both upper and title case to lower case. - */ - if (type == REFF) { - while (s < e) { - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; - U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; - - if (l >= PL_regeol) - sayNO; - toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); - toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); - if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) - sayNO; - s += ulen1; - l += ulen2; - } + if (type != REF /* REF can do byte comparison */ + && (utf8_target + || (type == REFFU + && (*s == (char) LATIN_SMALL_LETTER_SHARP_S + || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S)))) + { /* XXX handle REFFL better */ + char * limit = PL_regeol; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by PL_regeol, and returns in + * limit upon success, how much of the current input was + * matched */ + if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target)) + { + sayNO; } - locinput = l; + locinput = limit; nextchr = UCHARAT(locinput); break; } - /* Inline the first character, for speed. */ + /* Not utf8: Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && (type == REF || - (UCHARAT(s) != (type == REFF - ? PL_fold : PL_fold_locale)[nextchr]))) + UCHARAT(s) != fold_array[nextchr])) sayNO; ln = PL_regoffs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF ? memNE(s, locinput, ln) - : (type == REFF - ? ! foldEQ(s, locinput, ln) - : ! foldEQ_locale(s, locinput, ln)))) + : ! folder(s, locinput, ln))) sayNO; locinput += ln; nextchr = UCHARAT(locinput); @@ -4843,12 +5075,12 @@ NULL { ST.c1 = (U8)*STRING(text_node); - ST.c2 = - (IS_TEXTF(text_node)) - ? PL_fold[ST.c1] - : (IS_TEXTFL(text_node)) - ? PL_fold_locale[ST.c1] - : ST.c1; + switch (OP(text_node)) { + case EXACTF: ST.c2 = PL_fold[ST.c1]; break; + case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; + case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; + default: ST.c2 = ST.c1; + } } } } @@ -4995,14 +5227,16 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ if (!UTF_PATTERN) { - ST.c2 = ST.c1 = *s; - if (IS_TEXTF(text_node)) - ST.c2 = PL_fold[ST.c1]; - else if (IS_TEXTFL(text_node)) - ST.c2 = PL_fold_locale[ST.c1]; + ST.c1 = *s; + switch (OP(text_node)) { + case EXACTF: ST.c2 = PL_fold[ST.c1]; break; + case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; + case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; + default: ST.c2 = ST.c1; break; + } } else { /* UTF_PATTERN */ - if (IS_TEXTF(text_node)) { + if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) { STRLEN ulen1, ulen2; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; @@ -5468,7 +5702,7 @@ NULL n = ARG(scan); if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { locinput += ln; - } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) { + } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) { sayNO; } else { U8 folded[UTF8_MAXBYTES_CASE+1]; @@ -5754,107 +5988,102 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; + case EXACT: + /* To get here, EXACTish nodes must have *byte* length == 1. That + * means they match only characters in the string that can be expressed + * as a single byte. For non-utf8 strings, that means a simple match. + * For utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as since it's a single byte, it either isn't utf8, or if + * it is, it's an invariant */ + + c = (U8)*STRING(p); + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (! utf8_target || UNI_IS_INVARIANT(c)) { + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the pattern char is different + * in utf8 than not, so can't compare them directly. Outside the + * loop, find find the two utf8 bytes that represent c, and then + * look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; case EXACTFL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ - case EXACT: case EXACTF: - /* To get here, EXACTish nodes must have *byte* length == 1. That means - * they match only characters in the string that can be expressed as a - * single byte. For non-utf8 strings, that means a simple match. For - * utf8 strings, the character matched must be an invariant, or - * downgradable to a single byte. The pattern's utf8ness is - * irrelevant, as it must be a single byte, so either it isn't utf8, or - * if it is it's an invariant */ + case EXACTFU: + + /* The comments for the EXACT case above apply as well to these fold + * ones */ c = (U8)*STRING(p); assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); - if ((! utf8_target) || UNI_IS_INVARIANT(c)) { + if (utf8_target) { /* Use full Unicode fold matching */ - /* Here, the string isn't utf8, or the character in the EXACT - * node is the same in utf8 as not, so can just do equality. - * Each matching char must be 1 byte long */ - switch (OP(p)) { - case EXACT: - while (scan < loceol && UCHARAT(scan) == c) { - scan++; - } - break; - case EXACTF: - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) - { - scan++; - } - break; - case EXACTFL: - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) - { - scan++; - } - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + /* For the EXACTFL case, It doesn't really make sense to compare + * locale and utf8, but it is best we can do. The documents warn + * against mixing them */ + + char *tmpeol = loceol; + while (hardcount < max + && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, 1, cBOOL(UTF_PATTERN))) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; } + + /* XXX Note that the above handles properly the German sharp s in + * the pattern matching ss in the string. But it doesn't handle + * properly cases where the string contains say 'LIGATURE ff' and + * the pattern is 'f+'. This would require, say, a new function or + * revised interface to foldEQ_utf8(), in which the maximum number + * of characters to match could be passed and it would return how + * many actually did. This is just one of many cases where + * multi-char folds don't work properly, and so the fix is being + * deferred */ } else { + U8 folded; - /* Here, the string is utf8, and the pattern char is different - * in utf8 than not. */ - + /* Here, the string isn't utf8 and c is a single byte; and either + * the pattern isn't utf8 or c is an invariant, so its utf8ness + * doesn't affect c. Can just do simple comparisons for exact or + * fold matching. */ switch (OP(p)) { - case EXACT: - { - /* Fastest to find the two utf8 bytes that represent c, and - * then look for those in sequence in the utf8 string */ - U8 high = UTF8_TWO_BYTE_HI(c); - U8 low = UTF8_TWO_BYTE_LO(c); - loceol = PL_regeol; - - while (hardcount < max - && scan + 1 < loceol - && UCHARAT(scan) == high - && UCHARAT(scan + 1) == low) - { - scan += 2; - hardcount++; - } - } - break; - case EXACTFL: /* Doesn't really make sense, but is best we can - do. The documents warn against mixing locale - and utf8 */ - case EXACTF: - { /* utf8 string, so use utf8 foldEQ */ - char *tmpeol = loceol; - while (hardcount < max - && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, - STRING(p), NULL, 1, UTF_PATTERN)) - { - scan = tmpeol; - tmpeol = loceol; - hardcount++; - } - - /* XXX Note that the above handles properly the German - * sharp ss in the pattern matching ss in the string. But - * it doesn't handle properly cases where the string - * contains say 'LIGATURE ff' and the pattern is 'f+'. - * This would require, say, a new function or revised - * interface to foldEQ_utf8(), in which the maximum number - * of characters to match could be passed and it would - * return how many actually did. This is just one of many - * cases where multi-char folds don't work properly, and so - * the fix is being deferred */ - } - break; - default: - Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + case EXACTF: folded = PL_fold[c]; break; + case EXACTFU: folded = PL_fold_latin1[c]; break; + case EXACTFL: folded = PL_fold_locale[c]; break; + default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + } + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == folded)) + { + scan++; } } break; + case ANYOFV: case ANYOF: if (utf8_target) { loceol = PL_regeol; @@ -5868,8 +6097,9 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case ALNUM: + case ALNUMU: if (utf8_target) { + utf8_wordchar: loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && @@ -5878,14 +6108,22 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan += UTF8SKIP(scan); hardcount++; } - } else if (FLAGS(p) & USE_UNI) { + } else { while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { scan++; } - } else { - while (scan < loceol && isALNUM((U8) *scan)) { - scan++; - } + } + break; + case ALNUM: + if (utf8_target) + goto utf8_wordchar; + while (scan < loceol && isALNUM((U8) *scan)) { + scan++; + } + break; + case ALNUMA: + while (scan < loceol && isWORDCHAR_A((U8) *scan)) { + scan++; } break; case ALNUML: @@ -5902,24 +6140,42 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case NALNUM: + case NALNUMU: if (utf8_target) { + + utf8_Nwordchar: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } - } else if (FLAGS(p) & USE_UNI) { + } else { while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { scan++; } - } else { - while (scan < loceol && ! isALNUM((U8) *scan)) { - scan++; - } + } + break; + case NALNUM: + if (utf8_target) + goto utf8_Nwordchar; + while (scan < loceol && ! isALNUM((U8) *scan)) { + scan++; + } + break; + case NALNUMA: + if (utf8_target) { + while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { + scan++; + } } break; case NALNUML: @@ -5936,8 +6192,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case SPACE: + case SPACEU: if (utf8_target) { + + utf8_space: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && @@ -5947,13 +6206,25 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan += UTF8SKIP(scan); hardcount++; } - } else if (FLAGS(p) & USE_UNI) { + break; + } + else { while (scan < loceol && isSPACE_L1((U8) *scan)) { scan++; } - } else { - while (scan < loceol && isSPACE((U8) *scan)) - scan++; + break; + } + case SPACE: + if (utf8_target) + goto utf8_space; + + while (scan < loceol && isSPACE((U8) *scan)) { + scan++; + } + break; + case SPACEA: + while (scan < loceol && isSPACE_A((U8) *scan)) { + scan++; } break; case SPACEL: @@ -5961,7 +6232,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5970,25 +6241,46 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case NSPACE: + case NSPACEU: if (utf8_target) { + + utf8_Nspace: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && - !(*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + ! (*scan == ' ' || + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) { scan += UTF8SKIP(scan); hardcount++; } - } else if (FLAGS(p) & USE_UNI) { + break; + } + else { while (scan < loceol && ! isSPACE_L1((U8) *scan)) { scan++; } - } else { - while (scan < loceol && ! isSPACE((U8) *scan)) { - scan++; - } + } + break; + case NSPACE: + if (utf8_target) + goto utf8_Nspace; + + while (scan < loceol && ! isSPACE((U8) *scan)) { + scan++; + } + break; + case NSPACEA: + if (utf8_target) { + while (scan < loceol && ! isSPACE_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isSPACE_A((U8) *scan)) { + scan++; + } } break; case NSPACEL: @@ -5996,7 +6288,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + !isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -6019,6 +6311,25 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + case DIGITA: + while (scan < loceol && isDIGIT_A((U8) *scan)) { + scan++; + } + break; + case DIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isDIGIT_LC(*scan)) + scan++; + } + break; case NDIGIT: if (utf8_target) { loceol = PL_regeol; @@ -6032,6 +6343,33 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) while (scan < loceol && !isDIGIT(*scan)) scan++; } + break; + case NDIGITA: + if (utf8_target) { + while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + scan++; + } + } + break; + case NDIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + !isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isDIGIT_LC(*scan)) + scan++; + } + break; case LNBREAK: if (utf8_target) { loceol = PL_regeol; @@ -6196,6 +6534,10 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool bytes in p were matched. If there was no match, the value is undefined, possibly changed from the input. + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale + */ STATIC bool @@ -6240,69 +6582,75 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; - else if (flags & ANYOF_FOLD) { - U8 f; - - if (flags & ANYOF_LOCALE) { - PL_reg_flags |= RF_tainted; - f = PL_fold_locale[c]; - } - else - f = PL_fold[c]; - if (f != c && ANYOF_BITMAP_TEST(n, f)) - match = TRUE; + else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; } - - if (!match && (flags & ANYOF_CLASS)) { + + else if (flags & ANYOF_LOCALE) { PL_reg_flags |= RF_tainted; - if ( - (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) - ) /* How's that for a conditional? */ + + if ((flags & ANYOF_LOC_NONBITMAP_FOLD) + && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { match = TRUE; } + else if (ANYOF_CLASS_TEST_ANY_SET(n) && + ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + ) /* How's that for a conditional? */ + ) { + match = TRUE; + } } } /* If the bitmap didn't (or couldn't) match, and something outside the * bitmap could match, try that */ - if (! match && utf8_target || (flags & ANYOF_UNICODE)) { - if (utf8_target && !ANYOF_RUNTIME(n)) { - if (c < 256 && ANYOF_BITMAP_TEST(n, c)) + if (!match) { + if (utf8_target && (flags & ANYOF_UNICODE_ALL)) { + if (c >= 256 + || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a + non-Latin1 fold + should match */ + && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c))) + { match = TRUE; + } } - if (!match && utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) - match = TRUE; - if (!match) { + if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && flags & ANYOF_UTF8))) + { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); @@ -6311,33 +6659,230 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, if (utf8_target) { utf8_p = (U8 *) p; } else { - STRLEN len = 1; + + /* Not utf8. Convert as much of the string as available up + * to the limit of how far the (single) character in the + * pattern can possibly match (no need to go further). If + * the node is a straight ANYOF or not folding, it can't + * match more than one. Otherwise, It can match up to how + * far a single char can fold to. Since not utf8, each + * character is a single byte, so the max it can be in + * bytes is the same as the max it can be in characters */ + STRLEN len = (OP(n) == ANYOF + || ! (flags & ANYOF_LOC_NONBITMAP_FOLD)) + ? 1 + : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND) + ? maxlen + : UTF8_MAX_FOLD_CHAR_EXPAND; utf8_p = bytes_to_utf8(p, &len); } - if (swash_fetch(sw, utf8_p, 1)) + + if (swash_fetch(sw, utf8_p, TRUE)) match = TRUE; - else if (flags & ANYOF_FOLD) { - if (!match && lenp && av) { + else if (flags & ANYOF_LOC_NONBITMAP_FOLD) { + + /* Here, we need to test if the fold of the target string + * matches. In the case of a multi-char fold that is + * caught by regcomp.c, it has stored all such folds into + * 'av'; we linearly check to see if any match the target + * string (folded). We know that the originals were each + * one character, but we don't currently know how many + * characters/bytes each folded to, except we do know that + * there are small limits imposed by Unicode. XXX A + * performance enhancement would be to have regcomp.c store + * the max number of chars/bytes that are in an av entry, + * as, say the 0th element. Even better would be to have a + * hash of the few characters that can start a multi-char + * fold to the max number of chars of those folds. + * + * Further down, if there isn't a + * match in the av, we will check if there is another + * fold-type match. For that, we also need the fold, but + * only the first character. No sense in folding it twice, + * so we do it here, even if there isn't any multi-char + * fold, so we always fold at least the first character. + * If the node is a straight ANYOF node, or there is only + * one character available in the string, or if there isn't + * any av, that's all we have to fold. In the case of a + * multi-char fold, we do have guarantees in Unicode that + * it can only expand up to so many characters and so many + * bytes. We keep track so don't exceed either. + * + * If there is a match, we will need to advance (if lenp is + * specified) the match pointer in the target string. But + * what we are comparing here isn't that string directly, + * but its fold, whose length may differ from the original. + * As we go along in constructing the fold, therefore, we + * create a map so that we know how many bytes in the + * source to advance given that we have matched a certain + * number of bytes in the fold. This map is stored in + * 'map_fold_len_back'. The first character in the fold + * has array element 1 contain the number of bytes in the + * source that folded to it; the 2nd is the cumulative + * number to match it; ... */ + U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 }; + U8 folded[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen = 0; /* num bytes in fold of 1st char */ + STRLEN foldlen_for_av; /* num bytes in fold of all chars */ + + if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) { + + /* Here, only need to fold the first char of the target + * string */ + to_utf8_fold(utf8_p, folded, &foldlen); + foldlen_for_av = foldlen; + map_fold_len_back[1] = UTF8SKIP(utf8_p); + } + else { + + /* Here, need to fold more than the first char. Do so + * up to the limits */ + UV which_char = 0; + U8* source_ptr = utf8_p; /* The source for the fold + is the regex target + string */ + U8* folded_ptr = folded; + U8* e = utf8_p + maxlen; /* Can't go beyond last + available byte in the + target string */ + while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND + && source_ptr < e) + { + + /* Fold the next character */ + U8 this_char_folded[UTF8_MAXBYTES_CASE+1]; + STRLEN this_char_foldlen; + to_utf8_fold(source_ptr, + this_char_folded, + &this_char_foldlen); + + /* Bail if it would exceed the byte limit for + * folding a single char. */ + if (this_char_foldlen + folded_ptr - folded > + UTF8_MAXBYTES_CASE) + { + break; + } + + /* Save the first character's folded length, in + * case we have to use it later */ + if (! foldlen) { + foldlen = this_char_foldlen; + } + + /* Here, add the fold of this character */ + Copy(this_char_folded, + folded_ptr, + this_char_foldlen, + U8); + which_char++; + map_fold_len_back[which_char] = + map_fold_len_back[which_char - 1] + + UTF8SKIP(source_ptr); + folded_ptr += this_char_foldlen; + source_ptr += UTF8SKIP(source_ptr); + } + *folded_ptr = '\0'; + foldlen_for_av = folded_ptr - folded; + } + + + /* Do the linear search to see if the fold is in the list + * of multi-char folds. (Useless to look if won't be able + * to store that it is a multi-char fold in *lenp) */ + if (lenp && av) { I32 i; for (i = 0; i <= av_len(av); i++) { SV* const sv = *av_fetch(av, i, FALSE); STRLEN len; const char * const s = SvPV_const(sv, len); - if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) { - *lenp = len; + if (len <= foldlen_for_av && memEQ(s, + (char*)folded, + len)) + { + + /* Advance the target string ptr to account for + * this fold, but have to translate from the + * folded length to the corresponding source + * length. The array is indexed by how many + * characters in the match */ + *lenp = map_fold_len_back[ + utf8_length(folded, folded + len)]; match = TRUE; break; } } } - if (!match) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; +#if 0 + if (!match) { /* See if the folded version matches */ + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above would have + * just folded the 'k' to itself, and that isn't going + * to match 'K'. So we look through the closure of + * everything that folds to 'k'. That will find the + * 'K'. Initialize the list, if necessary */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a fold + * function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } - STRLEN tmplen; - to_utf8_fold(utf8_p, tmpbuf, &tmplen); - if (swash_fetch(sw, tmpbuf, 1)) - match = TRUE; + /* The data structure is a hash with the keys every + * character that is folded to, like 'k', and the + * values each an array of everything that folds to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) folded, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV i; + for (i = 0; i <= av_len(list); i++) { + SV** try_p = av_fetch(list, i, FALSE); + char* try_c; + if (try_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + /* Don't have to worry about embedded nulls + * since NULL isn't folded or foldable */ + try_c = SvPVX(*try_p); + + /* The fold in a few cases of an above Latin1 + * char is in the Latin1 range, and hence may + * be in the bitmap */ + if (UTF8_IS_INVARIANT(*try_c) + && ANYOF_BITMAP_TEST(n, + UNI_TO_NATIVE(*try_c))) + { + match = TRUE; + break; + } + else if + (UTF8_IS_DOWNGRADEABLE_START(*try_c) + && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE( + TWO_BYTE_UTF8_TO_UNI(try_c[0], + try_c[1])))) + { + /* Since the fold comes from internally + * generated data, we can safely assume it + * is valid utf8 in the test above */ + match = TRUE; + break; + } else if (swash_fetch(sw, (U8*) try_c, TRUE)) { + match = TRUE; + break; + } + } + } } +#endif } /* If we allocated a string above, free it */