X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1c8f8eb1d500716bc7368dab00869ae3be841e0c..8f9aa6a33b8add4f811c7b17012eee9544cda48b:/regexec.c diff --git a/regexec.c b/regexec.c index 307e274..476a966 100644 --- a/regexec.c +++ b/regexec.c @@ -85,7 +85,7 @@ #define RF_utf8 8 /* Pattern contains multibyte chars? */ -#define UTF ((PL_reg_flags & RF_utf8) != 0) +#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) #define RS_init 1 /* eval environment created */ #define RS_set 2 /* replsv value is set */ @@ -94,13 +94,17 @@ #define STATIC static #endif -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) +/* Valid for non-utf8 strings only: avoids the reginclass call if there are no + * complications: i.e., if everything matchable is straight forward in the + * bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) /* * Forwards. */ -#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) #define HOPc(pos,off) \ @@ -123,7 +127,7 @@ /* Doesn't do an assert to verify that is correct */ #define LOAD_UTF8_CHARCLASS_NO_CHECK(class) STMT_START { \ - if (!CAT2(PL_utf8_,class)) { bool ok; ENTER; save_re_context(); ok=CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END + if (!CAT2(PL_utf8_,class)) { bool throw_away; ENTER; save_re_context(); throw_away = CAT2(is_utf8_,class)((const U8*)" "); LEAVE; } } STMT_END #define LOAD_UTF8_CHARCLASS_ALNUM() LOAD_UTF8_CHARCLASS(alnum,"a") #define LOAD_UTF8_CHARCLASS_DIGIT() LOAD_UTF8_CHARCLASS(digit,"0") @@ -175,84 +179,135 @@ #define RE_utf8_posix_digit PL_utf8_posix_digit #endif - -#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME: \ - if (!nextchr) \ - sayNO; \ - if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if (!(OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - -#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME : \ - if (!nextchr && locinput >= PL_regeol) \ - sayNO; \ - if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if ((OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ - break - - - - +#define PLACEHOLDER /* Something for the preprocessor to grab onto */ + +/* The actual code for CCC_TRY, which uses several variables from the routine + * it's callable from. It is designed to be the bulk of a case statement. + * FUNC is the macro or function to call on non-utf8 targets that indicate if + * nextchr matches the class. + * UTF8_TEST is the whole test string to use for utf8 targets + * LOAD is what to use to test, and if not present to load in the swash for the + * class + * POS_OR_NEG is either empty or ! to complement the results of FUNC or + * UTF8_TEST test. + * The logic is: Fail if we're at the end-of-string; otherwise if the target is + * utf8 and a variant, load the swash if necessary and test using the utf8 + * test. Advance to the next character if test is ok, otherwise fail; If not + * utf8 or an invariant under utf8, use the non-utf8 test, and fail if it + * fails, or advance to the next character */ + +#define _CCC_TRY_CODE(POS_OR_NEG, FUNC, UTF8_TEST, CLASS, STR) \ + if (locinput >= PL_regeol) { \ + sayNO; \ + } \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + LOAD_UTF8_CHARCLASS(CLASS, STR); \ + if (POS_OR_NEG (UTF8_TEST)) { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } \ + if (POS_OR_NEG (FUNC(nextchr))) { \ + sayNO; \ + } \ + nextchr = UCHARAT(++locinput); \ + break; + +/* Handle the non-locale cases for a character class and its complement. It + * calls _CCC_TRY_CODE with a ! to complement the test for the character class. + * This is because that code fails when the test succeeds, so we want to have + * the test fail so that the code succeeds. The swash is stored in a + * predictable PL_ place */ +#define _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, \ + CLASS, STR) \ + case NAME: \ + _CCC_TRY_CODE( !, FUNC, \ + cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ + (U8*)locinput, TRUE)), \ + CLASS, STR) \ + case NNAME: \ + _CCC_TRY_CODE( PLACEHOLDER , FUNC, \ + cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), \ + (U8*)locinput, TRUE)), \ + CLASS, STR) \ + +/* Generate the case statements for both locale and non-locale character + * classes in regmatch for classes that don't have special unicode semantics. + * Locales don't use an immediate swash, but an intermediary special locale + * function that is called on the pointer to the current place in the input + * string. That function will resolve to needing the same swash. One might + * think that because we don't know what the locale will match, we shouldn't + * check with the swash loading function that it loaded properly; ie, that we + * should use LOAD_UTF8_CHARCLASS_NO_CHECK for those, but what is passed to the + * regular LOAD_UTF8_CHARCLASS is in non-locale terms, and so locale is + * irrelevant here */ +#define CCC_TRY(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + _CCC_TRY_CODE( !, LCFUNC, LCFUNC_utf8((U8*)locinput), CLASS, STR) \ + case NNAMEL: \ + PL_reg_flags |= RF_tainted; \ + _CCC_TRY_CODE( PLACEHOLDER, LCFUNC, LCFUNC_utf8((U8*)locinput), \ + CLASS, STR) \ + case NAMEA: \ + if (locinput >= PL_regeol || ! FUNCA(nextchr)) { \ + sayNO; \ + } \ + /* Matched a utf8-invariant, so don't have to worry about utf8 */ \ + nextchr = UCHARAT(++locinput); \ + break; \ + case NNAMEA: \ + if (locinput >= PL_regeol || FUNCA(nextchr)) { \ + sayNO; \ + } \ + if (utf8_target) { \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + } \ + else { \ + nextchr = UCHARAT(++locinput); \ + } \ + break; \ + /* Generate the non-locale cases */ \ + _CCC_TRY_NONLOCALE(NAME, NNAME, FUNC, CLASS, STR) + +/* This is like CCC_TRY, but has an extra set of parameters for generating case + * statements to handle separate Unicode semantics nodes */ +#define CCC_TRY_U(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEU, NNAMEU, FUNCU, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + CCC_TRY(NAME, NNAME, FUNC, \ + NAMEL, NNAMEL, LCFUNC, LCFUNC_utf8, \ + NAMEA, NNAMEA, FUNCA, \ + CLASS, STR) \ + _CCC_TRY_NONLOCALE(NAMEU, NNAMEU, FUNCU, CLASS, STR) /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ /* for use after a quantifier and before an EXACT-like node -- japhy */ -/* it would be nice to rework regcomp.sym to generate this stuff. sigh */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ #define JUMPABLE(rn) ( \ OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ OP(rn) == EVAL || \ OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \ + OP(rn) == KEEPS || \ (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -263,12 +318,13 @@ /* Currently these are only used when PL_regkind[OP(rn)] == EXACT so we don't need this definition. */ #define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF ) -#define IS_TEXTF(rn) ( OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF ) +#define IS_TEXTF(rn) ( (OP(rn)==EXACTFU || OP(rn)==EXACTF) || OP(rn)==REFF || OP(rn)==NREFF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL ) #else /* ... so we use this as its faster. */ #define IS_TEXT(rn) ( OP(rn)==EXACT ) +#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU ) #define IS_TEXTF(rn) ( OP(rn)==EXACTF ) #define IS_TEXTFL(rn) ( OP(rn)==EXACTFL ) @@ -493,7 +549,7 @@ Perl_pregexec(pTHX_ REGEXP * const prog, char* stringarg, register char *strend, a) Anchored substring; b) Fixed substring; c) Whether we are anchored (beginning-of-line or \G); - d) First node (of those at offset 0) which may distingush positions; + d) First node (of those at offset 0) which may distinguish positions; We use a)b)d) and multiline-part of c), and try to find a position in the string which does not contradict any of them. */ @@ -515,7 +571,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, register SV *check; char *strbeg; char *t; - const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ I32 ml_anch; register char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ @@ -528,13 +584,13 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, PERL_ARGS_ASSERT_RE_INTUIT_START; - RX_MATCH_UTF8_set(rx,do_utf8); + RX_MATCH_UTF8_set(rx,utf8_target); if (RX_UTF8(rx)) { PL_reg_flags |= RF_utf8; } DEBUG_EXECUTE_r( - debug_start_match(rx, do_utf8, strpos, strend, + debug_start_match(rx, utf8_target, strpos, strend, sv ? "Guessing start of match in sv for" : "Guessing start of match in string for"); ); @@ -548,7 +604,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; PL_regeol = strend; - if (do_utf8) { + if (utf8_target) { if (!prog->check_utf8 && prog->check_substr) to_utf8_substr(prog); check = prog->check_utf8; @@ -700,11 +756,11 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, unshift s. */ DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(check), RE_SV_DUMPLEN(check), 30); PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", (s ? "Found" : "Did not find"), - (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"), quoted, RE_SV_TAIL(check), @@ -735,14 +791,14 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, Probably it is right to do no SCREAM here... */ - if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) + if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) { /* Take into account the "other" substring. */ /* XXXX May be hopelessly wrong for UTF... */ if (!other_last) other_last = strpos; - if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { + if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { do_other_anchored: { char * const last = HOP3c(s, -start_shift, strbeg); @@ -752,7 +808,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, t = s - prog->check_offset_max; if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!do_utf8 + && (!utf8_target || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) && t > strpos))) NOOP; @@ -771,7 +827,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, */ /* On end-of-str: see comment below. */ - must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; + must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; if (must == &PL_sv_undef) { s = (char*)NULL; DEBUG_r(must = prog->anchored_utf8); /* for debug */ @@ -785,7 +841,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, multiline ? FBMrf_MULTILINE : 0 ); DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", (s ? "Found" : "Contradicts"), @@ -832,7 +888,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, if (s < other_last) s = other_last; /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = do_utf8 ? prog->float_utf8 : prog->float_substr; + must = utf8_target ? prog->float_utf8 : prog->float_substr; /* fbm_instr() takes into account exact value of end-of-str if the check is SvTAIL(ed). Since false positives are OK, and end-of-str is not later than strend we are OK. */ @@ -846,7 +902,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, - (SvTAIL(must)!=0), must, multiline ? FBMrf_MULTILINE : 0); DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", (s ? "Found" : "Contradicts"), @@ -893,7 +949,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, ); if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!do_utf8 + && (!utf8_target || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) && t > strpos))) { @@ -911,7 +967,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, while (t < strend - prog->minlen) { if (*t == '\n') { if (t < check_at - prog->check_offset_min) { - if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { /* Since we moved from the found position, we definitely contradict the found anchored substr. Due to the above check we do not @@ -951,7 +1007,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, } s = t; set_useful: - ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ } else { /* The found string does not prohibit matching at strpos, @@ -975,7 +1031,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, ); success_at_start: if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (do_utf8 ? ( + && (utf8_target ? ( prog->check_utf8 /* Could be deleted already */ && --BmUSEFUL(prog->check_utf8) < 0 && (prog->check_utf8 == prog->float_utf8) @@ -987,17 +1043,22 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { /* If flags & SOMETHING - do not do it many times on the same match */ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with do_utf8? */ - SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); prog->check_substr = prog->check_utf8 = NULL; /* disable */ prog->float_substr = prog->float_utf8 = NULL; /* clear */ check = NULL; /* abort */ s = strpos; + /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevant flag + see http://bugs.activestate.com/show_bug.cgi?id=87173 */ + if (prog->intflags & PREGf_IMPLICIT) + prog->extflags &= ~RXf_ANCH_MBOL; /* XXXX This is a remnant of the old implementation. It looks wasteful, since now INTUIT can use many other heuristics. */ prog->extflags &= ~RXf_USE_INTUIT; + /* XXXX What other flags might need to be cleared in this branch? */ } else s = strpos; @@ -1015,7 +1076,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, even for \b or \B. But (minlen? 1 : 0) below assumes that regstclass does not come from lookahead... */ /* If regstclass takes bytelength more than 1: If charlength==1, OK. - This leaves EXACTF only, which is dealt with in find_byclass(). */ + This leaves EXACTF, EXACTFU only, which are dealt with in find_byclass(). */ const U8* const str = (U8*)STRING(progi->regstclass); const int cl_l = (PL_regkind[OP(progi->regstclass)] == EXACT ? CHR_DIST(str+STR_LEN(progi->regstclass), str) @@ -1048,7 +1109,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, goto fail; /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { + if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { DEBUG_EXECUTE_r( what = "anchored" ); hop_and_restart: s = HOP3c(t, 1, strend); @@ -1088,9 +1149,9 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); goto try_at_offset; } - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ goto fail; - /* Check is floating subtring. */ + /* Check is floating substring. */ retry_floating_check: t = check_at - start_shift; DEBUG_EXECUTE_r( what = "floating" ); @@ -1116,7 +1177,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ - BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ fail: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", PL_colors[4], PL_colors[5])); @@ -1126,8 +1187,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ trie_type = (scan->flags != EXACT) \ - ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \ - : (do_utf8 ? trie_utf8 : trie_plain) + ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ + : (utf8_target ? trie_utf8 : trie_plain) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ @@ -1184,8 +1245,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ char *my_strend= (char *)strend; \ if ( (CoNd) \ && (ln == len || \ - !ibcmp_utf8(s, &my_strend, 0, do_utf8, \ - m, NULL, ln, cBOOL(UTF))) \ + foldEQ_utf8(s, &my_strend, 0, utf8_target, \ + m, NULL, ln, cBOOL(UTF_PATTERN))) \ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ else { \ @@ -1195,8 +1256,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ if ( f != c \ && (f == c1 || f == c2) \ && (ln == len || \ - !ibcmp_utf8(s, &my_strend, 0, do_utf8,\ - m, NULL, ln, cBOOL(UTF)))\ + foldEQ_utf8(s, &my_strend, 0, utf8_target,\ + m, NULL, ln, cBOOL(UTF_PATTERN)))\ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ } \ @@ -1205,12 +1266,18 @@ s += len #define REXEC_FBC_EXACTISH_SCAN(CoNd) \ STMT_START { \ + re_fold_t folder; \ + switch (OP(c)) { \ + case EXACTFU: folder = foldEQ_latin1; break; \ + case EXACTFL: folder = foldEQ_locale; break; \ + case EXACTF: folder = foldEQ; break; \ + default: \ + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(c)); \ + } \ while (s <= e) { \ if ( (CoNd) \ - && (ln == 1 || !(OP(c) == EXACTF \ - ? ibcmp(s, m, ln) \ - : ibcmp_locale(s, m, ln))) \ - && (!reginfo || regtry(reginfo, &s)) ) \ + && (ln == 1 || folder(s, m, ln)) \ + && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ s++; \ } \ @@ -1261,37 +1328,114 @@ if ((!reginfo || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (do_utf8) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ - if (do_utf8) { \ + if (utf8_target) { \ UtFpReLoAd; \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \ PL_reg_flags |= RF_tainted; \ - if (do_utf8) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ REXEC_FBC_CLASS_SCAN(CoNd); \ - } \ - break + } #define DUMP_EXEC_POS(li,s,doutf8) \ dump_exec_pos(li,s,(PL_regeol),(PL_bostr),(PL_reg_starttry),doutf8) + +#define UTF8_NOLOAD(TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_UTF8_SCAN( \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +#define UTF8_LOAD(TeSt1_UtF8, TeSt2_UtF8, IF_SUCCESS, IF_FAIL) \ + if (s == PL_bostr) { \ + tmp = '\n'; \ + } \ + else { \ + U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); \ + tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); \ + } \ + tmp = TeSt1_UtF8; \ + LOAD_UTF8_CHARCLASS_ALNUM(); \ + REXEC_FBC_UTF8_SCAN( \ + if (tmp == ! (TeSt2_UtF8)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + +/* The only difference between the BOUND and NBOUND cases is that + * REXEC_FBC_TRYIT is called when matched in BOUND, and when non-matched in + * NBOUND. This is accomplished by passing it in either the if or else clause, + * with the other one being empty */ +#define FBC_BOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_BOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER), TEST_NON_UTF8, REXEC_FBC_TRYIT, PLACEHOLDER) + +#define FBC_NBOUND(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_LOAD(TEST1_UTF8, TEST2_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + +#define FBC_NBOUND_NOLOAD(TEST_NON_UTF8, TEST1_UTF8, TEST2_UTF8) \ + FBC_BOUND_COMMON(UTF8_NOLOAD(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) + + +/* Common to the BOUND and NBOUND cases. Unfortunately the UTF8 tests need to + * be passed in completely with the variable name being tested, which isn't + * such a clean interface, but this is easier to read than it was before. We + * are looking for the boundary (or non-boundary between a word and non-word + * character. The utf8 and non-utf8 cases have the same logic, but the details + * must be different. Find the "wordness" of the character just prior to this + * one, and compare it with the wordness of this one. If they differ, we have + * a boundary. At the beginning of the string, pretend that the previous + * character was a new-line */ +#define FBC_BOUND_COMMON(UTF8_CODE, TEST_NON_UTF8, IF_SUCCESS, IF_FAIL) \ + if (utf8_target) { \ + UTF8_CODE \ + } \ + else { /* Not utf8 */ \ + tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; \ + tmp = TEST_NON_UTF8(tmp); \ + REXEC_FBC_SCAN( \ + if (tmp == ! TEST_NON_UTF8((U8) *s)) { \ + tmp = !tmp; \ + IF_SUCCESS; \ + } \ + else { \ + IF_FAIL; \ + } \ + ); \ + } \ + if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) \ + goto got_it; + /* We know what class REx starts with. Try to find this position... */ /* if reginfo is NULL, its a dryrun */ /* annoyingly all the vars in this routine have different names from their counterparts @@ -1311,18 +1455,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, unsigned int c2; char *e; register I32 tmp = 1; /* Scratch variable? */ - register const bool do_utf8 = PL_reg_match_utf8; + register const bool utf8_target = PL_reg_match_utf8; RXi_GET_DECL(prog,progi); PERL_ARGS_ASSERT_FIND_BYCLASS; /* We know what class it must start with. */ switch (OP(c)) { + case ANYOFV: case ANYOF: - if (do_utf8) { - REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || + if (utf8_target || OP(c) == ANYOFV) { + REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_NONBITMAP) || !UTF8_IS_INVARIANT((U8)s[0]) ? - reginclass(prog, c, (U8*)s, 0, do_utf8) : + reginclass(prog, c, (U8*)s, 0, utf8_target) : REGINCLASS(prog, c, (U8*)s)); } else { @@ -1353,11 +1498,12 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, tmp = doevery; ); break; + case EXACTFU: case EXACTF: m = STRING(c); ln = STR_LEN(c); /* length to match in octets/bytes */ lnc = (I32) ln; /* length to match in characters */ - if (UTF) { + if (UTF_PATTERN) { STRLEN ulen1, ulen2; U8 *sm = (U8 *) m; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; @@ -1392,7 +1538,18 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { c1 = *(U8*)m; - c2 = PL_fold[c1]; + if (utf8_target || OP(c) == EXACTFU) { + + /* Micro sign folds to GREEK SMALL LETTER MU; + LATIN_SMALL_LETTER_SHARP_S folds to 'ss', and this sets + c2 to the first 's' of the pair, and the code below will + look for others */ + c2 = (c1 == MICRO_SIGN) + ? GREEK_SMALL_LETTER_MU + : (c1 == LATIN_SMALL_LETTER_SHARP_S) + ? 's' + : PL_fold_latin1[c1]; + } else c2 = PL_fold[c1]; } goto do_exactf; case EXACTFL: @@ -1416,9 +1573,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * than just upper and lower: one needs to use * the so-called folding case for case-insensitive * matching (called "loose matching" in Unicode). - * ibcmp_utf8() will do just that. */ + * foldEQ_utf8() will do just that. */ - if (do_utf8 || UTF) { + if (utf8_target || UTF_PATTERN) { UV c, f; U8 tmpbuf [UTF8_MAXBYTES+1]; STRLEN len = 1; @@ -1428,7 +1585,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Upper and lower of 1st char are equal - * probably not a "letter". */ while (s <= e) { - if (do_utf8) { + if (utf8_target) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); } else { @@ -1439,7 +1596,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { while (s <= e) { - if (do_utf8) { + if (utf8_target) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); } else { @@ -1471,169 +1628,215 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, break; case BOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case BOUND: - if (do_utf8) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = ((OP(c) == BOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(OP(c) == BOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) : - isALNUM_LC_utf8((U8*)s))) - { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - else { - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - REXEC_FBC_SCAN( - if (tmp == - !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { - tmp = !tmp; - REXEC_FBC_TRYIT; - } - ); - } - if ((!prog->minlen && tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_BOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); break; case NBOUNDL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ + FBC_NBOUND(isALNUM_LC, + isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)), + isALNUM_LC_utf8((U8*)s)); + break; + case BOUND: + FBC_BOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case BOUNDA: + FBC_BOUND_NOLOAD(isWORDCHAR_A, + isWORDCHAR_A(tmp), + isWORDCHAR_A((U8*)s)); + break; case NBOUND: - if (do_utf8) { - if (s == PL_bostr) - tmp = '\n'; - else { - U8 * const r = reghop3((U8*)s, -1, (U8*)PL_bostr); - tmp = utf8n_to_uvchr(r, UTF8SKIP(r), 0, UTF8_ALLOW_DEFAULT); - } - tmp = ((OP(c) == NBOUND ? - isALNUM_uni(tmp) : isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp))) != 0); - LOAD_UTF8_CHARCLASS_ALNUM(); - REXEC_FBC_UTF8_SCAN( - if (tmp == !(OP(c) == NBOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) : - isALNUM_LC_utf8((U8*)s))) - tmp = !tmp; - else REXEC_FBC_TRYIT; - ); - } - else { - tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? - isALNUM(tmp) : isALNUM_LC(tmp)) != 0); - REXEC_FBC_SCAN( - if (tmp == - !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) - tmp = !tmp; - else REXEC_FBC_TRYIT; - ); - } - if ((!prog->minlen && !tmp) && (!reginfo || regtry(reginfo, &s))) - goto got_it; + FBC_NBOUND(isWORDCHAR, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case NBOUNDA: + FBC_NBOUND_NOLOAD(isWORDCHAR_A, + isWORDCHAR_A(tmp), + isWORDCHAR_A((U8*)s)); + break; + case BOUNDU: + FBC_BOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); + break; + case NBOUNDU: + FBC_NBOUND(isWORDCHAR_L1, + isALNUM_uni(tmp), + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target))); break; - case ALNUM: - REXEC_FBC_CSCAN_PRELOAD( - LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), - isALNUM(*s) - ); case ALNUML: REXEC_FBC_CSCAN_TAINT( isALNUM_LC_utf8((U8*)s), isALNUM_LC(*s) ); + break; + case ALNUMU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + isWORDCHAR_L1((U8) *s) + ); + break; + case ALNUM: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + isWORDCHAR((U8) *s) + ); + break; + case ALNUMA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isWORDCHAR_A(*s)); + break; + case NALNUMU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_WORD(), + swash_fetch(RE_utf8_perl_word,(U8*)s, utf8_target), + ! isWORDCHAR_L1((U8) *s) + ); + break; case NALNUM: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), - !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), - !isALNUM(*s) + !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), + ! isALNUM(*s) ); + break; + case NALNUMA: + REXEC_FBC_CSCAN( + !isWORDCHAR_A(*s), + !isWORDCHAR_A(*s) + ); + break; case NALNUML: REXEC_FBC_CSCAN_TAINT( !isALNUM_LC_utf8((U8*)s), !isALNUM_LC(*s) ); + break; + case SPACEU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + isSPACE_L1((U8) *s) + ); + break; case SPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), - *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8), - isSPACE(*s) + *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + isSPACE((U8) *s) ); + break; + case SPACEA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isSPACE_A(*s)); + break; case SPACEL: REXEC_FBC_CSCAN_TAINT( - *s == ' ' || isSPACE_LC_utf8((U8*)s), + isSPACE_LC_utf8((U8*)s), isSPACE_LC(*s) ); + break; + case NSPACEU: + REXEC_FBC_CSCAN_PRELOAD( + LOAD_UTF8_CHARCLASS_PERL_SPACE(), + !( *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + ! isSPACE_L1((U8) *s) + ); + break; case NSPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), - !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)), - !isSPACE(*s) + !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + ! isSPACE((U8) *s) ); + break; + case NSPACEA: + REXEC_FBC_CSCAN( + !isSPACE_A(*s), + !isSPACE_A(*s) + ); + break; case NSPACEL: REXEC_FBC_CSCAN_TAINT( - !(*s == ' ' || isSPACE_LC_utf8((U8*)s)), + !isSPACE_LC_utf8((U8*)s), !isSPACE_LC(*s) ); + break; case DIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), + swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), isDIGIT(*s) ); + break; + case DIGITA: + /* Don't need to worry about utf8, as it can match only a single + * byte invariant character */ + REXEC_FBC_CLASS_SCAN( isDIGIT_A(*s)); + break; case DIGITL: REXEC_FBC_CSCAN_TAINT( isDIGIT_LC_utf8((U8*)s), isDIGIT_LC(*s) ); + break; case NDIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), + !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), !isDIGIT(*s) ); + break; + case NDIGITA: + REXEC_FBC_CSCAN( + !isDIGIT_A(*s), + !isDIGIT_A(*s) + ); + break; case NDIGITL: REXEC_FBC_CSCAN_TAINT( !isDIGIT_LC_utf8((U8*)s), !isDIGIT_LC(*s) ); + break; case LNBREAK: REXEC_FBC_CSCAN( is_LNBREAK_utf8(s), is_LNBREAK_latin1(s) ); + break; case VERTWS: REXEC_FBC_CSCAN( is_VERTWS_utf8(s), is_VERTWS_latin1(s) ); + break; case NVERTWS: REXEC_FBC_CSCAN( !is_VERTWS_utf8(s), !is_VERTWS_latin1(s) ); + break; case HORIZWS: REXEC_FBC_CSCAN( is_HORIZWS_utf8(s), is_HORIZWS_latin1(s) ); + break; case NHORIZWS: REXEC_FBC_CSCAN( !is_HORIZWS_utf8(s), !is_HORIZWS_latin1(s) ); + break; case AHOCORASICKC: case AHOCORASICK: { @@ -1723,14 +1926,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, DEBUG_TRIE_EXECUTE_r( if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { dump_exec_pos( (char *)uc, c, strend, real_start, - (char *)uc, do_utf8 ); + (char *)uc, utf8_target ); PerlIO_printf( Perl_debug_log, " Scanning for legal start char...\n"); } - ); - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - uc++; - } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } s= (char *)uc; } if (uc >(U8*)last_start) break; @@ -1751,7 +1960,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, foldbuf, uniflags); DEBUG_TRIE_EXECUTE_r({ dump_exec_pos( (char *)uc, c, strend, real_start, - s, do_utf8 ); + s, utf8_target ); PerlIO_printf(Perl_debug_log, " Charid:%3u CP:%4"UVxf" ", charid, uvc); @@ -1766,7 +1975,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, DEBUG_TRIE_EXECUTE_r({ if (failed) dump_exec_pos( (char *)uc, c, strend, real_start, - s, do_utf8 ); + s, utf8_target ); PerlIO_printf( Perl_debug_log, "%sState: %4"UVxf", word=%"UVxf, failed ? " Fail transition to " : "", @@ -1774,14 +1983,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, }); if ( base ) { U32 tmp; + I32 offset; if (charid && - (base + charid > trie->uniquecharcount ) - && (base + charid - 1 - trie->uniquecharcount - < trie->lasttrans) - && trie->trans[base + charid - 1 - - trie->uniquecharcount].check == state - && (tmp=trie->trans[base + charid - 1 - - trie->uniquecharcount ].next)) + ( ((offset = base + charid + - 1 - trie->uniquecharcount)) >= 0) + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state + && (tmp=trie->trans[offset].next)) { DEBUG_TRIE_EXECUTE_r( PerlIO_printf( Perl_debug_log," - legal\n")); @@ -1878,7 +2086,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre I32 end_shift = 0; /* Same for the end. */ /* CC */ I32 scream_pos = -1; /* Internal iterator of scream. */ char *scream_olds = NULL; - const bool do_utf8 = cBOOL(DO_UTF8(sv)); + const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); regmatch_info reginfo; /* create some info to pass to regtry etc */ @@ -1897,9 +2105,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre multiline = prog->extflags & RXf_PMf_MULTILINE; reginfo.prog = rx; /* Yes, sorry that this is confusing. */ - RX_MATCH_UTF8_set(rx, do_utf8); + RX_MATCH_UTF8_set(rx, utf8_target); DEBUG_EXECUTE_r( - debug_start_match(rx, do_utf8, startpos, strend, + debug_start_match(rx, utf8_target, startpos, strend, "Matching"); ); @@ -2014,37 +2222,72 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre end = HOP3c(strend, -dontbother, strbeg) - 1; /* for multiline we only have to try after newlines */ if (prog->check_substr || prog->check_utf8) { - if (s == startpos) - goto after_try; - while (1) { - if (regtry(®info, &s)) - goto got_it; - after_try: - if (s > end) - goto phooey; - if (prog->extflags & RXf_USE_INTUIT) { - s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); - if (!s) - goto phooey; - } - else - s++; - } - } else { - if (s > startpos) + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(®info, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(®info, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ while (s < end) { if (*s++ == '\n') { /* don't need PL_utf8skip here */ if (regtry(®info, &s)) goto got_it; } - } - } - } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { - /* the warning about reginfo.ganch being used without intialization + /* the warning about reginfo.ganch being used without initialization is bogus -- we set it above, when prog->extflags & RXf_GPOS_SEEN and we only enter this block when the same bit is set. */ char *tmp_s = reginfo.ganch - prog->gofs; @@ -2057,16 +2300,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre /* Messy cases: unanchored match. */ if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { /* we have /x+whatever/ */ - /* it must be a one character string (XXXX Except UTF?) */ + /* it must be a one character string (XXXX Except UTF_PATTERN?) */ char ch; #ifdef DEBUGGING int did_match = 0; #endif - if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; + if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0]; - if (do_utf8) { + if (utf8_target) { REXEC_FBC_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2106,14 +2349,14 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre int did_match = 0; #endif if (prog->anchored_substr || prog->anchored_utf8) { - if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; + if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; back_max = back_min = prog->anchored_offset; } else { - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - must = do_utf8 ? prog->float_utf8 : prog->float_substr; + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + must = utf8_target ? prog->float_utf8 : prog->float_substr; back_max = prog->float_max_offset; back_min = prog->float_min_offset; } @@ -2161,7 +2404,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre last1 = HOPc(s, -back_min); s = t; } - if (do_utf8) { + if (utf8_target) { while (s <= last1) { if (regtry(®info, &s)) goto got_it; @@ -2177,7 +2420,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre } } DEBUG_EXECUTE_r(if (!did_match) { - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", ((must == prog->anchored_substr || must == prog->anchored_utf8) @@ -2197,7 +2440,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre SV * const prop = sv_newmortal(); regprop(prog, prop, c); { - RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1), + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); PerlIO_printf(Perl_debug_log, "Matching stclass %.*s against %s (%d bytes)\n", @@ -2216,9 +2459,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre char *last; SV* float_real; - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + float_real = utf8_target ? prog->float_utf8 : prog->float_substr; if (flags & REXEC_SCREAM) { last = screaminstr(sv, float_real, s - strbeg, @@ -2262,7 +2505,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = minlen - 1; strend -= dontbother; /* this one's always in bytes! */ /* We don't know much -- general case. */ - if (do_utf8) { + if (utf8_target) { for (;;) { if (regtry(®info, &s)) goto got_it; @@ -2671,7 +2914,7 @@ The only exceptions to this are lookahead/behind assertions and the cut, (?>A), which pop all the backtrack states associated with A before continuing. -Bascktrack state structs are allocated in slabs of about 4K in size. +Backtrack state structs are allocated in slabs of about 4K in size. PL_regmatch_state and st always point to the currently active state, and PL_regmatch_slab points to the slab currently containing PL_regmatch_state. The first time regmatch() is called, the first slab is @@ -2684,7 +2927,7 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, do_utf8); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ @@ -2702,7 +2945,7 @@ regmatch(), slabs allocated since entry are freed. #ifdef DEBUGGING STATIC void -S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, const char *start, const char *end, const char *blurb) { const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; @@ -2715,18 +2958,18 @@ S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); - RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1), + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), start, end - start, 60); PerlIO_printf(Perl_debug_log, "%s%s REx%s %s against %s\n", PL_colors[4], blurb, PL_colors[5], s0, s1); - if (do_utf8||utf8_pat) + if (utf8_target||utf8_pat) PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", utf8_pat ? "pattern" : "", - utf8_pat && do_utf8 ? " and " : "", - do_utf8 ? "string" : "" + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" ); } } @@ -2737,7 +2980,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, const char *loc_regeol, const char *loc_bostr, const char *loc_reg_starttry, - const bool do_utf8) + const bool utf8_target) { const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ @@ -2754,20 +2997,20 @@ S_dump_exec_pos(pTHX_ const char *locinput, PERL_ARGS_ASSERT_DUMP_EXEC_POS; - while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) pref_len++; pref0_len = pref_len - (locinput - loc_reg_starttry); if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) l = ( loc_regeol - locinput > (5 + taill) - pref_len ? (5 + taill) - pref_len : loc_regeol - locinput); - while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) l--; if (pref0_len < 0) pref0_len = 0; if (pref0_len > pref_len) pref0_len = pref_len; { - const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), (locinput - pref_len),pref0_len, 60, 4, 5); @@ -2854,7 +3097,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) dMY_CXT; #endif dVAR; - register const bool do_utf8 = PL_reg_match_utf8; + register const bool utf8_target = PL_reg_match_utf8; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; regexp *rex = (struct regexp *)SvANY(rex_sv); @@ -2890,7 +3133,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) SV *popmark = NULL; /* are we looking for a mark? */ SV *sv_commit = NULL; /* last mark name seen in failure */ SV *sv_yes_mark = NULL; /* last mark name we have seen - during a successfull match */ + during a successful match */ U32 lastopen = 0; /* last open we saw */ bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; SV* const oreplsv = GvSV(PL_replgv); @@ -2943,7 +3186,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) DEBUG_EXECUTE_r( { SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); - DUMP_EXEC_POS( locinput, scan, do_utf8 ); + DUMP_EXEC_POS( locinput, scan, utf8_target ); regprop(rex, prop, scan); PerlIO_printf(Perl_debug_log, @@ -3021,7 +3264,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case SANY: if (!nextchr && locinput >= PL_regeol) sayNO; - if (do_utf8) { + if (utf8_target) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -3038,7 +3281,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case REG_ANY: if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') sayNO; - if (do_utf8) { + if (utf8_target) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -3054,7 +3297,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !do_utf8) { + if (scan->flags == EXACT || !utf8_target) { if(!ANYOF_BITMAP_TEST(scan, *locinput)) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3130,7 +3373,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) "%*s %smatched empty string...%s\n", REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) ); - break; + if (!trie->jump) + break; } else { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3165,7 +3409,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) while ( state && uc <= (U8*)PL_regeol ) { U32 base = trie->states[ state ].trans.base; UV uvc = 0; - U16 charid; + U16 charid = 0; U16 wordnum; wordnum = trie->states[ state ].wordnum; @@ -3188,7 +3432,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } DEBUG_TRIE_EXECUTE_r({ - DUMP_EXEC_POS( (char *)uc, scan, do_utf8 ); + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); PerlIO_printf( Perl_debug_log, "%*s %sState: %4"UVxf" Accepted: %c ", 2+depth * 2, "", PL_colors[4], @@ -3197,6 +3441,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* read a char and goto next state */ if ( base ) { + I32 offset; REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags); @@ -3204,14 +3449,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (foldlen>0) ST.longfold = TRUE; if (charid && - (base + charid > trie->uniquecharcount ) - && (base + charid - 1 - trie->uniquecharcount - < trie->lasttrans) - && trie->trans[base + charid - 1 - - trie->uniquecharcount].check == state) + ( ((offset = + base + charid - 1 - trie->uniquecharcount)) >= 0) + + && ((U32)offset < trie->lasttrans) + && trie->trans[offset].check == state) { - state = trie->trans[base + charid - 1 - - trie->uniquecharcount ].next; + state = trie->trans[offset].next; } else { state = 0; @@ -3317,7 +3561,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U8 *uscan; while (chars) { - if (do_utf8) { + if (utf8_target) { uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; @@ -3339,7 +3583,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { - if (do_utf8) + if (utf8_target) while (chars--) uc += UTF8SKIP(uc); else @@ -3380,7 +3624,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ST.nextword, tmp ? pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 0, PL_colors[0], PL_colors[1], - (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) + (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)|PERL_PV_ESCAPE_NONASCII ) : "not compiled under -Dr", PL_colors[5] ); @@ -3395,12 +3639,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EXACT: { char *s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8 != UTF) { + if (utf8_target != UTF_PATTERN) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; const char * const e = s + ln; - if (do_utf8) { + if (utf8_target) { /* The target is utf8, the pattern is not utf8. */ while (s < e) { STRLEN ulen; @@ -3444,26 +3688,42 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) nextchr = UCHARAT(locinput); break; } - case EXACTFL: + case EXACTFL: { + re_fold_t folder; + const U8 * fold_array; + const char * s; + PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case EXACTF: { - char * const s = STRING(scan); + folder = foldEQ_locale; + fold_array = PL_fold_locale; + goto do_exactf; + + case EXACTFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_exactf; + + case EXACTF: + folder = foldEQ; + fold_array = PL_fold; + + do_exactf: + s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8 || UTF) { + if (utf8_target || UTF_PATTERN) { /* Either target or the pattern are utf8. */ const char * const l = locinput; char *e = PL_regeol; - if (ibcmp_utf8(s, 0, ln, cBOOL(UTF), - l, &e, 0, do_utf8)) { + if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF_PATTERN), + l, &e, 0, utf8_target)) { /* One more case for the sharp s: * pack("U0U*", 0xDF) =~ /ss/i, * the 0xC3 0x9F are the UTF-8 * byte sequence for the U+00DF. */ - if (!(do_utf8 && + if (!(utf8_target && toLOWER(s[0]) == 's' && ln >= 2 && toLOWER(s[1]) == 's' && @@ -3481,27 +3741,34 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && - UCHARAT(s) != ((OP(scan) == EXACTF) - ? PL_fold : PL_fold_locale)[nextchr]) + UCHARAT(s) != fold_array[nextchr]) + { sayNO; + } if (PL_regeol - locinput < ln) sayNO; - if (ln > 1 && (OP(scan) == EXACTF - ? ibcmp(s, locinput, ln) - : ibcmp_locale(s, locinput, ln))) + if (ln > 1 && ! folder(s, locinput, ln)) sayNO; locinput += ln; nextchr = UCHARAT(locinput); break; - } + } + + /* XXX Could improve efficiency by separating these all out using a + * macro or in-line function. At that point regcomp.c would no longer + * have to set the FLAGS fields of these */ case BOUNDL: case NBOUNDL: PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: + case BOUNDU: + case BOUNDA: case NBOUND: + case NBOUNDU: + case NBOUNDA: /* was last char in word? */ - if (do_utf8) { + if (utf8_target && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET) { if (locinput == PL_bostr) ln = '\n'; else { @@ -3509,10 +3776,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags); } - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) != REGEX_LOCALE_CHARSET) { ln = isALNUM_uni(ln); LOAD_UTF8_CHARCLASS_ALNUM(); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target); } else { ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); @@ -3520,63 +3787,88 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { + + /* Here the string isn't utf8, or is utf8 and only ascii + * characters are to match \w. In the latter case looking at + * the byte just prior to the current one may be just the final + * byte of a multi-byte character. This is ok. There are two + * cases: + * 1) it is a single byte character, and then the test is doing + * just what it's supposed to. + * 2) it is a multi-byte character, in which case the final + * byte is never mistakable for ASCII, and so the test + * will say it is not a word character, which is the + * correct answer. */ ln = (locinput != PL_bostr) ? UCHARAT(locinput - 1) : '\n'; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { - ln = isALNUM(ln); - n = isALNUM(nextchr); - } - else { - ln = isALNUM_LC(ln); - n = isALNUM_LC(nextchr); + switch (FLAGS(scan)) { + case REGEX_UNICODE_CHARSET: + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + break; + case REGEX_LOCALE_CHARSET: + ln = isALNUM_LC(ln); + n = isALNUM_LC(nextchr); + break; + case REGEX_DEPENDS_CHARSET: + ln = isALNUM(ln); + n = isALNUM(nextchr); + break; + case REGEX_ASCII_RESTRICTED_CHARSET: + ln = isWORDCHAR_A(ln); + n = isWORDCHAR_A(nextchr); + break; + default: + Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan)); + break; } } - if (((!ln) == (!n)) == (OP(scan) == BOUND || - OP(scan) == BOUNDL)) + /* Note requires that all BOUNDs be lower than all NBOUNDs in + * regcomp.sym */ + if (((!ln) == (!n)) == (OP(scan) < NBOUND)) sayNO; break; + case ANYOFV: case ANYOF: - if (do_utf8) { + if (utf8_target || state_num == ANYOFV) { STRLEN inclasslen = PL_regeol - locinput; - - if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8)) - goto anyof_fail; if (locinput >= PL_regeol) sayNO; - locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); + + if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target)) + sayNO; + locinput += inclasslen; nextchr = UCHARAT(locinput); break; } else { if (nextchr < 0) nextchr = UCHARAT(locinput); - if (!REGINCLASS(rex, scan, (U8*)locinput)) - goto anyof_fail; if (!nextchr && locinput >= PL_regeol) sayNO; + if (!REGINCLASS(rex, scan, (U8*)locinput)) + sayNO; nextchr = UCHARAT(++locinput); break; } - anyof_fail: - /* If we might have the case of the German sharp s - * in a casefolding Unicode character class. */ - - if (ANYOF_FOLD_SHARP_S(scan, locinput, PL_regeol)) { - locinput += SHARP_S_SKIP; - nextchr = UCHARAT(locinput); - } - else - sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - - CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - - CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); - CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); + CCC_TRY_U(ALNUM, NALNUM, isWORDCHAR, + ALNUML, NALNUML, isALNUM_LC, isALNUM_LC_utf8, + ALNUMU, NALNUMU, isWORDCHAR_L1, + ALNUMA, NALNUMA, isWORDCHAR_A, + perl_word, "a"); + + CCC_TRY_U(SPACE, NSPACE, isSPACE, + SPACEL, NSPACEL, isSPACE_LC, isSPACE_LC_utf8, + SPACEU, NSPACEU, isSPACE_L1, + SPACEA, NSPACEA, isSPACE_A, + perl_space, " "); + + CCC_TRY(DIGIT, NDIGIT, isDIGIT, + DIGITL, NDIGITL, isDIGIT_LC, isDIGIT_LC_utf8, + DIGITA, NDIGITA, isDIGIT_A, + posix_digit, "0"); case CLUMP: /* Match \X: logical Unicode character. This is defined as a Unicode extended Grapheme Cluster */ @@ -3637,7 +3929,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (locinput >= PL_regeol) sayNO; - if (! do_utf8) { + if (! utf8_target) { /* Match either CR LF or '.', as all the other possibilities * require utf8 */ @@ -3665,7 +3957,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match (prepend)* */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_prepend, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { previous_prepend = locinput; locinput += UTF8SKIP(locinput); @@ -3677,7 +3969,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (previous_prepend && (locinput >= PL_regeol || ! swash_fetch(PL_utf8_X_begin, - (U8*)locinput, do_utf8))) + (U8*)locinput, utf8_target))) { locinput = previous_prepend; } @@ -3687,7 +3979,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * moved locinput forward, we tested the result just above * and it either passed, or we backed off so that it will * now pass */ - if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) { + if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) { /* Here did not match the required 'Begin' in the * second term. So just match the very first @@ -3699,7 +3991,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * an extender. It is either a hangul syllable, or a * non-control */ if (swash_fetch(PL_utf8_X_non_hangul, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { /* Here not a Hangul syllable, must be a @@ -3711,11 +4003,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * of several individual characters. One * possibility is T+ */ if (swash_fetch(PL_utf8_X_T, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { while (locinput < PL_regeol && swash_fetch(PL_utf8_X_T, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3729,7 +4021,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match L* */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_L, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3742,13 +4034,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (locinput < PL_regeol && swash_fetch(PL_utf8_X_LV_LVT_V, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { /* Otherwise keep going. Must be LV, LVT * or V. See if LVT */ if (swash_fetch(PL_utf8_X_LVT, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } else { @@ -3758,7 +4050,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) locinput += UTF8SKIP(locinput); while (locinput < PL_regeol && swash_fetch(PL_utf8_X_V, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3769,7 +4061,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) while (locinput < PL_regeol && swash_fetch(PL_utf8_X_T, (U8*)locinput, - do_utf8)) + utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3780,7 +4072,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match any extender */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_extend, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3792,31 +4084,74 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; case NREFFL: - { + { /* The capture buffer cases. The ones beginning with N for the + named buffers just convert to the equivalent numbered and + pretend they were called as the corresponding numbered buffer + op. */ + /* don't initialize these, it makes C++ unhappy */ char *s; char type; + re_fold_t folder; + const U8 *fold_array; + PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ - case NREF: + folder = foldEQ_locale; + fold_array = PL_fold_locale; + type = REFFL; + goto do_nref; + + case NREFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + type = REFFU; + goto do_nref; + case NREFF: - type = OP(scan); + folder = foldEQ; + fold_array = PL_fold; + type = REFF; + goto do_nref; + + case NREF: + type = REF; + folder = NULL; + fold_array = NULL; + do_nref: + + /* For the named back references, find the corresponding buffer + * number */ n = reg_check_named_buff_matched(rex,scan); - if ( n ) { - type = REF + ( type - NREF ); - goto do_ref; - } else { + if ( ! n ) { sayNO; - } - /* unreached */ + } + goto do_nref_ref_common; + case REFFL: PL_reg_flags |= RF_tainted; - /* FALL THROUGH */ + folder = foldEQ_locale; + fold_array = PL_fold_locale; + goto do_ref; + + case REFFU: + folder = foldEQ_latin1; + fold_array = PL_fold_latin1; + goto do_ref; + + case REFF: + folder = foldEQ; + fold_array = PL_fold; + goto do_ref; + case REF: - case REFF: - n = ARG(scan); /* which paren pair */ + folder = NULL; + fold_array = NULL; + + do_ref: type = OP(scan); - do_ref: + n = ARG(scan); /* which paren pair */ + + do_nref_ref_common: ln = PL_regoffs[n].start; PL_reg_leftiter = PL_reg_maxiter; /* Void cache */ if (*PL_reglastparen < n || ln == -1) @@ -3825,49 +4160,40 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; s = PL_bostr + ln; - if (do_utf8 && type != REF) { /* REF can do byte comparison */ - char *l = locinput; - const char *e = PL_bostr + PL_regoffs[n].end; - /* - * Note that we can't do the "other character" lookup trick as - * in the 8-bit case (no pun intended) because in Unicode we - * have to map both upper and title case to lower case. - */ - if (type == REFF) { - while (s < e) { - STRLEN ulen1, ulen2; - U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; - U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; - - if (l >= PL_regeol) - sayNO; - toLOWER_utf8((U8*)s, tmpbuf1, &ulen1); - toLOWER_utf8((U8*)l, tmpbuf2, &ulen2); - if (ulen1 != ulen2 || memNE((char *)tmpbuf1, (char *)tmpbuf2, ulen1)) - sayNO; - s += ulen1; - l += ulen2; - } + if (type != REF /* REF can do byte comparison */ + && (utf8_target + || (type == REFFU + && (*s == (char) LATIN_SMALL_LETTER_SHARP_S + || *locinput == (char) LATIN_SMALL_LETTER_SHARP_S)))) + { /* XXX handle REFFL better */ + char * limit = PL_regeol; + + /* This call case insensitively compares the entire buffer + * at s, with the current input starting at locinput, but + * not going off the end given by PL_regeol, and returns in + * limit upon success, how much of the current input was + * matched */ + if (! foldEQ_utf8(s, NULL, PL_regoffs[n].end - ln, utf8_target, + locinput, &limit, 0, utf8_target)) + { + sayNO; } - locinput = l; + locinput = limit; nextchr = UCHARAT(locinput); break; } - /* Inline the first character, for speed. */ + /* Not utf8: Inline the first character, for speed. */ if (UCHARAT(s) != nextchr && (type == REF || - (UCHARAT(s) != (type == REFF - ? PL_fold : PL_fold_locale)[nextchr]))) + UCHARAT(s) != fold_array[nextchr])) sayNO; ln = PL_regoffs[n].end - ln; if (locinput + ln > PL_regeol) sayNO; if (ln > 1 && (type == REF ? memNE(s, locinput, ln) - : (type == REFF - ? ibcmp(s, locinput, ln) - : ibcmp_locale(s, locinput, ln)))) + : ! folder(s, locinput, ln))) sayNO; locinput += ln; nextchr = UCHARAT(locinput); @@ -3928,7 +4254,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) COP * const ocurcop = PL_curcop; PAD *old_comppad; char *saved_regeol = PL_regeol; - + struct re_save_state saved_state; + + /* To not corrupt the existing regex state while executing the + * eval we would normally put it on the save stack, like with + * save_re_context. However, re-evals have a weird scoping so we + * can't just add ENTER/LEAVE here. With that, things like + * + * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) + * + * would break, as they expect the localisation to be unwound + * only when the re-engine backtracks through the bit that + * localised it. + * + * What we do instead is just saving the state in a local c + * variable. + */ + Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + n = ARG(scan); PL_op = (OP_4tree*)rexi->data->data[n]; DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, @@ -3950,6 +4293,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PUTBACK; } + Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + PL_op = oop; PAD_RESTORE_LOCAL(old_comppad); PL_curcop = ocurcop; @@ -4038,7 +4383,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re->sublen = rex->sublen; rei = RXi_GET(re); DEBUG_EXECUTE_r( - debug_start_match(re_sv, do_utf8, locinput, PL_regeol, + debug_start_match(re_sv, utf8_target, locinput, PL_regeol, "Matching embedded"); ); startpoint = rei->program + 1; @@ -4326,9 +4671,7 @@ NULL /* these fields contain the state of the current curly. * they are accessed by subsequent WHILEMs */ ST.parenfloor = parenfloor; - ST.min = ARG1(scan); - ST.max = ARG2(scan); - ST.A = NEXTOPER(scan) + EXTRA_STEP_2ARGS; + ST.me = scan; ST.B = next; ST.minmod = minmod; minmod = 0; @@ -4359,6 +4702,10 @@ NULL { /* see the discussion above about CURLYX/WHILEM */ I32 n; + int min = ARG1(cur_curlyx->u.curlyx.me); + int max = ARG2(cur_curlyx->u.curlyx.me); + regnode *A = NEXTOPER(cur_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS; + assert(cur_curlyx); /* keep Coverity happy */ n = ++cur_curlyx->u.curlyx.count; /* how many A's matched */ ST.save_lastloc = cur_curlyx->u.curlyx.lastloc; @@ -4368,17 +4715,15 @@ NULL PL_reginput = locinput; DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, - "%*s whilem: matched %ld out of %ld..%ld\n", - REPORT_CODE_OFF+depth*2, "", (long)n, - (long)cur_curlyx->u.curlyx.min, - (long)cur_curlyx->u.curlyx.max) + "%*s whilem: matched %ld out of %d..%d\n", + REPORT_CODE_OFF+depth*2, "", (long)n, min, max) ); /* First just match a string of min A's. */ - if (n < cur_curlyx->u.curlyx.min) { + if (n < min) { cur_curlyx->u.curlyx.lastloc = locinput; - PUSH_STATE_GOTO(WHILEM_A_pre, cur_curlyx->u.curlyx.A); + PUSH_STATE_GOTO(WHILEM_A_pre, A); /* NOTREACHED */ } @@ -4458,11 +4803,11 @@ NULL /* Prefer A over B for maximal matching. */ - if (n < cur_curlyx->u.curlyx.max) { /* More greed allowed? */ + if (n < max) { /* More greed allowed? */ ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); cur_curlyx->u.curlyx.lastloc = locinput; REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_max, cur_curlyx->u.curlyx.A); + PUSH_STATE_GOTO(WHILEM_A_max, A); /* NOTREACHED */ } goto do_whilem_B_max; @@ -4522,7 +4867,7 @@ NULL REGCP_UNWIND(ST.lastcp); regcppop(rex); - if (cur_curlyx->u.curlyx.count >= cur_curlyx->u.curlyx.max) { + if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { /* Maximum greed exceeded */ if (cur_curlyx->u.curlyx.count >= REG_INFTY && ckWARN(WARN_REGEXP) @@ -4546,7 +4891,8 @@ NULL cur_curlyx->u.curlyx.lastloc = locinput; ST.cp = regcppush(cur_curlyx->u.curlyx.parenfloor); REGCP_SET(ST.lastcp); - PUSH_STATE_GOTO(WHILEM_A_min, ST.save_curlyx->u.curlyx.A); + PUSH_STATE_GOTO(WHILEM_A_min, + /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS); /* NOTREACHED */ #undef ST @@ -4729,12 +5075,12 @@ NULL { ST.c1 = (U8)*STRING(text_node); - ST.c2 = - (IS_TEXTF(text_node)) - ? PL_fold[ST.c1] - : (IS_TEXTFL(text_node)) - ? PL_fold_locale[ST.c1] - : ST.c1; + switch (OP(text_node)) { + case EXACTF: ST.c2 = PL_fold[ST.c1]; break; + case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; + case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; + default: ST.c2 = ST.c1; + } } } } @@ -4880,15 +5226,17 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ - if (!UTF) { - ST.c2 = ST.c1 = *s; - if (IS_TEXTF(text_node)) - ST.c2 = PL_fold[ST.c1]; - else if (IS_TEXTFL(text_node)) - ST.c2 = PL_fold_locale[ST.c1]; + if (!UTF_PATTERN) { + ST.c1 = *s; + switch (OP(text_node)) { + case EXACTF: ST.c2 = PL_fold[ST.c1]; break; + case EXACTFU: ST.c2 = PL_fold_latin1[ST.c1]; break; + case EXACTFL: ST.c2 = PL_fold_locale[ST.c1]; break; + default: ST.c2 = ST.c1; break; + } } - else { /* UTF */ - if (IS_TEXTF(text_node)) { + else { /* UTF_PATTERN */ + if (IS_TEXTFU(text_node) || IS_TEXTF(text_node)) { STRLEN ulen1, ulen2; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; U8 tmpbuf2[UTF8_MAXBYTES_CASE+1]; @@ -4939,11 +5287,11 @@ NULL * string that could possibly match */ if (ST.max == REG_INFTY) { ST.maxpos = PL_regeol - 1; - if (do_utf8) + if (utf8_target) while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) ST.maxpos--; } - else if (do_utf8) { + else if (utf8_target) { int m = ST.max - ST.min; for (ST.maxpos = locinput; m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--) @@ -4989,7 +5337,7 @@ NULL REGCP_UNWIND(ST.cp); /* Couldn't or didn't -- move forward. */ ST.oldloc = locinput; - if (do_utf8) + if (utf8_target) locinput += UTF8SKIP(locinput); else locinput++; @@ -4998,7 +5346,7 @@ NULL /* find the next place where 'B' could work, then call B */ { int n; - if (do_utf8) { + if (utf8_target) { n = (ST.oldloc == locinput) ? 0 : 1; if (ST.c1 == ST.c2) { STRLEN len; @@ -5094,7 +5442,7 @@ NULL { UV c = 0; if (ST.c1 != CHRTEST_VOID) - c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput, + c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput, UTF8_MAXBYTES, 0, uniflags) : (UV) UCHARAT(PL_reginput); /* If it could work, try it. */ @@ -5352,9 +5700,9 @@ NULL #undef ST case FOLDCHAR: n = ARG(scan); - if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) { + if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { locinput += ln; - } else if ( 0xDF == n && !do_utf8 && !UTF ) { + } else if ( LATIN_SMALL_LETTER_SHARP_S == n && !utf8_target && !UTF_PATTERN ) { sayNO; } else { U8 folded[UTF8_MAXBYTES_CASE+1]; @@ -5363,8 +5711,8 @@ NULL char *e = PL_regeol; to_uni_fold(n, folded, &foldlen); - if (ibcmp_utf8((const char*) folded, 0, foldlen, 1, - l, &e, 0, do_utf8)) { + if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1, + l, &e, 0, utf8_target)) { sayNO; } locinput = e; @@ -5372,7 +5720,7 @@ NULL nextchr = UCHARAT(locinput); break; case LNBREAK: - if ((n=is_LNBREAK(locinput,do_utf8))) { + if ((n=is_LNBREAK(locinput,utf8_target))) { locinput += n; nextchr = UCHARAT(locinput); } else @@ -5381,14 +5729,14 @@ NULL #define CASE_CLASS(nAmE) \ case nAmE: \ - if ((n=is_##nAmE(locinput,do_utf8))) { \ + if ((n=is_##nAmE(locinput,utf8_target))) { \ locinput += n; \ nextchr = UCHARAT(locinput); \ } else \ sayNO; \ break; \ case N##nAmE: \ - if ((n=is_##nAmE(locinput,do_utf8))) { \ + if ((n=is_##nAmE(locinput,utf8_target))) { \ sayNO; \ } else { \ locinput += UTF8SKIP(locinput); \ @@ -5601,7 +5949,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) register I32 c; register char *loceol = PL_regeol; register I32 hardcount = 0; - register bool do_utf8 = PL_reg_match_utf8; + register bool utf8_target = PL_reg_match_utf8; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -5615,7 +5963,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = scan + max; switch (OP(p)) { case REG_ANY: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (scan < loceol && hardcount < max && *scan != '\n') { scan += UTF8SKIP(scan); @@ -5627,7 +5975,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case SANY: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (scan < loceol && hardcount < max) { scan += UTF8SKIP(scan); @@ -5640,29 +5988,107 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; - case EXACT: /* length of string is 1 */ - c = (U8)*STRING(p); - while (scan < loceol && UCHARAT(scan) == c) - scan++; - break; - case EXACTF: /* length of string is 1 */ + case EXACT: + /* To get here, EXACTish nodes must have *byte* length == 1. That + * means they match only characters in the string that can be expressed + * as a single byte. For non-utf8 strings, that means a simple match. + * For utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as since it's a single byte, it either isn't utf8, or if + * it is, it's an invariant */ + c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (! utf8_target || UNI_IS_INVARIANT(c)) { + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + } + else { + + /* Here, the string is utf8, and the pattern char is different + * in utf8 than not, so can't compare them directly. Outside the + * loop, find find the two utf8 bytes that represent c, and then + * look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } break; - case EXACTFL: /* length of string is 1 */ + case EXACTFL: PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case EXACTF: + case EXACTFU: + + /* The comments for the EXACT case above apply as well to these fold + * ones */ + c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if (utf8_target) { /* Use full Unicode fold matching */ + + /* For the EXACTFL case, It doesn't really make sense to compare + * locale and utf8, but it is best we can do. The documents warn + * against mixing them */ + + char *tmpeol = loceol; + while (hardcount < max + && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, 1, cBOOL(UTF_PATTERN))) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; + } + + /* XXX Note that the above handles properly the German sharp s in + * the pattern matching ss in the string. But it doesn't handle + * properly cases where the string contains say 'LIGATURE ff' and + * the pattern is 'f+'. This would require, say, a new function or + * revised interface to foldEQ_utf8(), in which the maximum number + * of characters to match could be passed and it would return how + * many actually did. This is just one of many cases where + * multi-char folds don't work properly, and so the fix is being + * deferred */ + } + else { + U8 folded; + + /* Here, the string isn't utf8 and c is a single byte; and either + * the pattern isn't utf8 or c is an invariant, so its utf8ness + * doesn't affect c. Can just do simple comparisons for exact or + * fold matching. */ + switch (OP(p)) { + case EXACTF: folded = PL_fold[c]; break; + case EXACTFU: folded = PL_fold_latin1[c]; break; + case EXACTFL: folded = PL_fold_locale[c]; break; + default: Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + } + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == folded)) + { + scan++; + } + } break; + case ANYOFV: case ANYOF: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - reginclass(prog, p, (U8*)scan, 0, do_utf8)) { + reginclass(prog, p, (U8*)scan, 0, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5671,23 +6097,38 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case ALNUM: - if (do_utf8) { + case ALNUMU: + if (utf8_target) { + utf8_wordchar: loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { + swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } - } else { - while (scan < loceol && isALNUM(*scan)) - scan++; + } else { + while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { + scan++; + } + } + break; + case ALNUM: + if (utf8_target) + goto utf8_wordchar; + while (scan < loceol && isALNUM((U8) *scan)) { + scan++; + } + break; + case ALNUMA: + while (scan < loceol && isWORDCHAR_A((U8) *scan)) { + scan++; } break; case ALNUML: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && isALNUM_LC_utf8((U8*)scan)) { @@ -5699,23 +6140,47 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case NALNUM: - if (do_utf8) { + case NALNUMU: + if (utf8_target) { + + utf8_Nwordchar: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { + ! swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } - } else { - while (scan < loceol && !isALNUM(*scan)) + } else { + while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { + scan++; + } + } + break; + case NALNUM: + if (utf8_target) + goto utf8_Nwordchar; + while (scan < loceol && ! isALNUM((U8) *scan)) { + scan++; + } + break; + case NALNUMA: + if (utf8_target) { + while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isWORDCHAR_A((U8) *scan)) { scan++; + } } break; case NALNUML: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { @@ -5727,27 +6192,47 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case SPACE: - if (do_utf8) { + case SPACEU: + if (utf8_target) { + + utf8_space: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } - } else { - while (scan < loceol && isSPACE(*scan)) - scan++; + break; + } + else { + while (scan < loceol && isSPACE_L1((U8) *scan)) { + scan++; + } + break; + } + case SPACE: + if (utf8_target) + goto utf8_space; + + while (scan < loceol && isSPACE((U8) *scan)) { + scan++; + } + break; + case SPACEA: + while (scan < loceol && isSPACE_A((U8) *scan)) { + scan++; } break; case SPACEL: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5756,27 +6241,54 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; - case NSPACE: - if (do_utf8) { + case NSPACEU: + if (utf8_target) { + + utf8_Nspace: + loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && - !(*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { + ! (*scan == ' ' || + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } - } else { - while (scan < loceol && !isSPACE(*scan)) + break; + } + else { + while (scan < loceol && ! isSPACE_L1((U8) *scan)) { + scan++; + } + } + break; + case NSPACE: + if (utf8_target) + goto utf8_Nspace; + + while (scan < loceol && ! isSPACE((U8) *scan)) { + scan++; + } + break; + case NSPACEA: + if (utf8_target) { + while (scan < loceol && ! isSPACE_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isSPACE_A((U8) *scan)) { scan++; + } } break; case NSPACEL: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { + !isSPACE_LC_utf8((U8*)scan)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5786,11 +6298,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case DIGIT: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { + swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5799,12 +6311,31 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } break; + case DIGITA: + while (scan < loceol && isDIGIT_A((U8) *scan)) { + scan++; + } + break; + case DIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && isDIGIT_LC(*scan)) + scan++; + } + break; case NDIGIT: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { + !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5812,8 +6343,35 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) while (scan < loceol && !isDIGIT(*scan)) scan++; } + break; + case NDIGITA: + if (utf8_target) { + while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + scan += UTF8SKIP(scan); + } + } + else { + while (scan < loceol && ! isDIGIT_A((U8) *scan)) { + scan++; + } + } + break; + case NDIGITL: + PL_reg_flags |= RF_tainted; + if (utf8_target) { + loceol = PL_regeol; + while (hardcount < max && scan < loceol && + !isDIGIT_LC_utf8((U8*)scan)) { + scan += UTF8SKIP(scan); + hardcount++; + } + } else { + while (scan < loceol && !isDIGIT_LC(*scan)) + scan++; + } + break; case LNBREAK: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { scan += c; @@ -5832,7 +6390,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case HORIZWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { scan += c; @@ -5844,7 +6402,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NHORIZWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { scan += UTF8SKIP(scan); @@ -5857,7 +6415,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case VERTWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) { scan += c; @@ -5870,7 +6428,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NVERTWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { scan += UTF8SKIP(scan); @@ -5964,142 +6522,371 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool /* - reginclass - determine if a character falls into a character class - The n is the ANYOF regnode, the p is the target string, lenp - is pointer to the maximum length of how far to go in the p - (if the lenp is zero, UTF8SKIP(p) is used), - do_utf8 tells whether the target string is in UTF-8. + n is the ANYOF regnode + p is the target string + lenp is pointer to the maximum number of bytes of how far to go in p + (This is assumed wthout checking to always be at least the current + character's size) + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. If lenp is not NULL, on return + from a successful match, the value it points to will be updated to how many + bytes in p were matched. If there was no match, the value is undefined, + possibly changed from the input. + + Note that this can be a synthetic start class, a combination of various + nodes, so things you think might be mutually exclusive, such as locale, + aren't. It can match both locale and non-locale */ STATIC bool -S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8) +S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; - STRLEN len = 0; - STRLEN plen; + STRLEN c_len = 0; + STRLEN maxlen; PERL_ARGS_ASSERT_REGINCLASS; - if (do_utf8 && !UTF8_IS_INVARIANT(c)) { - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, + /* If c is not already the code point, get it */ + if (utf8_target && !UTF8_IS_INVARIANT(c)) { + c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for * UTF8_ALLOW_FFFF */ - if (len == (STRLEN)-1) + if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); } + else { + c_len = 1; + } + + /* Use passed in max length, or one character if none passed in or less + * than one character. And assume will match just one character. This is + * overwritten later if matched more. */ + if (lenp) { + maxlen = (*lenp > c_len) ? *lenp : c_len; + *lenp = c_len; - plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); - if (do_utf8 || (flags & ANYOF_UNICODE)) { - if (lenp) - *lenp = 0; - if (do_utf8 && !ANYOF_RUNTIME(n)) { - if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) + } + else { + maxlen = c_len; + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { + if (ANYOF_BITMAP_TEST(n, c)) + match = TRUE; + else if (flags & ANYOF_NON_UTF8_LATIN1_ALL + && ! utf8_target + && ! isASCII(c)) + { + match = TRUE; + } + + else if (flags & ANYOF_LOCALE) { + PL_reg_flags |= RF_tainted; + + if ((flags & ANYOF_LOC_NONBITMAP_FOLD) + && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) + { match = TRUE; + } + else if (ANYOF_CLASS_TEST_ANY_SET(n) && + ((ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || + (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) + ) /* How's that for a conditional? */ + ) { + match = TRUE; + } } - if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) - match = TRUE; - if (!match) { + } + + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that */ + if (!match) { + if (utf8_target && (flags & ANYOF_UNICODE_ALL)) { + if (c >= 256 + || ((flags & ANYOF_LOC_NONBITMAP_FOLD) /* Latin1 1 that has a + non-Latin1 fold + should match */ + && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c))) + { + match = TRUE; + } + } + if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8) + || (utf8_target && flags & ANYOF_UTF8))) + { AV *av; SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); - + if (sw) { U8 * utf8_p; - if (do_utf8) { + if (utf8_target) { utf8_p = (U8 *) p; } else { - STRLEN len = 1; + + /* Not utf8. Convert as much of the string as available up + * to the limit of how far the (single) character in the + * pattern can possibly match (no need to go further). If + * the node is a straight ANYOF or not folding, it can't + * match more than one. Otherwise, It can match up to how + * far a single char can fold to. Since not utf8, each + * character is a single byte, so the max it can be in + * bytes is the same as the max it can be in characters */ + STRLEN len = (OP(n) == ANYOF + || ! (flags & ANYOF_LOC_NONBITMAP_FOLD)) + ? 1 + : (maxlen < UTF8_MAX_FOLD_CHAR_EXPAND) + ? maxlen + : UTF8_MAX_FOLD_CHAR_EXPAND; utf8_p = bytes_to_utf8(p, &len); } - if (swash_fetch(sw, utf8_p, 1)) + + if (swash_fetch(sw, utf8_p, TRUE)) match = TRUE; - else if (flags & ANYOF_FOLD) { - if (!match && lenp && av) { + else if (flags & ANYOF_LOC_NONBITMAP_FOLD) { + + /* Here, we need to test if the fold of the target string + * matches. In the case of a multi-char fold that is + * caught by regcomp.c, it has stored all such folds into + * 'av'; we linearly check to see if any match the target + * string (folded). We know that the originals were each + * one character, but we don't currently know how many + * characters/bytes each folded to, except we do know that + * there are small limits imposed by Unicode. XXX A + * performance enhancement would be to have regcomp.c store + * the max number of chars/bytes that are in an av entry, + * as, say the 0th element. Even better would be to have a + * hash of the few characters that can start a multi-char + * fold to the max number of chars of those folds. + * + * Further down, if there isn't a + * match in the av, we will check if there is another + * fold-type match. For that, we also need the fold, but + * only the first character. No sense in folding it twice, + * so we do it here, even if there isn't any multi-char + * fold, so we always fold at least the first character. + * If the node is a straight ANYOF node, or there is only + * one character available in the string, or if there isn't + * any av, that's all we have to fold. In the case of a + * multi-char fold, we do have guarantees in Unicode that + * it can only expand up to so many characters and so many + * bytes. We keep track so don't exceed either. + * + * If there is a match, we will need to advance (if lenp is + * specified) the match pointer in the target string. But + * what we are comparing here isn't that string directly, + * but its fold, whose length may differ from the original. + * As we go along in constructing the fold, therefore, we + * create a map so that we know how many bytes in the + * source to advance given that we have matched a certain + * number of bytes in the fold. This map is stored in + * 'map_fold_len_back'. The first character in the fold + * has array element 1 contain the number of bytes in the + * source that folded to it; the 2nd is the cumulative + * number to match it; ... */ + U8 map_fold_len_back[UTF8_MAX_FOLD_CHAR_EXPAND] = { 0 }; + U8 folded[UTF8_MAXBYTES_CASE+1]; + STRLEN foldlen = 0; /* num bytes in fold of 1st char */ + STRLEN foldlen_for_av; /* num bytes in fold of all chars */ + + if (OP(n) == ANYOF || maxlen == 1 || ! lenp || ! av) { + + /* Here, only need to fold the first char of the target + * string */ + to_utf8_fold(utf8_p, folded, &foldlen); + foldlen_for_av = foldlen; + map_fold_len_back[1] = UTF8SKIP(utf8_p); + } + else { + + /* Here, need to fold more than the first char. Do so + * up to the limits */ + UV which_char = 0; + U8* source_ptr = utf8_p; /* The source for the fold + is the regex target + string */ + U8* folded_ptr = folded; + U8* e = utf8_p + maxlen; /* Can't go beyond last + available byte in the + target string */ + while (which_char < UTF8_MAX_FOLD_CHAR_EXPAND + && source_ptr < e) + { + + /* Fold the next character */ + U8 this_char_folded[UTF8_MAXBYTES_CASE+1]; + STRLEN this_char_foldlen; + to_utf8_fold(source_ptr, + this_char_folded, + &this_char_foldlen); + + /* Bail if it would exceed the byte limit for + * folding a single char. */ + if (this_char_foldlen + folded_ptr - folded > + UTF8_MAXBYTES_CASE) + { + break; + } + + /* Save the first character's folded length, in + * case we have to use it later */ + if (! foldlen) { + foldlen = this_char_foldlen; + } + + /* Here, add the fold of this character */ + Copy(this_char_folded, + folded_ptr, + this_char_foldlen, + U8); + which_char++; + map_fold_len_back[which_char] = + map_fold_len_back[which_char - 1] + + UTF8SKIP(source_ptr); + folded_ptr += this_char_foldlen; + source_ptr += UTF8SKIP(source_ptr); + } + *folded_ptr = '\0'; + foldlen_for_av = folded_ptr - folded; + } + + + /* Do the linear search to see if the fold is in the list + * of multi-char folds. (Useless to look if won't be able + * to store that it is a multi-char fold in *lenp) */ + if (lenp && av) { I32 i; for (i = 0; i <= av_len(av); i++) { SV* const sv = *av_fetch(av, i, FALSE); STRLEN len; const char * const s = SvPV_const(sv, len); - if (len <= plen && memEQ(s, (char*)utf8_p, len)) { - *lenp = len; + if (len <= foldlen_for_av && memEQ(s, + (char*)folded, + len)) + { + + /* Advance the target string ptr to account for + * this fold, but have to translate from the + * folded length to the corresponding source + * length. The array is indexed by how many + * characters in the match */ + *lenp = map_fold_len_back[ + utf8_length(folded, folded + len)]; match = TRUE; break; } } } - if (!match) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; +#if 0 + if (!match) { /* See if the folded version matches */ + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above would have + * just folded the 'k' to itself, and that isn't going + * to match 'K'. So we look through the closure of + * everything that folds to 'k'. That will find the + * 'K'. Initialize the list, if necessary */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a fold + * function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } - STRLEN tmplen; - to_utf8_fold(utf8_p, tmpbuf, &tmplen); - if (swash_fetch(sw, tmpbuf, 1)) - match = TRUE; + /* The data structure is a hash with the keys every + * character that is folded to, like 'k', and the + * values each an array of everything that folds to its + * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) folded, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV i; + for (i = 0; i <= av_len(list); i++) { + SV** try_p = av_fetch(list, i, FALSE); + char* try_c; + if (try_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + /* Don't have to worry about embedded nulls + * since NULL isn't folded or foldable */ + try_c = SvPVX(*try_p); + + /* The fold in a few cases of an above Latin1 + * char is in the Latin1 range, and hence may + * be in the bitmap */ + if (UTF8_IS_INVARIANT(*try_c) + && ANYOF_BITMAP_TEST(n, + UNI_TO_NATIVE(*try_c))) + { + match = TRUE; + break; + } + else if + (UTF8_IS_DOWNGRADEABLE_START(*try_c) + && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE( + TWO_BYTE_UTF8_TO_UNI(try_c[0], + try_c[1])))) + { + /* Since the fold comes from internally + * generated data, we can safely assume it + * is valid utf8 in the test above */ + match = TRUE; + break; + } else if (swash_fetch(sw, (U8*) try_c, TRUE)) { + match = TRUE; + break; + } + } + } } +#endif } /* If we allocated a string above, free it */ - if (! do_utf8) Safefree(utf8_p); - } - } - if (match && lenp && *lenp == 0) - *lenp = UNISKIP(NATIVE_TO_UNI(c)); - } - if (!match && c < 256) { - if (ANYOF_BITMAP_TEST(n, c)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - U8 f; - - if (flags & ANYOF_LOCALE) { - PL_reg_flags |= RF_tainted; - f = PL_fold_locale[c]; - } - else - f = PL_fold[c]; - if (f != c && ANYOF_BITMAP_TEST(n, f)) - match = TRUE; - } - - if (!match && (flags & ANYOF_CLASS)) { - PL_reg_flags |= RF_tainted; - if ( - (ANYOF_CLASS_TEST(n, ANYOF_ALNUM) && isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUM) && !isALNUM_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_SPACE) && isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NSPACE) && !isSPACE_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_DIGIT) && isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NDIGIT) && !isDIGIT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALNUMC) && isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALNUMC) && !isALNUMC_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ALPHA) && isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NALPHA) && !isALPHA_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_ASCII) && isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NASCII) && !isASCII(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_CNTRL) && isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NCNTRL) && !isCNTRL_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_GRAPH) && isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NGRAPH) && !isGRAPH_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_LOWER) && isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NLOWER) && !isLOWER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PRINT) && isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPRINT) && !isPRINT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PUNCT) && isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPUNCT) && !isPUNCT_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_UPPER) && isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NUPPER) && !isUPPER_LC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_XDIGIT) && isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NXDIGIT) && !isXDIGIT(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_PSXSPC) && isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NPSXSPC) && !isPSXSPC(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_BLANK) && isBLANK(c)) || - (ANYOF_CLASS_TEST(n, ANYOF_NBLANK) && !isBLANK(c)) - ) /* How's that for a conditional? */ - { - match = TRUE; + if (! utf8_target) Safefree(utf8_p); } } }