X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/d02d6d97d5eefad4e164003699595f59abb06506..9946686ee0b3ada8e0512262416bb5150e0e0108:/regexec.c diff --git a/regexec.c b/regexec.c index a8338ed..a6da6ce 100644 --- a/regexec.c +++ b/regexec.c @@ -85,7 +85,7 @@ #define RF_utf8 8 /* Pattern contains multibyte chars? */ -#define UTF ((PL_reg_flags & RF_utf8) != 0) +#define UTF_PATTERN ((PL_reg_flags & RF_utf8) != 0) #define RS_init 1 /* eval environment created */ #define RS_set 2 /* replsv value is set */ @@ -94,13 +94,17 @@ #define STATIC static #endif -#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) : ANYOF_BITMAP_TEST(p,*(c))) +/* Valid for non-utf8 strings only: avoids the reginclass call if there are no + * complications: i.e., if everything matchable is straight forward in the + * bitmap */ +#define REGINCLASS(prog,p,c) (ANYOF_FLAGS(p) ? reginclass(prog,p,c,0,0) \ + : ANYOF_BITMAP_TEST(p,*(c))) /* * Forwards. */ -#define CHR_SVLEN(sv) (do_utf8 ? sv_len_utf8(sv) : SvCUR(sv)) +#define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (PL_reg_match_utf8 ? utf8_distance(a,b) : a - b) #define HOPc(pos,off) \ @@ -176,83 +180,113 @@ #endif -#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME: \ - if (!nextchr) \ - sayNO; \ - if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if (!(OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ +#define _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + /* FALL THROUGH */ \ + case NAME: \ + if (!nextchr) \ + sayNO; \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + if (!CAT2(PL_utf8_,CLASS)) { \ + bool ok; \ + ENTER; \ + save_re_context(); \ + ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ + assert(ok); \ + LEAVE; \ + } \ + if (!(OP(scan) == NAME \ + ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ + : LCFUNC_utf8((U8*)locinput))) \ + { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } \ + /* Drops through to the macro that calls this one */ + +#define CCC_TRY_AFF(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ + _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if (!(OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ break -#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ - case NAMEL: \ - PL_reg_flags |= RF_tainted; \ - /* FALL THROUGH */ \ - case NAME : \ - if (!nextchr && locinput >= PL_regeol) \ - sayNO; \ - if (do_utf8 && UTF8_IS_CONTINUED(nextchr)) { \ - if (!CAT2(PL_utf8_,CLASS)) { \ - bool ok; \ - ENTER; \ - save_re_context(); \ - ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ - assert(ok); \ - LEAVE; \ - } \ - if ((OP(scan) == NAME \ - ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, do_utf8)) \ - : LCFUNC_utf8((U8*)locinput))) \ - { \ - sayNO; \ - } \ - locinput += PL_utf8skip[nextchr]; \ - nextchr = UCHARAT(locinput); \ - break; \ - } \ - if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ - sayNO; \ - nextchr = UCHARAT(++locinput); \ +/* Almost identical to the above, but has a case for a node that matches chars + * between 128 and 255 using Unicode (latin1) semantics. */ +#define CCC_TRY_AFF_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ + _CCC_TRY_AFF_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if (!(OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ break +#define _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + case NAMEL: \ + PL_reg_flags |= RF_tainted; \ + /* FALL THROUGH */ \ + case NAME : \ + if (!nextchr && locinput >= PL_regeol) \ + sayNO; \ + if (utf8_target && UTF8_IS_CONTINUED(nextchr)) { \ + if (!CAT2(PL_utf8_,CLASS)) { \ + bool ok; \ + ENTER; \ + save_re_context(); \ + ok=CAT2(is_utf8_,CLASS)((const U8*)STR); \ + assert(ok); \ + LEAVE; \ + } \ + if ((OP(scan) == NAME \ + ? cBOOL(swash_fetch(CAT2(PL_utf8_,CLASS), (U8*)locinput, utf8_target)) \ + : LCFUNC_utf8((U8*)locinput))) \ + { \ + sayNO; \ + } \ + locinput += PL_utf8skip[nextchr]; \ + nextchr = UCHARAT(locinput); \ + break; \ + } + +#define CCC_TRY_NEG(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC,LCFUNC) \ + _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNC) \ + if ((OP(scan) == NAME ? FUNC(nextchr) : LCFUNC(nextchr))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ + break +#define CCC_TRY_NEG_U(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU,LCFUNC) \ + _CCC_TRY_NEG_COMMON(NAME,NAMEL,CLASS,STR,LCFUNC_utf8,FUNCU) \ + if ((OP(scan) == NAMEL ? LCFUNC(nextchr) : (FUNCU(nextchr) && (isASCII(nextchr) || (FLAGS(scan) & USE_UNI))))) \ + sayNO; \ + nextchr = UCHARAT(++locinput); \ + break + /* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */ /* for use after a quantifier and before an EXACT-like node -- japhy */ -/* it would be nice to rework regcomp.sym to generate this stuff. sigh */ +/* it would be nice to rework regcomp.sym to generate this stuff. sigh + * + * NOTE that *nothing* that affects backtracking should be in here, specifically + * VERBS must NOT be included. JUMPABLE is used to determine if we can ignore a + * node that is in between two EXACT like nodes when ascertaining what the required + * "follow" character is. This should probably be moved to regex compile time + * although it may be done at run time beause of the REF possibility - more + * investigation required. -- demerphq +*/ #define JUMPABLE(rn) ( \ OP(rn) == OPEN || \ (OP(rn) == CLOSE && (!cur_eval || cur_eval->u.eval.close_paren != ARG(rn))) || \ OP(rn) == EVAL || \ OP(rn) == SUSPEND || OP(rn) == IFMATCH || \ OP(rn) == PLUS || OP(rn) == MINMOD || \ - OP(rn) == KEEPS || (PL_regkind[OP(rn)] == VERB) || \ + OP(rn) == KEEPS || \ (PL_regkind[OP(rn)] == CURLY && ARG1(rn) > 0) \ ) #define IS_EXACT(rn) (PL_regkind[OP(rn)] == EXACT) @@ -515,7 +549,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, register SV *check; char *strbeg; char *t; - const bool do_utf8 = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ + const bool utf8_target = (sv && SvUTF8(sv)) ? 1 : 0; /* if no sv we have to assume bytes */ I32 ml_anch; register char *other_last = NULL; /* other substr checked before this */ char *check_at = NULL; /* check substr found at this pos */ @@ -528,13 +562,13 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, PERL_ARGS_ASSERT_RE_INTUIT_START; - RX_MATCH_UTF8_set(rx,do_utf8); + RX_MATCH_UTF8_set(rx,utf8_target); if (RX_UTF8(rx)) { PL_reg_flags |= RF_utf8; } DEBUG_EXECUTE_r( - debug_start_match(rx, do_utf8, strpos, strend, + debug_start_match(rx, utf8_target, strpos, strend, sv ? "Guessing start of match in sv for" : "Guessing start of match in string for"); ); @@ -548,7 +582,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, strbeg = (sv && SvPOK(sv)) ? strend - SvCUR(sv) : strpos; PL_regeol = strend; - if (do_utf8) { + if (utf8_target) { if (!prog->check_utf8 && prog->check_substr) to_utf8_substr(prog); check = prog->check_utf8; @@ -700,11 +734,11 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, unshift s. */ DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(check), RE_SV_DUMPLEN(check), 30); PerlIO_printf(Perl_debug_log, "%s %s substr %s%s%s", (s ? "Found" : "Did not find"), - (check == (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) + (check == (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) ? "anchored" : "floating"), quoted, RE_SV_TAIL(check), @@ -735,14 +769,14 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, Probably it is right to do no SCREAM here... */ - if (do_utf8 ? (prog->float_utf8 && prog->anchored_utf8) + if (utf8_target ? (prog->float_utf8 && prog->anchored_utf8) : (prog->float_substr && prog->anchored_substr)) { /* Take into account the "other" substring. */ /* XXXX May be hopelessly wrong for UTF... */ if (!other_last) other_last = strpos; - if (check == (do_utf8 ? prog->float_utf8 : prog->float_substr)) { + if (check == (utf8_target ? prog->float_utf8 : prog->float_substr)) { do_other_anchored: { char * const last = HOP3c(s, -start_shift, strbeg); @@ -752,7 +786,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, t = s - prog->check_offset_max; if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!do_utf8 + && (!utf8_target || ((t = (char*)reghopmaybe3((U8*)s, -(prog->check_offset_max), (U8*)strpos)) && t > strpos))) NOOP; @@ -771,7 +805,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, */ /* On end-of-str: see comment below. */ - must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; + must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; if (must == &PL_sv_undef) { s = (char*)NULL; DEBUG_r(must = prog->anchored_utf8); /* for debug */ @@ -785,7 +819,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, multiline ? FBMrf_MULTILINE : 0 ); DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "%s anchored substr %s%s", (s ? "Found" : "Contradicts"), @@ -832,7 +866,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, if (s < other_last) s = other_last; /* XXXX It is not documented what units *_offsets are in. Assume bytes. */ - must = do_utf8 ? prog->float_utf8 : prog->float_substr; + must = utf8_target ? prog->float_utf8 : prog->float_substr; /* fbm_instr() takes into account exact value of end-of-str if the check is SvTAIL(ed). Since false positives are OK, and end-of-str is not later than strend we are OK. */ @@ -846,7 +880,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, - (SvTAIL(must)!=0), must, multiline ? FBMrf_MULTILINE : 0); DEBUG_EXECUTE_r({ - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "%s floating substr %s%s", (s ? "Found" : "Contradicts"), @@ -893,7 +927,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, ); if (s - strpos > prog->check_offset_max /* signed-corrected t > strpos */ - && (!do_utf8 + && (!utf8_target || ((t = (char*)reghopmaybe3((U8*)s, -prog->check_offset_max, (U8*) ((prog->check_offset_max<0) ? strend : strpos))) && t > strpos))) { @@ -911,7 +945,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, while (t < strend - prog->minlen) { if (*t == '\n') { if (t < check_at - prog->check_offset_min) { - if (do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) { + if (utf8_target ? prog->anchored_utf8 : prog->anchored_substr) { /* Since we moved from the found position, we definitely contradict the found anchored substr. Due to the above check we do not @@ -951,7 +985,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, } s = t; set_useful: - ++BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ + ++BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr); /* hooray/5 */ } else { /* The found string does not prohibit matching at strpos, @@ -975,7 +1009,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, ); success_at_start: if (!(prog->intflags & PREGf_NAUGHTY) /* XXXX If strpos moved? */ - && (do_utf8 ? ( + && (utf8_target ? ( prog->check_utf8 /* Could be deleted already */ && --BmUSEFUL(prog->check_utf8) < 0 && (prog->check_utf8 == prog->float_utf8) @@ -987,17 +1021,22 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, { /* If flags & SOMETHING - do not do it many times on the same match */ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "... Disabling check substring...\n")); - /* XXX Does the destruction order has to change with do_utf8? */ - SvREFCNT_dec(do_utf8 ? prog->check_utf8 : prog->check_substr); - SvREFCNT_dec(do_utf8 ? prog->check_substr : prog->check_utf8); + /* XXX Does the destruction order has to change with utf8_target? */ + SvREFCNT_dec(utf8_target ? prog->check_utf8 : prog->check_substr); + SvREFCNT_dec(utf8_target ? prog->check_substr : prog->check_utf8); prog->check_substr = prog->check_utf8 = NULL; /* disable */ prog->float_substr = prog->float_utf8 = NULL; /* clear */ check = NULL; /* abort */ s = strpos; + /* XXXX If the check string was an implicit check MBOL, then we need to unset the relevent flag + see http://bugs.activestate.com/show_bug.cgi?id=87173 */ + if (prog->intflags & PREGf_IMPLICIT) + prog->extflags &= ~RXf_ANCH_MBOL; /* XXXX This is a remnant of the old implementation. It looks wasteful, since now INTUIT can use many other heuristics. */ prog->extflags &= ~RXf_USE_INTUIT; + /* XXXX What other flags might need to be cleared in this branch? */ } else s = strpos; @@ -1048,7 +1087,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, goto fail; /* Contradict one of substrings */ if (prog->anchored_substr || prog->anchored_utf8) { - if ((do_utf8 ? prog->anchored_utf8 : prog->anchored_substr) == check) { + if ((utf8_target ? prog->anchored_utf8 : prog->anchored_substr) == check) { DEBUG_EXECUTE_r( what = "anchored" ); hop_and_restart: s = HOP3c(t, 1, strend); @@ -1088,7 +1127,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, PL_colors[0], PL_colors[1], (long)(t - i_strpos)) ); goto try_at_offset; } - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) /* Could have been deleted */ goto fail; /* Check is floating subtring. */ retry_floating_check: @@ -1116,7 +1155,7 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, fail_finish: /* Substring not found */ if (prog->check_substr || prog->check_utf8) /* could be removed already */ - BmUSEFUL(do_utf8 ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ + BmUSEFUL(utf8_target ? prog->check_utf8 : prog->check_substr) += 5; /* hooray */ fail: DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch rejected by optimizer%s\n", PL_colors[4], PL_colors[5])); @@ -1126,8 +1165,8 @@ Perl_re_intuit_start(pTHX_ REGEXP * const rx, SV *sv, char *strpos, #define DECL_TRIE_TYPE(scan) \ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold } \ trie_type = (scan->flags != EXACT) \ - ? (do_utf8 ? trie_utf8_fold : (UTF ? trie_latin_utf8_fold : trie_plain)) \ - : (do_utf8 ? trie_utf8 : trie_plain) + ? (utf8_target ? trie_utf8_fold : (UTF_PATTERN ? trie_latin_utf8_fold : trie_plain)) \ + : (utf8_target ? trie_utf8 : trie_plain) #define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, \ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ @@ -1184,8 +1223,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ char *my_strend= (char *)strend; \ if ( (CoNd) \ && (ln == len || \ - foldEQ_utf8(s, &my_strend, 0, do_utf8, \ - m, NULL, ln, cBOOL(UTF))) \ + foldEQ_utf8(s, &my_strend, 0, utf8_target, \ + m, NULL, ln, cBOOL(UTF_PATTERN))) \ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ else { \ @@ -1195,8 +1234,8 @@ uvc, charid, foldlen, foldbuf, uniflags) STMT_START { \ if ( f != c \ && (f == c1 || f == c2) \ && (ln == len || \ - foldEQ_utf8(s, &my_strend, 0, do_utf8,\ - m, NULL, ln, cBOOL(UTF)))\ + foldEQ_utf8(s, &my_strend, 0, utf8_target,\ + m, NULL, ln, cBOOL(UTF_PATTERN)))\ && (!reginfo || regtry(reginfo, &s)) ) \ goto got_it; \ } \ @@ -1261,7 +1300,7 @@ if ((!reginfo || regtry(reginfo, &s))) \ goto got_it #define REXEC_FBC_CSCAN(CoNdUtF8,CoNd) \ - if (do_utf8) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ @@ -1270,7 +1309,7 @@ if ((!reginfo || regtry(reginfo, &s))) \ break #define REXEC_FBC_CSCAN_PRELOAD(UtFpReLoAd,CoNdUtF8,CoNd) \ - if (do_utf8) { \ + if (utf8_target) { \ UtFpReLoAd; \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ @@ -1281,7 +1320,7 @@ if ((!reginfo || regtry(reginfo, &s))) \ #define REXEC_FBC_CSCAN_TAINT(CoNdUtF8,CoNd) \ PL_reg_flags |= RF_tainted; \ - if (do_utf8) { \ + if (utf8_target) { \ REXEC_FBC_UTF8_CLASS_SCAN(CoNdUtF8); \ } \ else { \ @@ -1311,7 +1350,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, unsigned int c2; char *e; register I32 tmp = 1; /* Scratch variable? */ - register const bool do_utf8 = PL_reg_match_utf8; + register const bool utf8_target = PL_reg_match_utf8; RXi_GET_DECL(prog,progi); PERL_ARGS_ASSERT_FIND_BYCLASS; @@ -1319,10 +1358,10 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* We know what class it must start with. */ switch (OP(c)) { case ANYOF: - if (do_utf8) { + if (utf8_target) { REXEC_FBC_UTF8_CLASS_SCAN((ANYOF_FLAGS(c) & ANYOF_UNICODE) || !UTF8_IS_INVARIANT((U8)s[0]) ? - reginclass(prog, c, (U8*)s, 0, do_utf8) : + reginclass(prog, c, (U8*)s, 0, utf8_target) : REGINCLASS(prog, c, (U8*)s)); } else { @@ -1357,7 +1396,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, m = STRING(c); ln = STR_LEN(c); /* length to match in octets/bytes */ lnc = (I32) ln; /* length to match in characters */ - if (UTF) { + if (UTF_PATTERN) { STRLEN ulen1, ulen2; U8 *sm = (U8 *) m; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; @@ -1418,7 +1457,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * matching (called "loose matching" in Unicode). * foldEQ_utf8() will do just that. */ - if (do_utf8 || UTF) { + if (utf8_target || UTF_PATTERN) { UV c, f; U8 tmpbuf [UTF8_MAXBYTES+1]; STRLEN len = 1; @@ -1428,7 +1467,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Upper and lower of 1st char are equal - * probably not a "letter". */ while (s <= e) { - if (do_utf8) { + if (utf8_target) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); } else { @@ -1439,7 +1478,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { while (s <= e) { - if (do_utf8) { + if (utf8_target) { c = utf8n_to_uvchr((U8*)s, UTF8_MAXBYTES, &len, uniflags); } else { @@ -1473,7 +1512,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case BOUND: - if (do_utf8) { + if (utf8_target) { if (s == PL_bostr) tmp = '\n'; else { @@ -1485,7 +1524,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LOAD_UTF8_CHARCLASS_ALNUM(); REXEC_FBC_UTF8_SCAN( if (tmp == !(OP(c) == BOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) : + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : isALNUM_LC_utf8((U8*)s))) { tmp = !tmp; @@ -1493,12 +1532,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } ); } - else { + else { /* Not utf8 */ tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == BOUND ? isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == BOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( if (tmp == - !(OP(c) == BOUND ? isALNUM(*s) : isALNUM_LC(*s))) { + !((OP(c) == BOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; REXEC_FBC_TRYIT; } @@ -1511,7 +1557,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, PL_reg_flags |= RF_tainted; /* FALL THROUGH */ case NBOUND: - if (do_utf8) { + if (utf8_target) { if (s == PL_bostr) tmp = '\n'; else { @@ -1523,7 +1569,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, LOAD_UTF8_CHARCLASS_ALNUM(); REXEC_FBC_UTF8_SCAN( if (tmp == !(OP(c) == NBOUND ? - cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, do_utf8)) : + cBOOL(swash_fetch(PL_utf8_alnum, (U8*)s, utf8_target)) : isALNUM_LC_utf8((U8*)s))) tmp = !tmp; else REXEC_FBC_TRYIT; @@ -1531,12 +1577,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } else { tmp = (s != PL_bostr) ? UCHARAT(s - 1) : '\n'; - tmp = ((OP(c) == NBOUND ? - isALNUM(tmp) : isALNUM_LC(tmp)) != 0); + tmp = cBOOL((OP(c) == NBOUNDL) + ? isALNUM_LC(tmp) + : (isWORDCHAR_L1(tmp) + && (isASCII(tmp) || (FLAGS(c) & USE_UNI)))); REXEC_FBC_SCAN( - if (tmp == - !(OP(c) == NBOUND ? isALNUM(*s) : isALNUM_LC(*s))) + if (tmp == ! cBOOL( + (OP(c) == NBOUNDL) + ? isALNUM_LC(*s) + : (isWORDCHAR_L1((U8) *s) + && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))))) + { tmp = !tmp; + } else REXEC_FBC_TRYIT; ); } @@ -1546,8 +1599,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ALNUM: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), - swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), - isALNUM(*s) + swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), + (FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s) ); case ALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1557,8 +1610,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NALNUM: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_WORD(), - !swash_fetch(RE_utf8_perl_word, (U8*)s, do_utf8), - !isALNUM(*s) + !swash_fetch(RE_utf8_perl_word, (U8*)s, utf8_target), + ! ((FLAGS(c) & USE_UNI) ? isWORDCHAR_L1((U8) *s) : isALNUM(*s)) ); case NALNUML: REXEC_FBC_CSCAN_TAINT( @@ -1568,8 +1621,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case SPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), - *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8), - isSPACE(*s) + *s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target), + isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI)) ); case SPACEL: REXEC_FBC_CSCAN_TAINT( @@ -1579,8 +1632,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NSPACE: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_PERL_SPACE(), - !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, do_utf8)), - !isSPACE(*s) + !(*s == ' ' || swash_fetch(RE_utf8_perl_space,(U8*)s, utf8_target)), + !(isSPACE_L1((U8) *s) && (isASCII((U8) *s) || (FLAGS(c) & USE_UNI))) ); case NSPACEL: REXEC_FBC_CSCAN_TAINT( @@ -1590,7 +1643,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case DIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), + swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), isDIGIT(*s) ); case DIGITL: @@ -1601,7 +1654,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case NDIGIT: REXEC_FBC_CSCAN_PRELOAD( LOAD_UTF8_CHARCLASS_POSIX_DIGIT(), - !swash_fetch(RE_utf8_posix_digit,(U8*)s, do_utf8), + !swash_fetch(RE_utf8_posix_digit,(U8*)s, utf8_target), !isDIGIT(*s) ); case NDIGITL: @@ -1723,14 +1776,20 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, DEBUG_TRIE_EXECUTE_r( if ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { dump_exec_pos( (char *)uc, c, strend, real_start, - (char *)uc, do_utf8 ); + (char *)uc, utf8_target ); PerlIO_printf( Perl_debug_log, " Scanning for legal start char...\n"); } - ); - while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { - uc++; - } + ); + if (utf8_target) { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc += UTF8SKIP(uc); + } + } else { + while ( uc <= (U8*)last_start && !BITMAP_TEST(bitmap,*uc) ) { + uc++; + } + } s= (char *)uc; } if (uc >(U8*)last_start) break; @@ -1751,7 +1810,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, foldbuf, uniflags); DEBUG_TRIE_EXECUTE_r({ dump_exec_pos( (char *)uc, c, strend, real_start, - s, do_utf8 ); + s, utf8_target ); PerlIO_printf(Perl_debug_log, " Charid:%3u CP:%4"UVxf" ", charid, uvc); @@ -1766,7 +1825,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, DEBUG_TRIE_EXECUTE_r({ if (failed) dump_exec_pos( (char *)uc, c, strend, real_start, - s, do_utf8 ); + s, utf8_target ); PerlIO_printf( Perl_debug_log, "%sState: %4"UVxf", word=%"UVxf, failed ? " Fail transition to " : "", @@ -1877,7 +1936,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre I32 end_shift = 0; /* Same for the end. */ /* CC */ I32 scream_pos = -1; /* Internal iterator of scream. */ char *scream_olds = NULL; - const bool do_utf8 = cBOOL(DO_UTF8(sv)); + const bool utf8_target = cBOOL(DO_UTF8(sv)); I32 multiline; RXi_GET_DECL(prog,progi); regmatch_info reginfo; /* create some info to pass to regtry etc */ @@ -1896,9 +1955,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre multiline = prog->extflags & RXf_PMf_MULTILINE; reginfo.prog = rx; /* Yes, sorry that this is confusing. */ - RX_MATCH_UTF8_set(rx, do_utf8); + RX_MATCH_UTF8_set(rx, utf8_target); DEBUG_EXECUTE_r( - debug_start_match(rx, do_utf8, startpos, strend, + debug_start_match(rx, utf8_target, startpos, strend, "Matching"); ); @@ -2013,33 +2072,68 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre end = HOP3c(strend, -dontbother, strbeg) - 1; /* for multiline we only have to try after newlines */ if (prog->check_substr || prog->check_utf8) { - if (s == startpos) - goto after_try; - while (1) { - if (regtry(®info, &s)) - goto got_it; - after_try: - if (s > end) - goto phooey; - if (prog->extflags & RXf_USE_INTUIT) { - s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); - if (!s) - goto phooey; - } - else - s++; - } - } else { - if (s > startpos) + /* because of the goto we can not easily reuse the macros for bifurcating the + unicode/non-unicode match modes here like we do elsewhere - demerphq */ + if (utf8_target) { + if (s == startpos) + goto after_try_utf8; + while (1) { + if (regtry(®info, &s)) { + goto got_it; + } + after_try_utf8: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + UTF8SKIP(s), strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s += UTF8SKIP(s); + } + } + } /* end search for check string in unicode */ + else { + if (s == startpos) { + goto after_try_latin; + } + while (1) { + if (regtry(®info, &s)) { + goto got_it; + } + after_try_latin: + if (s > end) { + goto phooey; + } + if (prog->extflags & RXf_USE_INTUIT) { + s = re_intuit_start(rx, sv, s + 1, strend, flags, NULL); + if (!s) { + goto phooey; + } + } + else { + s++; + } + } + } /* end search for check string in latin*/ + } /* end search for check string */ + else { /* search for newline */ + if (s > startpos) { + /*XXX: The s-- is almost definitely wrong here under unicode - demeprhq*/ s--; + } + /* We can use a more efficient search as newlines are the same in unicode as they are in latin */ while (s < end) { if (*s++ == '\n') { /* don't need PL_utf8skip here */ if (regtry(®info, &s)) goto got_it; } - } - } - } + } + } /* end search for newline */ + } /* end anchored/multiline check string search */ goto phooey; } else if (RXf_GPOS_CHECK == (prog->extflags & RXf_GPOS_CHECK)) { @@ -2056,16 +2150,16 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre /* Messy cases: unanchored match. */ if ((prog->anchored_substr || prog->anchored_utf8) && prog->intflags & PREGf_SKIP) { /* we have /x+whatever/ */ - /* it must be a one character string (XXXX Except UTF?) */ + /* it must be a one character string (XXXX Except UTF_PATTERN?) */ char ch; #ifdef DEBUGGING int did_match = 0; #endif - if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - ch = SvPVX_const(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)[0]; + if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + ch = SvPVX_const(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)[0]; - if (do_utf8) { + if (utf8_target) { REXEC_FBC_SCAN( if (*s == ch) { DEBUG_EXECUTE_r( did_match = 1 ); @@ -2105,14 +2199,14 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre int did_match = 0; #endif if (prog->anchored_substr || prog->anchored_utf8) { - if (!(do_utf8 ? prog->anchored_utf8 : prog->anchored_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - must = do_utf8 ? prog->anchored_utf8 : prog->anchored_substr; + if (!(utf8_target ? prog->anchored_utf8 : prog->anchored_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + must = utf8_target ? prog->anchored_utf8 : prog->anchored_substr; back_max = back_min = prog->anchored_offset; } else { - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - must = do_utf8 ? prog->float_utf8 : prog->float_substr; + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + must = utf8_target ? prog->float_utf8 : prog->float_substr; back_max = prog->float_max_offset; back_min = prog->float_min_offset; } @@ -2160,7 +2254,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre last1 = HOPc(s, -back_min); s = t; } - if (do_utf8) { + if (utf8_target) { while (s <= last1) { if (regtry(®info, &s)) goto got_it; @@ -2176,7 +2270,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre } } DEBUG_EXECUTE_r(if (!did_match) { - RE_PV_QUOTED_DECL(quoted, do_utf8, PERL_DEBUG_PAD_ZERO(0), + RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0), SvPVX_const(must), RE_SV_DUMPLEN(must), 30); PerlIO_printf(Perl_debug_log, "Did not find %s substr %s%s...\n", ((must == prog->anchored_substr || must == prog->anchored_utf8) @@ -2196,7 +2290,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre SV * const prop = sv_newmortal(); regprop(prog, prop, c); { - RE_PV_QUOTED_DECL(quoted,do_utf8,PERL_DEBUG_PAD_ZERO(1), + RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), s,strend-s,60); PerlIO_printf(Perl_debug_log, "Matching stclass %.*s against %s (%d bytes)\n", @@ -2215,9 +2309,9 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre char *last; SV* float_real; - if (!(do_utf8 ? prog->float_utf8 : prog->float_substr)) - do_utf8 ? to_utf8_substr(prog) : to_byte_substr(prog); - float_real = do_utf8 ? prog->float_utf8 : prog->float_substr; + if (!(utf8_target ? prog->float_utf8 : prog->float_substr)) + utf8_target ? to_utf8_substr(prog) : to_byte_substr(prog); + float_real = utf8_target ? prog->float_utf8 : prog->float_substr; if (flags & REXEC_SCREAM) { last = screaminstr(sv, float_real, s - strbeg, @@ -2261,7 +2355,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, register char *stre dontbother = minlen - 1; strend -= dontbother; /* this one's always in bytes! */ /* We don't know much -- general case. */ - if (do_utf8) { + if (utf8_target) { for (;;) { if (regtry(®info, &s)) goto got_it; @@ -2683,7 +2777,7 @@ regmatch(), slabs allocated since entry are freed. #define DEBUG_STATE_pp(pp) \ DEBUG_STATE_r({ \ - DUMP_EXEC_POS(locinput, scan, do_utf8); \ + DUMP_EXEC_POS(locinput, scan, utf8_target); \ PerlIO_printf(Perl_debug_log, \ " %*s"pp" %s%s%s%s%s\n", \ depth*2, "", \ @@ -2701,7 +2795,7 @@ regmatch(), slabs allocated since entry are freed. #ifdef DEBUGGING STATIC void -S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, +S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, const char *start, const char *end, const char *blurb) { const bool utf8_pat = RX_UTF8(prog) ? 1 : 0; @@ -2714,18 +2808,18 @@ S_debug_start_match(pTHX_ const REGEXP *prog, const bool do_utf8, RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); - RE_PV_QUOTED_DECL(s1, do_utf8, PERL_DEBUG_PAD_ZERO(1), + RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), start, end - start, 60); PerlIO_printf(Perl_debug_log, "%s%s REx%s %s against %s\n", PL_colors[4], blurb, PL_colors[5], s0, s1); - if (do_utf8||utf8_pat) + if (utf8_target||utf8_pat) PerlIO_printf(Perl_debug_log, "UTF-8 %s%s%s...\n", utf8_pat ? "pattern" : "", - utf8_pat && do_utf8 ? " and " : "", - do_utf8 ? "string" : "" + utf8_pat && utf8_target ? " and " : "", + utf8_target ? "string" : "" ); } } @@ -2736,7 +2830,7 @@ S_dump_exec_pos(pTHX_ const char *locinput, const char *loc_regeol, const char *loc_bostr, const char *loc_reg_starttry, - const bool do_utf8) + const bool utf8_target) { const int docolor = *PL_colors[0] || *PL_colors[2] || *PL_colors[4]; const int taill = (docolor ? 10 : 7); /* 3 chars for "> <" */ @@ -2753,20 +2847,20 @@ S_dump_exec_pos(pTHX_ const char *locinput, PERL_ARGS_ASSERT_DUMP_EXEC_POS; - while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput - pref_len))) pref_len++; pref0_len = pref_len - (locinput - loc_reg_starttry); if (l + pref_len < (5 + taill) && l < loc_regeol - locinput) l = ( loc_regeol - locinput > (5 + taill) - pref_len ? (5 + taill) - pref_len : loc_regeol - locinput); - while (do_utf8 && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) + while (utf8_target && UTF8_IS_CONTINUATION(*(U8*)(locinput + l))) l--; if (pref0_len < 0) pref0_len = 0; if (pref0_len > pref_len) pref0_len = pref_len; { - const int is_uni = (do_utf8 && OP(scan) != CANY) ? 1 : 0; + const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0; RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), (locinput - pref_len),pref0_len, 60, 4, 5); @@ -2853,7 +2947,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) dMY_CXT; #endif dVAR; - register const bool do_utf8 = PL_reg_match_utf8; + register const bool utf8_target = PL_reg_match_utf8; const U32 uniflags = UTF8_ALLOW_DEFAULT; REGEXP *rex_sv = reginfo->prog; regexp *rex = (struct regexp *)SvANY(rex_sv); @@ -2942,7 +3036,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) DEBUG_EXECUTE_r( { SV * const prop = sv_newmortal(); regnode *rnext=regnext(scan); - DUMP_EXEC_POS( locinput, scan, do_utf8 ); + DUMP_EXEC_POS( locinput, scan, utf8_target ); regprop(rex, prop, scan); PerlIO_printf(Perl_debug_log, @@ -3020,7 +3114,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case SANY: if (!nextchr && locinput >= PL_regeol) sayNO; - if (do_utf8) { + if (utf8_target) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -3037,7 +3131,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case REG_ANY: if ((!nextchr && locinput >= PL_regeol) || nextchr == '\n') sayNO; - if (do_utf8) { + if (utf8_target) { locinput += PL_utf8skip[nextchr]; if (locinput > PL_regeol) sayNO; @@ -3053,7 +3147,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* In this case the charclass data is available inline so we can fail fast without a lot of extra overhead. */ - if (scan->flags == EXACT || !do_utf8) { + if (scan->flags == EXACT || !utf8_target) { if(!ANYOF_BITMAP_TEST(scan, *locinput)) { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3129,7 +3223,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) "%*s %smatched empty string...%s\n", REPORT_CODE_OFF+depth*2, "", PL_colors[4], PL_colors[5]) ); - break; + if (!trie->jump) + break; } else { DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log, @@ -3164,7 +3259,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) while ( state && uc <= (U8*)PL_regeol ) { U32 base = trie->states[ state ].trans.base; UV uvc = 0; - U16 charid; + U16 charid = 0; U16 wordnum; wordnum = trie->states[ state ].wordnum; @@ -3187,7 +3282,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } DEBUG_TRIE_EXECUTE_r({ - DUMP_EXEC_POS( (char *)uc, scan, do_utf8 ); + DUMP_EXEC_POS( (char *)uc, scan, utf8_target ); PerlIO_printf( Perl_debug_log, "%*s %sState: %4"UVxf" Accepted: %c ", 2+depth * 2, "", PL_colors[4], @@ -3316,7 +3411,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) U8 *uscan; while (chars) { - if (do_utf8) { + if (utf8_target) { uvc = utf8n_to_uvuni((U8*)uc, UTF8_MAXLEN, &len, uniflags); uc += len; @@ -3338,7 +3433,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) } } else { - if (do_utf8) + if (utf8_target) while (chars--) uc += UTF8SKIP(uc); else @@ -3394,12 +3489,12 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case EXACT: { char *s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8 != UTF) { + if (utf8_target != UTF_PATTERN) { /* The target and the pattern have differing utf8ness. */ char *l = locinput; const char * const e = s + ln; - if (do_utf8) { + if (utf8_target) { /* The target is utf8, the pattern is not utf8. */ while (s < e) { STRLEN ulen; @@ -3450,19 +3545,19 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) char * const s = STRING(scan); ln = STR_LEN(scan); - if (do_utf8 || UTF) { + if (utf8_target || UTF_PATTERN) { /* Either target or the pattern are utf8. */ const char * const l = locinput; char *e = PL_regeol; - if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF), - l, &e, 0, do_utf8)) { + if (! foldEQ_utf8(s, 0, ln, cBOOL(UTF_PATTERN), + l, &e, 0, utf8_target)) { /* One more case for the sharp s: * pack("U0U*", 0xDF) =~ /ss/i, * the 0xC3 0x9F are the UTF-8 * byte sequence for the U+00DF. */ - if (!(do_utf8 && + if (!(utf8_target && toLOWER(s[0]) == 's' && ln >= 2 && toLOWER(s[1]) == 's' && @@ -3500,7 +3595,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) case BOUND: case NBOUND: /* was last char in word? */ - if (do_utf8) { + if (utf8_target) { if (locinput == PL_bostr) ln = '\n'; else { @@ -3511,7 +3606,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM_uni(ln); LOAD_UTF8_CHARCLASS_ALNUM(); - n = swash_fetch(PL_utf8_alnum, (U8*)locinput, do_utf8); + n = swash_fetch(PL_utf8_alnum, (U8*)locinput, utf8_target); } else { ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln)); @@ -3521,7 +3616,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) else { ln = (locinput != PL_bostr) ? UCHARAT(locinput - 1) : '\n'; - if (OP(scan) == BOUND || OP(scan) == NBOUND) { + if (FLAGS(scan) & USE_UNI) { + + /* Here, can't be BOUNDL or NBOUNDL because they never set + * the flags to USE_UNI */ + ln = isWORDCHAR_L1(ln); + n = isWORDCHAR_L1(nextchr); + } + else if (OP(scan) == BOUND || OP(scan) == NBOUND) { ln = isALNUM(ln); n = isALNUM(nextchr); } @@ -3535,24 +3637,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sayNO; break; case ANYOF: - if (do_utf8) { + if (utf8_target) { STRLEN inclasslen = PL_regeol - locinput; - - if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, do_utf8)) - goto anyof_fail; if (locinput >= PL_regeol) sayNO; - locinput += inclasslen ? inclasslen : UTF8SKIP(locinput); + + if (!reginclass(rex, scan, (U8*)locinput, &inclasslen, utf8_target)) + goto anyof_fail; + locinput += inclasslen; nextchr = UCHARAT(locinput); break; } else { if (nextchr < 0) nextchr = UCHARAT(locinput); - if (!REGINCLASS(rex, scan, (U8*)locinput)) - goto anyof_fail; if (!nextchr && locinput >= PL_regeol) sayNO; + if (!REGINCLASS(rex, scan, (U8*)locinput)) + goto anyof_fail; nextchr = UCHARAT(++locinput); break; } @@ -3568,11 +3670,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) sayNO; break; /* Special char classes - The defines start on line 129 or so */ - CCC_TRY_AFF( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); - CCC_TRY_NEG(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isALNUM, isALNUM_LC); + CCC_TRY_AFF_U( ALNUM, ALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); + CCC_TRY_NEG_U(NALNUM, NALNUML, perl_word, "a", isALNUM_LC_utf8, isWORDCHAR_L1, isALNUM_LC); - CCC_TRY_AFF( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); - CCC_TRY_NEG(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE, isSPACE_LC); + CCC_TRY_AFF_U( SPACE, SPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); + CCC_TRY_NEG_U(NSPACE, NSPACEL, perl_space, " ", isSPACE_LC_utf8, isSPACE_L1, isSPACE_LC); CCC_TRY_AFF( DIGIT, DIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); CCC_TRY_NEG(NDIGIT, NDIGITL, posix_digit, "0", isDIGIT_LC_utf8, isDIGIT, isDIGIT_LC); @@ -3636,7 +3738,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (locinput >= PL_regeol) sayNO; - if (! do_utf8) { + if (! utf8_target) { /* Match either CR LF or '.', as all the other possibilities * require utf8 */ @@ -3664,7 +3766,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match (prepend)* */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_prepend, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { previous_prepend = locinput; locinput += UTF8SKIP(locinput); @@ -3676,7 +3778,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (previous_prepend && (locinput >= PL_regeol || ! swash_fetch(PL_utf8_X_begin, - (U8*)locinput, do_utf8))) + (U8*)locinput, utf8_target))) { locinput = previous_prepend; } @@ -3686,7 +3788,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * moved locinput forward, we tested the result just above * and it either passed, or we backed off so that it will * now pass */ - if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, do_utf8)) { + if (! swash_fetch(PL_utf8_X_begin, (U8*)locinput, utf8_target)) { /* Here did not match the required 'Begin' in the * second term. So just match the very first @@ -3698,7 +3800,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * an extender. It is either a hangul syllable, or a * non-control */ if (swash_fetch(PL_utf8_X_non_hangul, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { /* Here not a Hangul syllable, must be a @@ -3710,11 +3812,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) * of several individual characters. One * possibility is T+ */ if (swash_fetch(PL_utf8_X_T, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { while (locinput < PL_regeol && swash_fetch(PL_utf8_X_T, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3728,7 +3830,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match L* */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_L, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3741,13 +3843,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) if (locinput < PL_regeol && swash_fetch(PL_utf8_X_LV_LVT_V, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { /* Otherwise keep going. Must be LV, LVT * or V. See if LVT */ if (swash_fetch(PL_utf8_X_LVT, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } else { @@ -3757,7 +3859,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) locinput += UTF8SKIP(locinput); while (locinput < PL_regeol && swash_fetch(PL_utf8_X_V, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3768,7 +3870,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) while (locinput < PL_regeol && swash_fetch(PL_utf8_X_T, (U8*)locinput, - do_utf8)) + utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3779,7 +3881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) /* Match any extender */ while (locinput < PL_regeol && swash_fetch(PL_utf8_X_extend, - (U8*)locinput, do_utf8)) + (U8*)locinput, utf8_target)) { locinput += UTF8SKIP(locinput); } @@ -3824,7 +3926,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) break; s = PL_bostr + ln; - if (do_utf8 && type != REF) { /* REF can do byte comparison */ + if (utf8_target && type != REF) { /* REF can do byte comparison */ char *l = locinput; const char *e = PL_bostr + PL_regoffs[n].end; /* @@ -3927,7 +4029,24 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) COP * const ocurcop = PL_curcop; PAD *old_comppad; char *saved_regeol = PL_regeol; - + struct re_save_state saved_state; + + /* To not corrupt the existing regex state while executing the + * eval we would normally put it on the save stack, like with + * save_re_context. However, re-evals have a weird scoping so we + * can't just add ENTER/LEAVE here. With that, things like + * + * (?{$a=2})(a(?{local$a=$a+1}))*aak*c(?{$b=$a}) + * + * would break, as they expect the localisation to be unwound + * only when the re-engine backtracks through the bit that + * localised it. + * + * What we do instead is just saving the state in a local c + * variable. + */ + Copy(&PL_reg_state, &saved_state, 1, struct re_save_state); + n = ARG(scan); PL_op = (OP_4tree*)rexi->data->data[n]; DEBUG_STATE_r( PerlIO_printf(Perl_debug_log, @@ -3949,6 +4068,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) PUTBACK; } + Copy(&saved_state, &PL_reg_state, 1, struct re_save_state); + PL_op = oop; PAD_RESTORE_LOCAL(old_comppad); PL_curcop = ocurcop; @@ -4037,7 +4158,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, regnode *prog) re->sublen = rex->sublen; rei = RXi_GET(re); DEBUG_EXECUTE_r( - debug_start_match(re_sv, do_utf8, locinput, PL_regeol, + debug_start_match(re_sv, utf8_target, locinput, PL_regeol, "Matching embedded"); ); startpoint = rei->program + 1; @@ -4880,14 +5001,14 @@ NULL if this changes back then the macro for IS_TEXT and friends need to change. */ - if (!UTF) { + if (!UTF_PATTERN) { ST.c2 = ST.c1 = *s; if (IS_TEXTF(text_node)) ST.c2 = PL_fold[ST.c1]; else if (IS_TEXTFL(text_node)) ST.c2 = PL_fold_locale[ST.c1]; } - else { /* UTF */ + else { /* UTF_PATTERN */ if (IS_TEXTF(text_node)) { STRLEN ulen1, ulen2; U8 tmpbuf1[UTF8_MAXBYTES_CASE+1]; @@ -4939,11 +5060,11 @@ NULL * string that could possibly match */ if (ST.max == REG_INFTY) { ST.maxpos = PL_regeol - 1; - if (do_utf8) + if (utf8_target) while (UTF8_IS_CONTINUATION(*(U8*)ST.maxpos)) ST.maxpos--; } - else if (do_utf8) { + else if (utf8_target) { int m = ST.max - ST.min; for (ST.maxpos = locinput; m >0 && ST.maxpos + UTF8SKIP(ST.maxpos) <= PL_regeol; m--) @@ -4989,7 +5110,7 @@ NULL REGCP_UNWIND(ST.cp); /* Couldn't or didn't -- move forward. */ ST.oldloc = locinput; - if (do_utf8) + if (utf8_target) locinput += UTF8SKIP(locinput); else locinput++; @@ -4998,7 +5119,7 @@ NULL /* find the next place where 'B' could work, then call B */ { int n; - if (do_utf8) { + if (utf8_target) { n = (ST.oldloc == locinput) ? 0 : 1; if (ST.c1 == ST.c2) { STRLEN len; @@ -5094,7 +5215,7 @@ NULL { UV c = 0; if (ST.c1 != CHRTEST_VOID) - c = do_utf8 ? utf8n_to_uvchr((U8*)PL_reginput, + c = utf8_target ? utf8n_to_uvchr((U8*)PL_reginput, UTF8_MAXBYTES, 0, uniflags) : (UV) UCHARAT(PL_reginput); /* If it could work, try it. */ @@ -5352,9 +5473,9 @@ NULL #undef ST case FOLDCHAR: n = ARG(scan); - if ( n == (U32)what_len_TRICKYFOLD(locinput,do_utf8,ln) ) { + if ( n == (U32)what_len_TRICKYFOLD(locinput,utf8_target,ln) ) { locinput += ln; - } else if ( 0xDF == n && !do_utf8 && !UTF ) { + } else if ( 0xDF == n && !utf8_target && !UTF_PATTERN ) { sayNO; } else { U8 folded[UTF8_MAXBYTES_CASE+1]; @@ -5364,7 +5485,7 @@ NULL to_uni_fold(n, folded, &foldlen); if (! foldEQ_utf8((const char*) folded, 0, foldlen, 1, - l, &e, 0, do_utf8)) { + l, &e, 0, utf8_target)) { sayNO; } locinput = e; @@ -5372,7 +5493,7 @@ NULL nextchr = UCHARAT(locinput); break; case LNBREAK: - if ((n=is_LNBREAK(locinput,do_utf8))) { + if ((n=is_LNBREAK(locinput,utf8_target))) { locinput += n; nextchr = UCHARAT(locinput); } else @@ -5381,14 +5502,14 @@ NULL #define CASE_CLASS(nAmE) \ case nAmE: \ - if ((n=is_##nAmE(locinput,do_utf8))) { \ + if ((n=is_##nAmE(locinput,utf8_target))) { \ locinput += n; \ nextchr = UCHARAT(locinput); \ } else \ sayNO; \ break; \ case N##nAmE: \ - if ((n=is_##nAmE(locinput,do_utf8))) { \ + if ((n=is_##nAmE(locinput,utf8_target))) { \ sayNO; \ } else { \ locinput += UTF8SKIP(locinput); \ @@ -5601,7 +5722,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) register I32 c; register char *loceol = PL_regeol; register I32 hardcount = 0; - register bool do_utf8 = PL_reg_match_utf8; + register bool utf8_target = PL_reg_match_utf8; #ifndef DEBUGGING PERL_UNUSED_ARG(depth); #endif @@ -5615,7 +5736,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) loceol = scan + max; switch (OP(p)) { case REG_ANY: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (scan < loceol && hardcount < max && *scan != '\n') { scan += UTF8SKIP(scan); @@ -5627,7 +5748,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case SANY: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (scan < loceol && hardcount < max) { scan += UTF8SKIP(scan); @@ -5640,29 +5761,112 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) case CANY: scan = loceol; break; - case EXACT: /* length of string is 1 */ - c = (U8)*STRING(p); - while (scan < loceol && UCHARAT(scan) == c) - scan++; - break; - case EXACTF: /* length of string is 1 */ - c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) - scan++; - break; - case EXACTFL: /* length of string is 1 */ + case EXACTFL: PL_reg_flags |= RF_tainted; + /* FALL THROUGH */ + case EXACT: + case EXACTF: + /* To get here, EXACTish nodes must have *byte* length == 1. That means + * they match only characters in the string that can be expressed as a + * single byte. For non-utf8 strings, that means a simple match. For + * utf8 strings, the character matched must be an invariant, or + * downgradable to a single byte. The pattern's utf8ness is + * irrelevant, as it must be a single byte, so either it isn't utf8, or + * if it is it's an invariant */ + c = (U8)*STRING(p); - while (scan < loceol && - (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) - scan++; + assert(! UTF_PATTERN || UNI_IS_INVARIANT(c)); + + if ((! utf8_target) || UNI_IS_INVARIANT(c)) { + + /* Here, the string isn't utf8, or the character in the EXACT + * node is the same in utf8 as not, so can just do equality. + * Each matching char must be 1 byte long */ + switch (OP(p)) { + case EXACT: + while (scan < loceol && UCHARAT(scan) == c) { + scan++; + } + break; + case EXACTF: + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold[c])) + { + scan++; + } + break; + case EXACTFL: + while (scan < loceol && + (UCHARAT(scan) == c || UCHARAT(scan) == PL_fold_locale[c])) + { + scan++; + } + break; + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + } + } + else { + + /* Here, the string is utf8, and the pattern char is different + * in utf8 than not. */ + + switch (OP(p)) { + case EXACT: + { + /* Fastest to find the two utf8 bytes that represent c, and + * then look for those in sequence in the utf8 string */ + U8 high = UTF8_TWO_BYTE_HI(c); + U8 low = UTF8_TWO_BYTE_LO(c); + loceol = PL_regeol; + + while (hardcount < max + && scan + 1 < loceol + && UCHARAT(scan) == high + && UCHARAT(scan + 1) == low) + { + scan += 2; + hardcount++; + } + } + break; + case EXACTFL: /* Doesn't really make sense, but is best we can + do. The documents warn against mixing locale + and utf8 */ + case EXACTF: + { /* utf8 string, so use utf8 foldEQ */ + char *tmpeol = loceol; + while (hardcount < max + && foldEQ_utf8(scan, &tmpeol, 0, utf8_target, + STRING(p), NULL, 1, UTF_PATTERN)) + { + scan = tmpeol; + tmpeol = loceol; + hardcount++; + } + + /* XXX Note that the above handles properly the German + * sharp ss in the pattern matching ss in the string. But + * it doesn't handle properly cases where the string + * contains say 'LIGATURE ff' and the pattern is 'f+'. + * This would require, say, a new function or revised + * interface to foldEQ_utf8(), in which the maximum number + * of characters to match could be passed and it would + * return how many actually did. This is just one of many + * cases where multi-char folds don't work properly, and so + * the fix is being deferred */ + } + break; + default: + Perl_croak(aTHX_ "panic: Unexpected op %u", OP(p)); + } + } break; case ANYOF: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && - reginclass(prog, p, (U8*)scan, 0, do_utf8)) { + reginclass(prog, p, (U8*)scan, 0, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5672,22 +5876,28 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case ALNUM: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { + swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isALNUM(*scan)) - scan++; + while (scan < loceol && isALNUM((U8) *scan)) { + scan++; + } } break; case ALNUML: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && isALNUM_LC_utf8((U8*)scan)) { @@ -5700,22 +5910,28 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NALNUM: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_ALNUM(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_alnum, (U8*)scan, do_utf8)) { + !swash_fetch(PL_utf8_alnum, (U8*)scan, utf8_target)) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isWORDCHAR_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isALNUM(*scan)) - scan++; + while (scan < loceol && ! isALNUM((U8) *scan)) { + scan++; + } } break; case NALNUML: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !isALNUM_LC_utf8((U8*)scan)) { @@ -5728,23 +5944,28 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case SPACE: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && (*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && isSPACE(*scan)) - scan++; + while (scan < loceol && isSPACE((U8) *scan)) + scan++; } break; case SPACEL: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { @@ -5757,23 +5978,29 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NSPACE: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_SPACE(); while (hardcount < max && scan < loceol && !(*scan == ' ' || - swash_fetch(PL_utf8_space,(U8*)scan, do_utf8))) { + swash_fetch(PL_utf8_space,(U8*)scan, utf8_target))) + { scan += UTF8SKIP(scan); hardcount++; } + } else if (FLAGS(p) & USE_UNI) { + while (scan < loceol && ! isSPACE_L1((U8) *scan)) { + scan++; + } } else { - while (scan < loceol && !isSPACE(*scan)) - scan++; + while (scan < loceol && ! isSPACE((U8) *scan)) { + scan++; + } } break; case NSPACEL: PL_reg_flags |= RF_tainted; - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !(*scan == ' ' || isSPACE_LC_utf8((U8*)scan))) { @@ -5786,11 +6013,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case DIGIT: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && - swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { + swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5800,11 +6027,11 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NDIGIT: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; LOAD_UTF8_CHARCLASS_DIGIT(); while (hardcount < max && scan < loceol && - !swash_fetch(PL_utf8_digit, (U8*)scan, do_utf8)) { + !swash_fetch(PL_utf8_digit, (U8*)scan, utf8_target)) { scan += UTF8SKIP(scan); hardcount++; } @@ -5813,7 +6040,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) scan++; } case LNBREAK: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_LNBREAK_utf8(scan))) { scan += c; @@ -5832,7 +6059,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case HORIZWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_HORIZWS_utf8(scan))) { scan += c; @@ -5844,7 +6071,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NHORIZWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_HORIZWS_utf8(scan)) { scan += UTF8SKIP(scan); @@ -5857,7 +6084,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case VERTWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && (c=is_VERTWS_utf8(scan))) { scan += c; @@ -5870,7 +6097,7 @@ S_regrepeat(pTHX_ const regexp *prog, const regnode *p, I32 max, int depth) } break; case NVERTWS: - if (do_utf8) { + if (utf8_target) { loceol = PL_regeol; while (hardcount < max && scan < loceol && !is_VERTWS_utf8(scan)) { scan += UTF8SKIP(scan); @@ -5964,91 +6191,60 @@ Perl_regclass_swash(pTHX_ const regexp *prog, register const regnode* node, bool /* - reginclass - determine if a character falls into a character class - The n is the ANYOF regnode, the p is the target string, lenp - is pointer to the maximum length of how far to go in the p - (if the lenp is zero, UTF8SKIP(p) is used), - do_utf8 tells whether the target string is in UTF-8. + n is the ANYOF regnode + p is the target string + lenp is pointer to the maximum number of bytes of how far to go in p + (This is assumed wthout checking to always be at least the current + character's size) + utf8_target tells whether p is in UTF-8. + + Returns true if matched; false otherwise. If lenp is not NULL, on return + from a successful match, the value it points to will be updated to how many + bytes in p were matched. If there was no match, the value is undefined, + possibly changed from the input. */ STATIC bool -S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const U8* p, STRLEN* lenp, register bool do_utf8) +S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n, register const U8* const p, STRLEN* lenp, register const bool utf8_target) { dVAR; const char flags = ANYOF_FLAGS(n); bool match = FALSE; UV c = *p; - STRLEN len = 0; - STRLEN plen; + STRLEN c_len = 0; + STRLEN maxlen; PERL_ARGS_ASSERT_REGINCLASS; - if (do_utf8 && !UTF8_IS_INVARIANT(c)) { - c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &len, + /* If c is not already the code point, get it */ + if (utf8_target && !UTF8_IS_INVARIANT(c)) { + c = utf8n_to_uvchr(p, UTF8_MAXBYTES, &c_len, (UTF8_ALLOW_DEFAULT & UTF8_ALLOW_ANYUV) | UTF8_ALLOW_FFFF | UTF8_CHECK_ONLY); /* see [perl #37836] for UTF8_ALLOW_ANYUV; [perl #38293] for * UTF8_ALLOW_FFFF */ - if (len == (STRLEN)-1) + if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); } + else { + c_len = 1; + } - plen = lenp ? *lenp : UNISKIP(NATIVE_TO_UNI(c)); - if (do_utf8 || (flags & ANYOF_UNICODE)) { - if (lenp) - *lenp = 0; - if (do_utf8 && !ANYOF_RUNTIME(n)) { - if (len != (STRLEN)-1 && c < 256 && ANYOF_BITMAP_TEST(n, c)) - match = TRUE; - } - if (!match && do_utf8 && (flags & ANYOF_UNICODE_ALL) && c >= 256) - match = TRUE; - if (!match) { - AV *av; - SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); - - if (sw) { - U8 * utf8_p; - if (do_utf8) { - utf8_p = (U8 *) p; - } else { - STRLEN len = 1; - utf8_p = bytes_to_utf8(p, &len); - } - if (swash_fetch(sw, utf8_p, 1)) - match = TRUE; - else if (flags & ANYOF_FOLD) { - if (!match && lenp && av) { - I32 i; - for (i = 0; i <= av_len(av); i++) { - SV* const sv = *av_fetch(av, i, FALSE); - STRLEN len; - const char * const s = SvPV_const(sv, len); - if (len <= plen && memEQ(s, (char*)utf8_p, len)) { - *lenp = len; - match = TRUE; - break; - } - } - } - if (!match) { - U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - - STRLEN tmplen; - to_utf8_fold(utf8_p, tmpbuf, &tmplen); - if (swash_fetch(sw, tmpbuf, 1)) - match = TRUE; - } - } + /* Use passed in max length, or one character if none passed in or less + * than one character. And assume will match just one character. This is + * overwritten later if matched more. */ + if (lenp) { + maxlen = (*lenp > c_len) ? *lenp : c_len; + *lenp = c_len; - /* If we allocated a string above, free it */ - if (! do_utf8) Safefree(utf8_p); - } - } - if (match && lenp && *lenp == 0) - *lenp = UNISKIP(NATIVE_TO_UNI(c)); } - if (!match && c < 256) { + else { + maxlen = c_len; + } + + /* If this character is potentially in the bitmap, check it */ + if (c < 256) { if (ANYOF_BITMAP_TEST(n, c)) match = TRUE; else if (flags & ANYOF_FOLD) { @@ -6104,6 +6300,103 @@ S_reginclass(pTHX_ const regexp *prog, register const regnode *n, register const } } + /* If the bitmap didn't (or couldn't) match, and something outside the + * bitmap could match, try that */ + if (!match && (utf8_target || (flags & ANYOF_UNICODE))) { + if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) { + match = TRUE; + } + else { + AV *av; + SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av); + + if (sw) { + U8 * utf8_p; + if (utf8_target) { + utf8_p = (U8 *) p; + } else { + STRLEN len = 1; + utf8_p = bytes_to_utf8(p, &len); + } + if (swash_fetch(sw, utf8_p, 1)) + match = TRUE; + else if (flags & ANYOF_FOLD) { + if (!match && lenp && av) { + I32 i; + for (i = 0; i <= av_len(av); i++) { + SV* const sv = *av_fetch(av, i, FALSE); + STRLEN len; + const char * const s = SvPV_const(sv, len); + if (len <= maxlen && memEQ(s, (char*)utf8_p, len)) { + *lenp = len; + match = TRUE; + break; + } + } + } + if (!match) { + U8 folded[UTF8_MAXBYTES_CASE+1]; + + /* See if the folded version matches */ + STRLEN foldlen; + to_utf8_fold(utf8_p, folded, &foldlen); + if (swash_fetch(sw, folded, 1)) { /* 1 => is utf8 */ + match = TRUE; + } + else { + SV** listp; + + /* Consider "k" =~ /[K]/i. The line above would + * have just folded the 'k' to itself, and that + * isn't going to match 'K'. So we look through + * the closure of everything that folds to 'k'. + * That will find the 'K'. Initialize the list, if + * necessary */ + if (! PL_utf8_foldclosures) { + + /* If the folds haven't been read in, call a fold + * function to force that */ + if (! PL_utf8_tofold) { + U8 dummy[UTF8_MAXBYTES+1]; + STRLEN dummy_len; + to_utf8_fold((U8*) "A", dummy, &dummy_len); + } + PL_utf8_foldclosures = + _swash_inversion_hash(PL_utf8_tofold); + } + + /* The data structure is a hash with the keys every + * character that is folded to, like 'k', and the + * values each an array of everything that folds to + * its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */ + if ((listp = hv_fetch(PL_utf8_foldclosures, + (char *) folded, foldlen, FALSE))) + { + AV* list = (AV*) *listp; + IV i; + for (i = 0; i <= av_len(list); i++) { + SV** try_p = av_fetch(list, i, FALSE); + if (try_p == NULL) { + Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure"); + } + /* Don't have to worry about embeded nulls + * since NULL isn't folded or foldable */ + if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) { + match = TRUE; + break; + } + } + } + } + } + } + + /* If we allocated a string above, free it */ + if (! utf8_target) Safefree(utf8_p); + } + } + } + return (flags & ANYOF_INVERT) ? !match : match; }