X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/24e16d7b405f10168aae144d4a2c37d9c6443b9e..b41db090e21841b21ad0a9529cf6f76acc688dea:/regexec.c diff --git a/regexec.c b/regexec.c index 340a49e..0892554 100644 --- a/regexec.c +++ b/regexec.c @@ -96,6 +96,12 @@ static const char* const non_utf8_target_but_utf8_required = "Can't match, because target string needs to be in UTF-8\n"; #endif +/* Returns a boolean as to whether the input unsigned number is a power of 2 + * (2**0, 2**1, etc). In other words if it has just a single bit set. + * If not, subtracting 1 would leave the uppermost bit set, so the & would + * yield non-zero */ +#define isPOWER_OF_2(n) ((n & (n-1)) == 0) + #define NON_UTF8_TARGET_BUT_UTF8_REQUIRED(target) STMT_START { \ DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ "%s", non_utf8_target_but_utf8_required));\ goto target; \ @@ -119,7 +125,6 @@ static const char* const non_utf8_target_but_utf8_required */ #define CHR_SVLEN(sv) (utf8_target ? sv_len_utf8(sv) : SvCUR(sv)) -#define CHR_DIST(a,b) (reginfo->is_utf8_target ? utf8_distance(a,b) : a - b) #define HOPc(pos,off) \ (char *)(reginfo->is_utf8_target \ @@ -127,13 +132,16 @@ static const char* const non_utf8_target_but_utf8_required (U8*)(off >= 0 ? reginfo->strend : reginfo->strbeg)) \ : (U8*)(pos + off)) -#define HOPBACKc(pos, off) \ - (char*)(reginfo->is_utf8_target \ - ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(reginfo->strbeg)) \ - : (pos - off >= reginfo->strbeg) \ - ? (U8*)pos - off \ +/* like HOPMAYBE3 but backwards. lim must be +ve. Returns NULL on overshoot */ +#define HOPBACK3(pos, off, lim) \ + (reginfo->is_utf8_target \ + ? reghopmaybe3((U8*)pos, (SSize_t)0-off, (U8*)(lim)) \ + : (pos - off >= lim) \ + ? (U8*)pos - off \ : NULL) +#define HOPBACKc(pos, off) ((char*)HOPBACK3(pos, off, reginfo->strbeg)) + #define HOP3(pos,off,lim) (reginfo->is_utf8_target ? reghop3((U8*)(pos), off, (U8*)(lim)) : (U8*)(pos + off)) #define HOP3c(pos,off,lim) ((char*)HOP3(pos,off,lim)) @@ -150,6 +158,7 @@ static const char* const non_utf8_target_but_utf8_required #define HOP3lim(pos,off,lim) (reginfo->is_utf8_target \ ? reghop3((U8*)(pos), off, (U8*)(lim)) \ : (U8*)((pos + off) > lim ? lim : (pos + off))) +#define HOP3clim(pos,off,lim) ((char*)HOP3lim(pos,off,lim)) #define HOP4(pos,off,llim, rlim) (reginfo->is_utf8_target \ ? reghop4((U8*)(pos), off, (U8*)(llim), (U8*)(rlim)) \ @@ -445,6 +454,8 @@ S_regcp_restore(pTHX_ regexp *rex, I32 ix, U32 *maxopenparen_p _pDEPTH) #define regcpblow(cp) LEAVE_SCOPE(cp) /* Ignores regcppush()ed data. */ +#ifndef PERL_IN_XSUB_RE + bool Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) { @@ -486,6 +497,8 @@ Perl_isFOO_lc(pTHX_ const U8 classnum, const U8 character) return FALSE; } +#endif + STATIC bool S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) { @@ -539,6 +552,116 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) return FALSE; /* Things like CNTRL are always below 256 */ } +STATIC char * +S_find_next_ascii(char * s, const char * send, const bool utf8_target) +{ + /* Returns the position of the first ASCII byte in the sequence between 's' + * and 'send-1' inclusive; returns 'send' if none found */ + + PERL_ARGS_ASSERT_FIND_NEXT_ASCII; + +#ifndef EBCDIC + + if ((STRLEN) (send - s) >= PERL_WORDSIZE + + /* This term is wordsize if subword; 0 if not */ + + PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(s) + + /* 'offset' */ + - (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK)) + { + + /* Process per-byte until reach word boundary. XXX This loop could be + * eliminated if we knew that this platform had fast unaligned reads */ + while (PTR2nat(s) & PERL_WORD_BOUNDARY_MASK) { + if (isASCII(*s)) { + return s; + } + s++; /* khw didn't bother creating a separate loop for + utf8_target */ + } + + /* Here, we know we have at least one full word to process. Process + * per-word as long as we have at least a full word left */ + do { + if ((* (PERL_UINTMAX_T *) s) & ~ PERL_VARIANTS_WORD_MASK) { + break; + } + s += PERL_WORDSIZE; + } while (s + PERL_WORDSIZE <= send); + } + +#endif + + /* Process per-character */ + if (utf8_target) { + while (s < send) { + if (isASCII(*s)) { + return s; + } + s += UTF8SKIP(s); + } + } + else { + while (s < send) { + if (isASCII(*s)) { + return s; + } + s++; + } + } + + return s; +} + +STATIC char * +S_find_next_non_ascii(char * s, const char * send, const bool utf8_target) +{ + /* Returns the position of the first non-ASCII byte in the sequence between + * 's' and 'send-1' inclusive; returns 'send' if none found */ + +#ifdef EBCDIC + + PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII; + + if (utf8_target) { + while (s < send) { + if ( ! isASCII(*s)) { + return s; + } + s += UTF8SKIP(s); + } + } + else { + while (s < send) { + if ( ! isASCII(*s)) { + return s; + } + s++; + } + } + + return s; + +#else + + const U8 * next_non_ascii = NULL; + + PERL_ARGS_ASSERT_FIND_NEXT_NON_ASCII; + PERL_UNUSED_ARG(utf8_target); + + /* On ASCII platforms invariants and ASCII are identical, so if the string + * is entirely invariants, there is no non-ASCII character */ + return (is_utf8_invariant_string_loc((U8 *) s, + (STRLEN) (send - s), + &next_non_ascii)) + ? (char *) send + : (char *) next_non_ascii; + +#endif + +} + /* * pregexec and friends */ @@ -700,7 +823,7 @@ Perl_re_intuit_start(pTHX_ goto fail; } - RX_MATCH_UTF8_set(rx,utf8_target); + RXp_MATCH_UTF8_set(prog, utf8_target); reginfo->is_utf8_target = cBOOL(utf8_target); reginfo->info_aux = NULL; reginfo->strbeg = strbeg; @@ -834,7 +957,7 @@ Perl_re_intuit_start(pTHX_ #ifdef DEBUGGING /* 7/99: reports of failure (with the older version) */ if (end_shift < 0) Perl_croak(aTHX_ "panic: end_shift: %" IVdf " pattern:\n%s\n ", - (IV)end_shift, RX_PRECOMP(prog)); + (IV)end_shift, RX_PRECOMP(rx)); #endif restart: @@ -880,7 +1003,9 @@ Perl_re_intuit_start(pTHX_ (IV)prog->check_end_shift); }); - end_point = HOP3(strend, -end_shift, strbeg); + end_point = HOPBACK3(strend, end_shift, rx_origin); + if (!end_point) + goto fail_finish; start_point = HOPMAYBE3(rx_origin, start_shift, end_point); if (!start_point) goto fail_finish; @@ -898,19 +1023,28 @@ Perl_re_intuit_start(pTHX_ && prog->intflags & PREGf_ANCH && prog->check_offset_max != SSize_t_MAX) { - SSize_t len = SvCUR(check) - !!SvTAIL(check); + SSize_t check_len = SvCUR(check) - !!SvTAIL(check); const char * const anchor = (prog->intflags & PREGf_ANCH_GPOS ? strpos : strbeg); + SSize_t targ_len = (char*)end_point - anchor; + + if (check_len > targ_len) { + DEBUG_EXECUTE_r(Perl_re_printf( aTHX_ + "Anchored string too short...\n")); + goto fail_finish; + } /* do a bytes rather than chars comparison. It's conservative; * so it skips doing the HOP if the result can't possibly end * up earlier than the old value of end_point. */ - if ((char*)end_point - anchor > prog->check_offset_max) { + assert(anchor + check_len <= (char *)end_point); + if (prog->check_offset_max + check_len < targ_len) { end_point = HOP3lim((U8*)anchor, prog->check_offset_max, - end_point -len) - + len; + end_point - check_len + ) + + check_len; } } @@ -1288,10 +1422,10 @@ Perl_re_intuit_start(pTHX_ */ if (prog->anchored_substr || prog->anchored_utf8 || ml_anch) - endpos= HOP3c(rx_origin, (prog->minlen ? cl_l : 0), strend); + endpos = HOP3clim(rx_origin, (prog->minlen ? cl_l : 0), strend); else if (prog->float_substr || prog->float_utf8) { rx_max_float = HOP3c(check_at, -start_shift, strbeg); - endpos= HOP3c(rx_max_float, cl_l, strend); + endpos = HOP3clim(rx_max_float, cl_l, strend); } else endpos= strend; @@ -1500,8 +1634,9 @@ STMT_START { uscan += len; \ len=0; \ } else { \ - uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ + uvc = _toFOLD_utf8_flags( (const U8*) uc, uc + len, foldbuf, &foldlen, \ + flags); \ skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ @@ -1969,10 +2104,8 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, * trying that it will fail; so don't start a match past the * required minimum number from the far end */ e = HOP3c(strend, -((SSize_t)ln), s); - - if (reginfo->intuit && e < s) { - e = s; /* Due to minlen logic of intuit() */ - } + if (e < s) + break; c1 = *pat_string; c2 = fold_array[c1]; @@ -2016,10 +2149,6 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, */ e = HOP3c(strend, -((SSize_t)lnc), s); - if (reginfo->intuit && e < s) { - e = s; /* Due to minlen logic of intuit() */ - } - /* XXX Note that we could recalculate e to stop the loop earlier, * as the worst case expansion above will rarely be met, and as we * go along we would usually find that e moves further to the left. @@ -2360,6 +2489,22 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, ); break; + case ASCII: + s = find_next_ascii(s, strend, utf8_target); + if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) { + goto got_it; + } + + break; + + case NASCII: + s = find_next_non_ascii(s, strend, utf8_target); + if (s < strend && (reginfo->intuit || regtry(reginfo, &s))) { + goto got_it; + } + + break; + /* The argument to all the POSIX node types is the class number to pass to * _generic_isCC() to build a mask for searching in PL_charclass[] */ @@ -2393,12 +2538,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, } to_complement = 1; - /* FALLTHROUGH */ + goto posixa; case POSIXA: - posixa: /* Don't need to worry about utf8, as it can match only a single - * byte invariant character. */ + * byte invariant character. But we do anyway for performance reasons, + * as otherwise we would have to examine all the continuation + * characters */ + if (utf8_target) { + REXEC_FBC_UTF8_CLASS_SCAN(_generic_isCC_A(*s, FLAGS(c))); + break; + } + + posixa: REXEC_FBC_CLASS_SCAN( to_complement ^ cBOOL(_generic_isCC_A(*s, FLAGS(c)))); break; @@ -2429,7 +2581,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, if ((UTF8_IS_INVARIANT(*s) && to_complement ^ cBOOL(_generic_isCC((U8) *s, classnum))) - || (UTF8_IS_DOWNGRADEABLE_START(*s) + || ( UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, strend) && to_complement ^ cBOOL( _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)), @@ -2767,7 +2919,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, } else { /* create new COW SV to share string */ - RX_MATCH_COPY_FREE(rx); + RXp_MATCH_COPY_FREE(prog); prog->saved_copy = sv_setsv_cow(prog->saved_copy, sv); } prog->subbeg = (char *)SvPVX_const(prog->saved_copy); @@ -2830,7 +2982,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, assert(min >= 0 && min <= max && min <= strend - strbeg); sublen = max - min; - if (RX_MATCH_COPIED(rx)) { + if (RXp_MATCH_COPIED(prog)) { if (sublen > prog->sublen) prog->subbeg = (char*)saferealloc(prog->subbeg, sublen+1); @@ -2841,7 +2993,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, prog->subbeg[sublen] = '\0'; prog->suboffset = min; prog->sublen = sublen; - RX_MATCH_COPIED_on(rx); + RXp_MATCH_COPIED_on(prog); } prog->subcoffset = prog->suboffset; if (prog->suboffset && utf8_target) { @@ -2868,7 +3020,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, } } else { - RX_MATCH_COPY_FREE(rx); + RXp_MATCH_COPY_FREE(prog); prog->subbeg = strbeg; prog->suboffset = 0; prog->subcoffset = 0; @@ -3025,7 +3177,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* match via INTUIT shouldn't have any captures. * Let @-, @+, $^N know */ prog->lastparen = prog->lastcloseparen = 0; - RX_MATCH_UTF8_set(rx, utf8_target); + RXp_MATCH_UTF8_set(prog, utf8_target); prog->offs[0].start = s - strbeg; prog->offs[0].end = utf8_target ? (char*)utf8_hop((U8*)s, prog->minlenret) - strbeg @@ -3052,8 +3204,8 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, Perl_croak(aTHX_ "corrupted regexp program"); } - RX_MATCH_TAINTED_off(rx); - RX_MATCH_UTF8_set(rx, utf8_target); + RXp_MATCH_TAINTED_off(prog); + RXp_MATCH_UTF8_set(prog, utf8_target); reginfo->prog = rx; /* Yes, sorry that this is confusing. */ reginfo->intuit = 0; @@ -3375,7 +3527,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, regprop(prog, prop, c, reginfo, NULL); { RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1), - s,strend-s,60); + s,strend-s,PL_dump_re_max_len); Perl_re_printf( aTHX_ "Matching stclass %.*s against %s (%d bytes)\n", (int)SvCUR(prop), SvPVX_const(prop), @@ -3895,10 +4047,10 @@ S_debug_start_match(pTHX_ const REGEXP *prog, const bool utf8_target, reginitcolors(); { RE_PV_QUOTED_DECL(s0, utf8_pat, PERL_DEBUG_PAD_ZERO(0), - RX_PRECOMP_const(prog), RX_PRELEN(prog), 60); + RX_PRECOMP_const(prog), RX_PRELEN(prog), PL_dump_re_max_len); RE_PV_QUOTED_DECL(s1, utf8_target, PERL_DEBUG_PAD_ZERO(1), - start, end - start, 60); + start, end - start, PL_dump_re_max_len); Perl_re_printf( aTHX_ "%s%s REx%s %s against %s\n", @@ -3954,11 +4106,11 @@ S_dump_exec_pos(pTHX_ const char *locinput, const int is_uni = utf8_target ? 1 : 0; RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0), - (locinput - pref_len),pref0_len, 60, 4, 5); + (locinput - pref_len),pref0_len, PL_dump_re_max_len, 4, 5); RE_PV_COLOR_DECL(s1,len1,is_uni,PERL_DEBUG_PAD(1), (locinput - pref_len + pref0_len), - pref_len - pref0_len, 60, 2, 3); + pref_len - pref0_len, PL_dump_re_max_len, 2, 3); RE_PV_COLOR_DECL(s2,len2,is_uni,PERL_DEBUG_PAD(2), locinput, loc_regeol - locinput, 10, 0, 1); @@ -4133,10 +4285,11 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } else { STRLEN len; - _to_utf8_fold_flags(s, - d, - &len, - FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); + _toFOLD_utf8_flags(s, + pat_end, + d, + &len, + FOLD_FLAGS_FULL | FOLD_FLAGS_LOCALE); d += len; s += UTF8SKIP(s); } @@ -4181,7 +4334,7 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, } else { /* Does participate in folds */ AV* list = (AV*) *listp; - if (av_tindex_nomg(list) != 1) { + if (av_tindex_skip_len_mg(list) != 1) { /* If there aren't exactly two folds to this, it is * outside the scope of this function */ @@ -4340,7 +4493,7 @@ S_isGCB(pTHX_ const GCB_enum before, const GCB_enum after, const U8 * const strb /* Do not break within emoji flag sequences. That is, do not * break between regional indicator (RI) symbols if there is an * odd number of RI characters before the break point. - * GB12 ^ (RI RI)* RI × RI + * GB12 sot (RI RI)* RI × RI * GB13 [^RI] (RI RI)* RI × RI */ while (backup_one_GCB(strbeg, @@ -4632,7 +4785,7 @@ S_isLB(pTHX_ LB_enum before, * only if there are an even number of regional indicators * preceding the position of the break. * - * sot (RI RI)* RI × RI + * sot (RI RI)* RI × RI * [^RI] (RI RI)* RI × RI */ while (backup_one_LB(strbeg, @@ -5162,7 +5315,7 @@ S_isWB(pTHX_ WB_enum previous, * odd number of RI characters before the potential break * point. * - * WB15 ^ (RI RI)* RI × RI + * WB15 sot (RI RI)* RI × RI * WB16 [^RI] (RI RI)* RI × RI */ while (backup_one_WB(&previous, @@ -5343,6 +5496,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) regnode *next; U32 n = 0; /* general value; init to avoid compiler warning */ SSize_t ln = 0; /* len or last; init to avoid compiler warning */ + SSize_t endref = 0; /* offset of end of backref when ln is start */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ @@ -5369,7 +5523,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SV *sv_yes_mark = NULL; /* last mark name we have seen during a successful match */ U32 lastopen = 0; /* last open we saw */ - bool has_cutgroup = RX_HAS_CUTGROUP(rex) ? 1 : 0; + bool has_cutgroup = RXp_HAS_CUTGROUP(rex) ? 1 : 0; SV* const oreplsv = GvSVn(PL_replgv); /* these three flags are set by various ops to signal information to * the very next op. They have a useful lifetime of exactly one loop @@ -5390,12 +5544,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 gimme = G_SCALAR; CV *caller_cv = NULL; /* who called us */ CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ - CHECKPOINT runops_cp; /* savestack position before executing EVAL */ U32 maxopenparen = 0; /* max '(' index seen so far */ int to_complement; /* Invert the result? */ _char_class_number classnum; bool is_utf8_pat = reginfo->is_utf8_pat; bool match = FALSE; + I32 orig_savestack_ix = PL_savestack_ix; + U8 * script_run_begin = NULL; /* Solaris Studio 12.3 messes up fetching PL_charclass['\n'] */ #if (defined(__SUNPRO_C) && (__SUNPRO_C == 0x5120) && defined(__x86_64) && defined(USE_64_BIT_ALL)) @@ -5429,8 +5584,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) })); while (scan != NULL) { - - next = scan + NEXT_OFF(scan); if (next == scan) next = NULL; @@ -5592,6 +5745,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (scan->flags == EXACTL || scan->flags == EXACTFLU8) { _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (utf8_target + && nextchr >= 0 /* guard against negative EOS value in nextchr */ && UTF8_IS_ABOVE_LATIN1(nextchr) && scan->flags == EXACTL) { @@ -5735,6 +5889,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { U8 *uc; if ( ST.jump ) { + /* undo any captures done in the tail part of a branch, + * e.g. + * /(?:X(.)(.)|Y(.)).../ + * where the trie just matches X then calls out to do the + * rest of the branch */ REGCP_UNWIND(ST.cp); UNWIND_PAREN(ST.lastparen, ST.lastcloseparen); } @@ -6349,6 +6508,22 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case ASCII: + if (NEXTCHR_IS_EOS || ! isASCII(UCHARAT(locinput))) { + sayNO; + } + + locinput++; /* ASCII is always single byte */ + break; + + case NASCII: + if (NEXTCHR_IS_EOS || isASCII(UCHARAT(locinput))) { + sayNO; + } + + goto increment_locinput; + break; + /* The argument (FLAGS) to all the POSIX node types is the class number * */ @@ -6373,8 +6548,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; } - if (! UTF8_IS_DOWNGRADEABLE_START(nextchr)) { /* An above Latin-1 code point */ - _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend); + if (! UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { + /* An above Latin-1 code point, or malformed */ + _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, + reginfo->strend); goto utf8_posix_above_latin1; } @@ -6458,7 +6635,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } locinput++; } - else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { + else if (UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(locinput, reginfo->strend)) { if (! (to_complement ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), @@ -6671,10 +6848,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) do_nref_ref_common: ln = rex->offs[n].start; + endref = rex->offs[n].end; reginfo->poscache_iter = reginfo->poscache_maxiter; /* Void cache */ - if (rex->lastparen < n || ln == -1) + if (rex->lastparen < n || ln == -1 || endref == -1) sayNO; /* Do not match unless seen CLOSEn. */ - if (ln == rex->offs[n].end) + if (ln == endref) break; s = reginfo->strbeg + ln; @@ -6688,7 +6866,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * not going off the end given by reginfo->strend, and * returns in upon success, how much of the * current input was matched */ - if (! foldEQ_utf8_flags(s, NULL, rex->offs[n].end - ln, utf8_target, + if (! foldEQ_utf8_flags(s, NULL, endref - ln, utf8_target, locinput, &limit, 0, utf8_target, utf8_fold_flags)) { sayNO; @@ -6703,7 +6881,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) (type == REF || UCHARAT(s) != fold_array[nextchr])) sayNO; - ln = rex->offs[n].end - ln; + ln = endref - ln; if (locinput + ln > reginfo->strend) sayNO; if (ln > 1 && (type == REF @@ -6782,7 +6960,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) goto eval_recurse_doit; /* NOTREACHED */ - case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */ + case EVAL: /* /(?{...})B/ /(??{A})B/ and /(?(?{...})X|Y)B/ */ if (cur_eval && cur_eval->locinput==locinput) { if ( ++nochange_depth > max_nochange_depth ) Perl_croak(aTHX_ "EVAL without pos change exceeded limit in regex"); @@ -6801,7 +6979,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* save *all* paren positions */ regcppush(rex, 0, maxopenparen); - REGCP_SET(runops_cp); + REGCP_SET(ST.lastcp); if (!caller_cv) caller_cv = find_runcv(NULL); @@ -6826,30 +7004,67 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) nop = (OP*)rexi->data->data[n]; } - /* normally if we're about to execute code from the same - * CV that we used previously, we just use the existing - * CX stack entry. However, its possible that in the - * meantime we may have backtracked, popped from the save - * stack, and undone the SAVECOMPPAD(s) associated with - * PUSH_MULTICALL; in which case PL_comppad no longer - * points to newcv's pad. */ + /* Some notes about MULTICALL and the context and save stacks. + * + * In something like + * /...(?{ my $x)}...(?{ my $y)}...(?{ my $z)}.../ + * since codeblocks don't introduce a new scope (so that + * local() etc accumulate), at the end of a successful + * match there will be a SAVEt_CLEARSV on the savestack + * for each of $x, $y, $z. If the three code blocks above + * happen to have come from different CVs (e.g. via + * embedded qr//s), then we must ensure that during any + * savestack unwinding, PL_comppad always points to the + * right pad at each moment. We achieve this by + * interleaving SAVEt_COMPPAD's on the savestack whenever + * there is a change of pad. + * In theory whenever we call a code block, we should + * push a CXt_SUB context, then pop it on return from + * that code block. This causes a bit of an issue in that + * normally popping a context also clears the savestack + * back to cx->blk_oldsaveix, but here we specifically + * don't want to clear the save stack on exit from the + * code block. + * Also for efficiency we don't want to keep pushing and + * popping the single SUB context as we backtrack etc. + * So instead, we push a single context the first time + * we need, it, then hang onto it until the end of this + * function. Whenever we encounter a new code block, we + * update the CV etc if that's changed. During the times + * in this function where we're not executing a code + * block, having the SUB context still there is a bit + * naughty - but we hope that no-one notices. + * When the SUB context is initially pushed, we fake up + * cx->blk_oldsaveix to be as if we'd pushed this context + * on first entry to S_regmatch rather than at some random + * point during the regexe execution. That way if we + * croak, popping the context stack will ensure that + * *everything* SAVEd by this function is undone and then + * the context popped, rather than e.g., popping the + * context (and restoring the original PL_comppad) then + * popping more of the savestack and restoring a bad + * PL_comppad. + */ + + /* If this is the first EVAL, push a MULTICALL. On + * subsequent calls, if we're executing a different CV, or + * if PL_comppad has got messed up from backtracking + * through SAVECOMPPADs, then refresh the context. + */ if (newcv != last_pushed_cv || PL_comppad != last_pad) { U8 flags = (CXp_SUB_RE | ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); + SAVECOMPPAD(); if (last_pushed_cv) { - /* PUSH/POP_MULTICALL save and restore the - * caller's PL_comppad; if we call multiple subs - * using the same CX block, we have to save and - * unwind the varying PL_comppad's ourselves, - * especially restoring the right PL_comppad on - * backtrack - so save it on the save stack */ - SAVECOMPPAD(); CHANGE_MULTICALL_FLAGS(newcv, flags); } else { PUSH_MULTICALL_FLAGS(newcv, flags); } + /* see notes above */ + CX_CUR()->blk_oldsaveix = orig_savestack_ix; + last_pushed_cv = newcv; } else { @@ -6931,7 +7146,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (logical == 0) /* (?{})/ */ sv_setsv(save_scalar(PL_replgv), ret); /* $^R */ else if (logical == 1) { /* /(?(?{...})X|Y)/ */ - sw = cBOOL(SvTRUE(ret)); + sw = cBOOL(SvTRUE_NN(ret)); logical = 0; } else { /* /(??{}) */ @@ -6966,12 +7181,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * in the regexp code uses the pad ! */ PL_op = oop; PL_curcop = ocurcop; - regcp_restore(rex, runops_cp, &maxopenparen); + regcp_restore(rex, ST.lastcp, &maxopenparen); PL_curpm_under = PL_curpm; PL_curpm = PL_reg_curpm; - if (logical != 2) - break; + if (logical != 2) { + PUSH_STATE_GOTO(EVAL_B, next, locinput); + /* NOTREACHED */ + } } /* only /(??{})/ from now on */ @@ -7069,11 +7286,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) ST.prev_eval = cur_eval; cur_eval = st; /* now continue from first node in postoned RE */ - PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput); + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, startpoint, locinput); NOT_REACHED; /* NOTREACHED */ } - case EVAL_AB: /* cleanup after a successful (??{A})B */ + case EVAL_postponed_AB: /* cleanup after a successful (??{A})B */ /* note: this is called twice; first after popping B, then A */ DEBUG_STACK_r({ Perl_re_exec_indentf( aTHX_ "EVAL_AB cur_eval=%p prev_eval=%p\n", @@ -7119,7 +7336,11 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) sayYES; - case EVAL_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ + case EVAL_B_fail: /* unsuccessful B in (?{...})B */ + REGCP_UNWIND(ST.lastcp); + sayNO; + + case EVAL_postponed_AB_fail: /* unsuccessfully ran A or B in (??{A})B */ /* note: this is called twice; first after popping B, then A */ DEBUG_STACK_r({ Perl_re_exec_indentf( aTHX_ "EVAL_AB_fail cur_eval=%p prev_eval=%p\n", @@ -7165,6 +7386,10 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) lastopen = n; break; + case SROPEN: /* (*SCRIPT_RUN: */ + script_run_begin = (U8 *) locinput; + break; + /* XXX really need to log other places start/end are set too */ #define CLOSE_CAPTURE \ rex->offs[n].start = rex->offs[n].start_tmp; \ @@ -7190,6 +7415,16 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; + case SRCLOSE: /* (*SCRIPT_RUN: ... ) */ + + if (! isSCRIPT_RUN(script_run_begin, (U8 *) locinput, utf8_target)) + { + sayNO; + } + + break; + + case ACCEPT: /* (*ACCEPT) */ if (scan->flags) sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); @@ -7519,9 +7754,6 @@ NULL if (cur_curlyx->u.curlyx.minmod) { ST.save_curlyx = cur_curlyx; cur_curlyx = cur_curlyx->u.curlyx.prev_curlyx; - ST.cp = regcppush(rex, ST.save_curlyx->u.curlyx.parenfloor, - maxopenparen); - REGCP_SET(ST.lastcp); PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B, locinput); NOT_REACHED; /* NOTREACHED */ @@ -7554,11 +7786,11 @@ NULL CACHEsayNO; NOT_REACHED; /* NOTREACHED */ - case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ - /* FALLTHROUGH */ case WHILEM_A_pre_fail: /* just failed to match even minimal A */ REGCP_UNWIND(ST.lastcp); regcppop(rex, &maxopenparen); + /* FALLTHROUGH */ + case WHILEM_A_min_fail: /* just failed to match A in a minimal match */ cur_curlyx->u.curlyx.lastloc = ST.save_lastloc; cur_curlyx->u.curlyx.count--; CACHEsayNO; @@ -7591,8 +7823,6 @@ NULL case WHILEM_B_min_fail: /* just failed to match B in a minimal match */ cur_curlyx = ST.save_curlyx; - REGCP_UNWIND(ST.lastcp); - regcppop(rex, &maxopenparen); if (cur_curlyx->u.curlyx.count >= /*max*/ARG2(cur_curlyx->u.curlyx.me)) { /* Maximum greed exceeded */ @@ -7614,9 +7844,6 @@ NULL ); /* Try grabbing another A and see if it helps. */ cur_curlyx->u.curlyx.lastloc = locinput; - ST.cp = regcppush(rex, cur_curlyx->u.curlyx.parenfloor, - maxopenparen); - REGCP_SET(ST.lastcp); PUSH_STATE_GOTO(WHILEM_A_min, /*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS, locinput); @@ -8080,16 +8307,46 @@ NULL } else { /* Not utf8_target */ if (ST.c1 == ST.c2) { - while (locinput <= ST.maxpos && - UCHARAT(locinput) != ST.c1) - locinput++; - } - else { - while (locinput <= ST.maxpos - && UCHARAT(locinput) != ST.c1 - && UCHARAT(locinput) != ST.c2) - locinput++; + locinput = (char *) memchr(locinput, + ST.c1, + ST.maxpos + 1 - locinput); + if (! locinput) { + locinput = ST.maxpos + 1; + } } + else { + U8 c1_c2_bits_differing = ST.c1 ^ ST.c2; + + if (! isPOWER_OF_2(c1_c2_bits_differing)) { + while ( locinput <= ST.maxpos + && UCHARAT(locinput) != ST.c1 + && UCHARAT(locinput) != ST.c2) + { + locinput++; + } + } + else { + /* If c1 and c2 only differ by a single bit, we can + * avoid a conditional each time through the loop, + * at the expense of a little preliminary setup and + * an extra mask each iteration. By masking out + * that bit, we match exactly two characters, c1 + * and c2, and so we don't have to test for both. + * On both ASCII and EBCDIC platforms, most of the + * ASCII-range and Latin1-range folded equivalents + * differ only in a single bit, so this is actually + * the most common case. (e.g. 'A' 0x41 vs 'a' + * 0x61). */ + U8 c1_masked = ST.c1 &~ c1_c2_bits_differing; + U8 c1_c2_mask = ~ c1_c2_bits_differing; + while ( locinput <= ST.maxpos + && (UCHARAT(locinput) & c1_c2_mask) + != c1_masked) + { + locinput++; + } + } + } n = locinput - ST.oldloc; } if (locinput > ST.maxpos) @@ -8221,7 +8478,7 @@ NULL SET_RECURSE_LOCINPUT("FAKE-END[after]", cur_eval->locinput); - PUSH_YES_STATE_GOTO(EVAL_AB, st->u.eval.prev_eval->u.eval.B, + PUSH_YES_STATE_GOTO(EVAL_postponed_AB, st->u.eval.prev_eval->u.eval.B, locinput); /* match B */ } @@ -8449,7 +8706,8 @@ NULL assert(!NEXTCHR_IS_EOS); if (utf8_target) { locinput += PL_utf8skip[nextchr]; - /* locinput is allowed to go 1 char off the end, but not 2+ */ + /* locinput is allowed to go 1 char off the end (signifying + * EOS), but not 2+ */ if (locinput > reginfo->strend) sayNO; } @@ -8477,16 +8735,17 @@ NULL DEBUG_STACK_r({ regmatch_state *cur = st; regmatch_state *curyes = yes_state; - int curd = depth; + U32 i; regmatch_slab *slab = PL_regmatch_slab; - for (;curd > -1 && (depth-curd < 3);cur--,curd--) { + for (i = 0; i < 3 && i <= depth; cur--,i++) { if (cur < SLAB_FIRST(slab)) { slab = slab->prev; cur = SLAB_LAST(slab); } - Perl_re_exec_indentf( aTHX_ "#%-3d %-10s %s\n", + Perl_re_exec_indentf( aTHX_ "%4s #%-3d %-10s %s\n", depth, - curd, PL_reg_name[cur->resume_state], + i ? " " : "push", + depth - i, PL_reg_name[cur->resume_state], (curyes == cur) ? "yes" : "" ); if (curyes == cur) @@ -8638,9 +8897,12 @@ NULL if (last_pushed_cv) { dSP; + /* see "Some notes about MULTICALL" above */ POP_MULTICALL; PERL_UNUSED_VAR(SP); } + else + LEAVE_SCOPE(orig_savestack_ix); assert(!result || locinput - reginfo->strbeg >= 0); return result ? locinput - reginfo->strbeg : -1; @@ -8876,10 +9138,27 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } } else { - while (scan < loceol && - (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) - { - scan++; + /* See comments in regmatch() CURLY_B_min_known_fail. We avoid + * a conditional each time through the loop if the characters + * differ only in a single bit, as is the usual situation */ + U8 c1_c2_bits_differing = c1 ^ c2; + + if (isPOWER_OF_2(c1_c2_bits_differing)) { + U8 c1_masked = c1 & ~ c1_c2_bits_differing; + U8 c1_c2_mask = ~ c1_c2_bits_differing; + + while ( scan < loceol + && (UCHARAT(scan) & c1_c2_mask) == c1_masked) + { + scan++; + } + } + else { + while ( scan < loceol + && (UCHARAT(scan) == c1 || UCHARAT(scan) == c2)) + { + scan++; + } } } } @@ -8914,6 +9193,33 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } break; + case ASCII: + if (utf8_target && loceol - scan > max) { + + /* We didn't adjust at the beginning of this routine + * because is UTF-8, but it is actually ok to do so, since here, to + * match, 1 char == 1 byte. */ + loceol = scan + max; + } + + scan = find_next_non_ascii(scan, loceol, utf8_target); + break; + + case NASCII: + if (utf8_target) { + while ( hardcount < max + && scan < loceol + && ! isASCII_utf8_safe(scan, loceol)) + { + scan += UTF8SKIP(scan); + hardcount++; + } + } + else { + scan = find_next_ascii(scan, loceol, utf8_target); + } + break; + /* The argument (FLAGS) to all the POSIX node types is the class number */ case NPOSIXL: @@ -9408,7 +9714,10 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim) if (off >= 0) { while (off-- && s < lim) { /* XXX could check well-formedness here */ - s += UTF8SKIP(s); + U8 *new_s = s + UTF8SKIP(s); + if (new_s > lim) /* lim may be in the middle of a long character */ + return s; + s = new_s; } } else { @@ -9700,6 +10009,434 @@ S_to_byte_substr(pTHX_ regexp *prog) return TRUE; } +#ifndef PERL_IN_XSUB_RE + +bool +Perl__is_grapheme(pTHX_ const U8 * strbeg, const U8 * s, const U8 * strend, const UV cp) +{ + /* Temporary helper function for toke.c. Verify that the code point 'cp' + * is a stand-alone grapheme. The UTF-8 for 'cp' begins at position 's' in + * the larger string bounded by 'strbeg' and 'strend'. + * + * 'cp' needs to be assigned (if not a future version of the Unicode + * Standard could make it something that combines with adjacent characters, + * so code using it would then break), and there has to be a GCB break + * before and after the character. */ + + GCB_enum cp_gcb_val, prev_cp_gcb_val, next_cp_gcb_val; + const U8 * prev_cp_start; + + PERL_ARGS_ASSERT__IS_GRAPHEME; + + /* Unassigned code points are forbidden */ + if (UNLIKELY(! ELEMENT_RANGE_MATCHES_INVLIST( + _invlist_search(PL_Assigned_invlist, cp)))) + { + return FALSE; + } + + cp_gcb_val = getGCB_VAL_CP(cp); + + /* Find the GCB value of the previous code point in the input */ + prev_cp_start = utf8_hop_back(s, -1, strbeg); + if (UNLIKELY(prev_cp_start == s)) { + prev_cp_gcb_val = GCB_EDGE; + } + else { + prev_cp_gcb_val = getGCB_VAL_UTF8(prev_cp_start, strend); + } + + /* And check that is a grapheme boundary */ + if (! isGCB(prev_cp_gcb_val, cp_gcb_val, strbeg, s, + TRUE /* is UTF-8 encoded */ )) + { + return FALSE; + } + + /* Similarly verify there is a break between the current character and the + * following one */ + s += UTF8SKIP(s); + if (s >= strend) { + next_cp_gcb_val = GCB_EDGE; + } + else { + next_cp_gcb_val = getGCB_VAL_UTF8(s, strend); + } + + return isGCB(cp_gcb_val, next_cp_gcb_val, strbeg, s, TRUE); +} + +#endif + +bool +Perl_isSCRIPT_RUN(pTHX_ const U8 * s, const U8 * send, const bool utf8_target) +{ + /* Checks that every character in the sequence from 's' to 'send' is one of + * three scripts: Common, Inherited, and possibly one other. Additionally + * all decimal digits must come from the same consecutive sequence of 10. + * 'utf8_target' is TRUE iff the sequence is encoded in UTF-8. + * + * Basically, it looks at each character in the sequence to see if the + * above conditions are met; if not it fails. It uses an inversion map to + * find the enum corresponding to the script of each character. But this + * is complicated by the fact that a few code points can be in any of + * several scripts. The data has been constructed so that there are + * additional enum values (all negative) for these situations. The + * absolute value of those is an index into another table which contains + * pointers to auxiliary tables for each such situation. Each aux array + * lists all the scripts for the given situation. There is another, + * parallel, table that gives the number of entries in each aux table. + * These are all defined in charclass_invlists.h */ + + /* XXX Here are the additional things UTS 39 says could be done: + * Mark Chinese strings as “mixed script” if they contain both simplified + * (S) and traditional (T) Chinese characters, using the Unihan data in the + * Unicode Character Database [UCD]. The criterion can only be applied if + * the language of the string is known to be Chinese. So, for example, the + * string “写真だけの結婚式 ” is Japanese, and should not be marked as + * mixed script because of a mixture of S and T characters. Testing for + * whether a character is S or T needs to be based not on whether the + * character has a S or T variant , but whether the character is an S or T + * variant. khw notes that the sample contains a Hiragana character, and it + * is unclear if absence of any foreign script marks the script as + * "Chinese" + * + * Forbid sequences of the same nonspacing mark + * + * Check to see that all the characters are in the sets of exemplar + * characters for at least one language in the Unicode Common Locale Data + * Repository [CLDR]. */ + + + /* Things that match /\d/u */ + SV * decimals_invlist = PL_XPosix_ptrs[_CC_DIGIT]; + UV * decimals_array = invlist_array(decimals_invlist); + + /* What code point is the digit '0' of the script run? */ + UV zero_of_run = 0; + SCX_enum script_of_run = SCX_INVALID; /* Illegal value */ + SCX_enum script_of_char = SCX_INVALID; + + /* If the script remains not fully determined from iteration to iteration, + * this is the current intersection of the possiblities. */ + SCX_enum * intersection = NULL; + PERL_UINT_FAST8_T intersection_len = 0; + + bool retval = TRUE; + + assert(send > s); + + PERL_ARGS_ASSERT_ISSCRIPT_RUN; + + /* Look at each character in the sequence */ + while (s < send) { + UV cp; + + /* The code allows all scripts to use the ASCII digits. This is + * because they are used in commerce even in scripts that have their + * own set. Hence any ASCII ones found are ok, unless a digit from + * another set has already been encountered. (The other digit ranges + * in Common are not similarly blessed */ + if (UNLIKELY(isDIGIT(*s))) { + if (zero_of_run > 0) { + if (zero_of_run != '0') { + retval = FALSE; + break; + } + } + else { + zero_of_run = '0'; + } + s++; + continue; + } + + /* Here, isn't an ASCII digit. Find the code point of the character */ + if (utf8_target && ! UTF8_IS_INVARIANT(*s)) { + Size_t len; + cp = valid_utf8_to_uvchr((U8 *) s, &len); + s += len; + } + else { + cp = *(s++); + } + + /* If is within the range [+0 .. +9] of the script's zero, it also is a + * digit in that script. We can skip the rest of this code for this + * character. */ + if (UNLIKELY( zero_of_run > 0 + && cp >= zero_of_run + && cp - zero_of_run <= 9)) + { + continue; + } + + /* Find the character's script. The correct values are hard-coded here + * for small-enough code points. */ + if (cp < 0x2B9) { /* From inspection of Unicode db; extremely + unlikely to change */ + if ( cp > 255 + || ( isALPHA_L1(cp) + && LIKELY(cp != MICRO_SIGN_NATIVE))) + { + script_of_char = SCX_Latin; + } + else { + script_of_char = SCX_Common; + } + } + else { + script_of_char = _Perl_SCX_invmap[ + _invlist_search(PL_SCX_invlist, cp)]; + } + + /* We arbitrarily accept a single unassigned character, but not in + * combination with anything else, and not a run of them. */ + if ( UNLIKELY(script_of_run == SCX_Unknown) + || UNLIKELY( script_of_run != SCX_INVALID + && script_of_char == SCX_Unknown)) + { + retval = FALSE; + break; + } + + if (UNLIKELY(script_of_char == SCX_Unknown)) { + script_of_run = SCX_Unknown; + continue; + } + + /* We accept 'inherited' script characters currently even at the + * beginning. (We know that no characters in Inherited are digits, or + * we'd have to check for that) */ + if (UNLIKELY(script_of_char == SCX_Inherited)) { + continue; + } + + /* If unknown, the run's script is set to the char's */ + if (UNLIKELY(script_of_run == SCX_INVALID)) { + script_of_run = script_of_char; + } + + /* All decimal digits must be from the same sequence of 10. Above, we + * handled any ASCII digits without descending to here. We also + * handled the case where we already knew what digit sequence is the + * one to use, and the character is in that sequence. Now that we know + * the script, we can use script_zeros[] to directly find which + * sequence the script uses, except in a few cases it returns 0 */ + if (UNLIKELY(zero_of_run == 0) && script_of_char >= 0) { + zero_of_run = script_zeros[script_of_char]; + } + + /* Now we can see if the script of the character is the same as that of + * the run */ + if (LIKELY(script_of_char == script_of_run)) { + /* By far the most common case */ + goto scripts_match; + } + + /* Here, the scripts of the run and the current character don't match + * exactly. The run could so far have been entirely characters from + * Common. It's now time to change its script to that of this + * non-Common character */ + if (script_of_run == SCX_Common) { + + /* But Common contains several sets of digits. Only the '0' set + * can be part of another script. */ + if (zero_of_run > 0 && zero_of_run != '0') { + retval = FALSE; + break; + } + + script_of_run = script_of_char; + goto scripts_match; + } + + /* Here, the script of the run isn't Common. But characters in Common + * match any script */ + if (script_of_char == SCX_Common) { + goto scripts_match; + } + +#ifndef HAS_SCX_AUX_TABLES + + /* Too early a Unicode version to have a code point belonging to more + * than one script, so, if the scripts don't exactly match, fail */ + retval = FALSE; + break; + +#else + + /* Here there is no exact match between the character's script and the + * run's. And we've handled the special cases of scripts Unknown, + * Inherited, and Common. + * + * Negative script numbers signify that the value may be any of several + * scripts, and we need to look at auxiliary information to make our + * deterimination. But if both are non-negative, we can fail now */ + if (LIKELY(script_of_char >= 0)) { + const SCX_enum * search_in; + PERL_UINT_FAST8_T search_in_len; + PERL_UINT_FAST8_T i; + + if (LIKELY(script_of_run >= 0)) { + retval = FALSE; + break; + } + + /* Use the previously constructed set of possible scripts, if any. + * */ + if (intersection) { + search_in = intersection; + search_in_len = intersection_len; + } + else { + search_in = SCX_AUX_TABLE_ptrs[-script_of_run]; + search_in_len = SCX_AUX_TABLE_lengths[-script_of_run]; + } + + for (i = 0; i < search_in_len; i++) { + if (search_in[i] == script_of_char) { + script_of_run = script_of_char; + goto scripts_match; + } + } + + retval = FALSE; + break; + } + else if (LIKELY(script_of_run >= 0)) { + /* script of character could be one of several, but run is a single + * script */ + const SCX_enum * search_in = SCX_AUX_TABLE_ptrs[-script_of_char]; + const PERL_UINT_FAST8_T search_in_len + = SCX_AUX_TABLE_lengths[-script_of_char]; + PERL_UINT_FAST8_T i; + + for (i = 0; i < search_in_len; i++) { + if (search_in[i] == script_of_run) { + script_of_char = script_of_run; + goto scripts_match; + } + } + + retval = FALSE; + break; + } + else { + /* Both run and char could be in one of several scripts. If the + * intersection is empty, then this character isn't in this script + * run. Otherwise, we need to calculate the intersection to use + * for future iterations of the loop, unless we are already at the + * final character */ + const SCX_enum * search_char = SCX_AUX_TABLE_ptrs[-script_of_char]; + const PERL_UINT_FAST8_T char_len + = SCX_AUX_TABLE_lengths[-script_of_char]; + const SCX_enum * search_run; + PERL_UINT_FAST8_T run_len; + + SCX_enum * new_overlap = NULL; + PERL_UINT_FAST8_T i, j; + + if (intersection) { + search_run = intersection; + run_len = intersection_len; + } + else { + search_run = SCX_AUX_TABLE_ptrs[-script_of_run]; + run_len = SCX_AUX_TABLE_lengths[-script_of_run]; + } + + intersection_len = 0; + + for (i = 0; i < run_len; i++) { + for (j = 0; j < char_len; j++) { + if (search_run[i] == search_char[j]) { + + /* Here, the script at i,j matches. That means this + * character is in the run. But continue on to find + * the complete intersection, for the next loop + * iteration, and for the digit check after it. + * + * On the first found common script, we malloc space + * for the intersection list for the worst case of the + * intersection, which is the minimum of the number of + * scripts remaining in each set. */ + if (intersection_len == 0) { + Newx(new_overlap, + MIN(run_len - i, char_len - j), + SCX_enum); + } + new_overlap[intersection_len++] = search_run[i]; + } + } + } + + /* Here we've looked through everything. If they have no scripts + * in common, not a run */ + if (intersection_len == 0) { + retval = FALSE; + break; + } + + /* If there is only a single script in common, set to that. + * Otherwise, use the intersection going forward */ + Safefree(intersection); + if (intersection_len == 1) { + script_of_run = script_of_char = new_overlap[0]; + Safefree(new_overlap); + } + else { + intersection = new_overlap; + } + } + +#endif + + scripts_match: + + /* Here, the script of the character is compatible with that of the + * run. Either they match exactly, or one or both can be any of + * several scripts, and the intersection is not empty. If the + * character is not a decimal digit, we are done with it. Otherwise, + * it could still fail if it is from a different set of 10 than seen + * already (or we may not have seen any, and we need to set the + * sequence). If we have determined a single script and that script + * only has one set of digits (almost all scripts are like that), then + * this isn't a problem, as any digit must come from the same sequence. + * The only scripts that have multiple sequences have been constructed + * to be 0 in 'script_zeros[]'. + * + * Here we check if it is a digit. */ + if ( cp >= FIRST_NON_ASCII_DECIMAL_DIGIT + && ( ( zero_of_run == 0 + || ( ( script_of_char >= 0 + && script_zeros[script_of_char] == 0) + || intersection)))) + { + SSize_t range_zero_index; + range_zero_index = _invlist_search(decimals_invlist, cp); + if ( LIKELY(range_zero_index >= 0) + && ELEMENT_RANGE_MATCHES_INVLIST(range_zero_index)) + { + UV range_zero = decimals_array[range_zero_index]; + if (zero_of_run) { + if (zero_of_run != range_zero) { + retval = FALSE; + break; + } + } + else { + zero_of_run = range_zero; + } + } + } + } /* end of looping through CLOSESR text */ + + Safefree(intersection); + return retval; +} + + /* * ex: set ts=8 sts=4 sw=4 et: */