X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a0bd1a30d379f2625c307657d63fc50173d7a56d..986b8b496d052a921ed92f9591b8adeea5dc72e4:/regexec.c diff --git a/regexec.c b/regexec.c index 781bc6b..f443113 100644 --- a/regexec.c +++ b/regexec.c @@ -37,9 +37,6 @@ #include "re_top.h" #endif -#define B_ON_NON_UTF8_LOCALE_IS_WRONG \ - "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" - /* * pregcomp and pregexec -- regsub and regerror are not used in perl * @@ -86,6 +83,9 @@ #include "invlist_inline.h" #include "unicode_constants.h" +#define B_ON_NON_UTF8_LOCALE_IS_WRONG \ + "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale" + static const char utf8_locale_required[] = "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"; @@ -492,7 +492,7 @@ S_isFOO_utf8_lc(pTHX_ const U8 classnum, const U8* character) } else if (UTF8_IS_DOWNGRADEABLE_START(*character)) { return isFOO_lc(classnum, - TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1))); + EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1))); } _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character)); @@ -654,7 +654,7 @@ Perl_re_intuit_start(pTHX_ "Intuit: trying to determine minimum start position...\n")); /* for now, assume that all substr offsets are positive. If at some point - * in the future someone wants to do clever things with look-behind and + * in the future someone wants to do clever things with lookbehind and * -ve offsets, they'll need to fix up any code in this function * which uses these offsets. See the thread beginning * <20140113145929.GF27210@iabyn.com> @@ -1205,10 +1205,10 @@ Perl_re_intuit_start(pTHX_ * didn't contradict, so just retry the anchored "other" * substr */ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, - " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n", + " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n", PL_colors[0], PL_colors[1], - (long)(rx_origin - strbeg + prog->anchored_offset), - (long)(rx_origin - strbeg) + (IV)(rx_origin - strbeg + prog->anchored_offset), + (IV)(rx_origin - strbeg) )); goto do_other_substr; } @@ -1484,7 +1484,7 @@ STMT_START { } else { \ uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \ len = UTF8SKIP(uc); \ - skiplen = UNISKIP( uvc ); \ + skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ @@ -1501,7 +1501,7 @@ STMT_START { } else { \ len = 1; \ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \ - skiplen = UNISKIP( uvc ); \ + skiplen = UVCHR_SKIP( uvc ); \ foldlen -= skiplen; \ uscan = foldbuf + skiplen; \ } \ @@ -1736,12 +1736,26 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ FBC_UTF8_A(TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT), \ TEST_NON_UTF8, PLACEHOLDER, REXEC_FBC_TRYIT) +#ifdef DEBUGGING +static IV +S_get_break_val_cp_checked(SV* const invlist, const UV cp_in) { + IV cp_out = Perl__invlist_search(invlist, cp_in); + assert(cp_out >= 0); + return cp_out; +} +# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ + invmap[S_get_break_val_cp_checked(invlist, cp)] +#else +# define _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) \ + invmap[_invlist_search(invlist, cp)] +#endif + /* Takes a pointer to an inversion list, a pointer to its corresponding * inversion map, and a code point, and returns the code point's value * according to the two arrays. It assumes that all code points have a value. * This is used as the base macro for macros for particular properties */ #define _generic_GET_BREAK_VAL_CP(invlist, invmap, cp) \ - invmap[_invlist_search(invlist, cp)] + _generic_GET_BREAK_VAL_CP_CHECKED(invlist, invmap, cp) /* Same as above, but takes begin, end ptrs to a UTF-8 encoded string instead * of a code point, returning the value for the first code point in the string. @@ -1764,6 +1778,18 @@ REXEC_FBC_SCAN( /* Loops while (s < strend) */ \ #define getGCB_VAL_UTF8(pos, strend) \ _generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend) +/* Returns the LB value for the input code point */ +#define getLB_VAL_CP(cp) \ + _generic_GET_BREAK_VAL_CP( \ + PL_LB_invlist, \ + _Perl_LB_invmap, \ + (cp)) + +/* Returns the LB value for the first code point in the UTF-8 encoded string + * bounded by pos and strend */ +#define getLB_VAL_UTF8(pos, strend) \ + _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend) + /* Returns the SB value for the input code point */ #define getSB_VAL_CP(cp) \ @@ -1826,7 +1852,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } @@ -2065,14 +2091,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8); break; case GCB_BOUND: - if (s == reginfo->strbeg) { /* GCB always matches at begin and - end */ - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } + + /* Didn't match. Try at the next position (if there is one) */ s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2083,44 +2112,101 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { GCB_enum after = getGCB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isGCB(before, after)) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + if ( (to_complement ^ isGCB(before, after)) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; } + before = after; s += UTF8SKIP(s); } } else { /* Not utf8. Everything is a GCB except between CR and LF */ while (s < strend) { - if (to_complement ^ (UCHARAT(s - 1) != '\r' - || UCHARAT(s) != '\n')) + if ((to_complement ^ ( UCHARAT(s - 1) != '\r' + || UCHARAT(s) != '\n')) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - s++; + goto got_it; } + s++; } } - if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) { + /* And, since this is a bound, it can match after the final + * character in the string */ + if ((reginfo->intuit || regtry(reginfo, &s))) { goto got_it; } break; - case SB_BOUND: - if (s == reginfo->strbeg) { /* SB always matches at beginning */ - if (to_complement - ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) - { + case LB_BOUND: + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } + s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } + } + + if (utf8_target) { + LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend); + while (s < strend) { + LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; + } + before = after; + s += UTF8SKIP(s); + } + } + else { /* Not utf8. */ + LB_enum before = getLB_VAL_CP((U8) *(s -1)); + while (s < strend) { + LB_enum after = getLB_VAL_CP((U8) *s); + if (to_complement ^ isLB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target) + && (reginfo->intuit || regtry(reginfo, &s))) + { + goto got_it; + } + before = after; + s++; + } + } - /* Didn't match. Go try at the next position */ + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } + + break; + + case SB_BOUND: + if (s == reginfo->strbeg) { + if (reginfo->intuit || regtry(reginfo, &s)) { + goto got_it; + } s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2131,18 +2217,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { SB_enum after = getSB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isSB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + goto got_it; } + before = after; s += UTF8SKIP(s); } } @@ -2150,18 +2235,17 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, SB_enum before = getSB_VAL_CP((U8) *(s -1)); while (s < strend) { SB_enum after = getSB_VAL_CP((U8) *s); - if (to_complement ^ isSB(before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isSB(before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - before = after; + goto got_it; } + before = after; s++; } } @@ -2169,9 +2253,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, /* Here are at the final position in the target string. The SB * value is always true here, so matches, depending on other * constraints */ - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } @@ -2179,12 +2261,13 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, case WB_BOUND: if (s == reginfo->strbeg) { - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } s += (utf8_target) ? UTF8SKIP(s) : 1; + if (UNLIKELY(s >= reginfo->strend)) { + break; + } } if (utf8_target) { @@ -2202,20 +2285,19 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, while (s < strend) { WB_enum after = getWB_VAL_UTF8((U8*) s, (U8*) reginfo->strend); - if (to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - previous = before; - before = after; + goto got_it; } + previous = before; + before = after; s += UTF8SKIP(s); } } @@ -2224,31 +2306,26 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, WB_enum before = getWB_VAL_CP((U8) *(s -1)); while (s < strend) { WB_enum after = getWB_VAL_CP((U8) *s); - if (to_complement ^ isWB(previous, - before, - after, - (U8*) reginfo->strbeg, - (U8*) s, - (U8*) reginfo->strend, - utf8_target)) + if ((to_complement ^ isWB(previous, + before, + after, + (U8*) reginfo->strbeg, + (U8*) s, + (U8*) reginfo->strend, + utf8_target)) + && (reginfo->intuit || regtry(reginfo, &s))) { - if (reginfo->intuit || regtry(reginfo, &s)) { - goto got_it; - } - previous = before; - before = after; + goto got_it; } + previous = before; + before = after; s++; } } - if (to_complement ^ cBOOL(reginfo->intuit - || regtry(reginfo, &s))) - { + if (reginfo->intuit || regtry(reginfo, &s)) { goto got_it; } - - break; } break; @@ -2329,7 +2406,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s, classnum))) || (UTF8_IS_DOWNGRADEABLE_START(*s) && to_complement ^ cBOOL( - _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s, + _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s + 1)), classnum)))) { @@ -2689,7 +2766,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, U32 n = 0; max = -1; /* calculate the right-most part of the string covered - * by a capture. Due to look-ahead, this may be to + * by a capture. Due to lookahead, this may be to * the right of $&, so we have to scan all captures */ while (n <= prog->lastparen) { if (prog->offs[n].end > max) @@ -2710,7 +2787,7 @@ S_reg_set_capture_string(pTHX_ REGEXP * const rx, U32 n = 0; min = max; /* calculate the left-most part of the string covered - * by a capture. Due to look-behind, this may be to + * by a capture. Due to lookbehind, this may be to * the left of $&, so we have to scan all captures */ while (min && n <= prog->lastparen) { if ( prog->offs[n].start != -1 @@ -2826,6 +2903,11 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, startpos = stringarg; + /* set these early as they may be used by the HOP macros below */ + reginfo->strbeg = strbeg; + reginfo->strend = strend; + reginfo->is_utf8_target = cBOOL(utf8_target); + if (prog->intflags & PREGf_GPOS_SEEN) { MAGIC *mg; @@ -2853,20 +2935,23 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, */ if (prog->intflags & PREGf_ANCH_GPOS) { - startpos = reginfo->ganch - prog->gofs; - if (startpos < - ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg)) - { - DEBUG_r(PerlIO_printf(Perl_debug_log, - "fail: ganch-gofs before earliest possible start\n")); - return 0; + if (prog->gofs) { + startpos = HOPBACKc(reginfo->ganch, prog->gofs); + if (!startpos || + ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg)) + { + DEBUG_r(PerlIO_printf(Perl_debug_log, + "fail: ganch-gofs before earliest possible start\n")); + return 0; + } } + else + startpos = reginfo->ganch; } else if (prog->gofs) { - if (startpos - prog->gofs < strbeg) + startpos = HOPBACKc(startpos, prog->gofs); + if (!startpos) startpos = strbeg; - else - startpos -= prog->gofs; } else if (prog->intflags & PREGf_GPOS_FLOAT) startpos = strbeg; @@ -2949,13 +3034,10 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, reginfo->prog = rx; /* Yes, sorry that this is confusing. */ reginfo->intuit = 0; - reginfo->is_utf8_target = cBOOL(utf8_target); reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx)); reginfo->warned = FALSE; - reginfo->strbeg = strbeg; reginfo->sv = sv; reginfo->poscache_maxiter = 0; /* not yet started a countdown */ - reginfo->strend = strend; /* see how far we have to get to not match where we matched before */ reginfo->till = stringarg + minend; @@ -3094,7 +3176,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* For anchored \G, the only position it can match from is * (ganch-gofs); we already set startpos to this above; if intuit * moved us on from there, we can't possibly succeed */ - assert(startpos == reginfo->ganch - prog->gofs); + assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs)); if (s == startpos && regtry(reginfo, &s)) goto got_it; goto phooey; @@ -3478,7 +3560,7 @@ Perl_regexec_flags(pTHX_ REGEXP * const rx, char *stringarg, char *strend, /* - regtry - try match at specific point */ -STATIC I32 /* 0 failure, 1 success */ +STATIC bool /* 0 failure, 1 success */ S_regtry(pTHX_ regmatch_info *reginfo, char **startposp) { CHECKPOINT lastcp; @@ -4179,106 +4261,298 @@ S_setup_EXACTISH_ST_c1_c2(pTHX_ const regnode * const text_node, int *c1p, return TRUE; } -/* This creates a single number by combining two, with 'before' being like the - * 10's digit, but this isn't necessarily base 10; it is base however many - * elements of the enum there are */ -#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after) - -STATIC bool +PERL_STATIC_INLINE bool S_isGCB(const GCB_enum before, const GCB_enum after) { /* returns a boolean indicating if there is a Grapheme Cluster Boundary * between the inputs. See http://www.unicode.org/reports/tr29/ */ - switch (GCBcase(before, after)) { + return GCB_table[before][after]; +} - /* Break at the start and end of text. - GB1. sot ÷ - GB2. ÷ eot +/* Combining marks attach to most classes that precede them, but this defines + * the exceptions (from TR14) */ +#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \ + || prev == LB_Mandatory_Break \ + || prev == LB_Carriage_Return \ + || prev == LB_Line_Feed \ + || prev == LB_Next_Line \ + || prev == LB_Space \ + || prev == LB_ZWSpace)) - Break before and after controls except between CR and LF - GB4. ( Control | CR | LF ) ÷ - GB5. ÷ ( Control | CR | LF ) +STATIC bool +S_isLB(pTHX_ LB_enum before, + LB_enum after, + const U8 * const strbeg, + const U8 * const curpos, + const U8 * const strend, + const bool utf8_target) +{ + U8 * temp_pos = (U8 *) curpos; + LB_enum prev = before; - Otherwise, break everywhere. - GB10. Any ÷ Any */ - default: + /* Is the boundary between 'before' and 'after' line-breakable? + * Most of this is just a table lookup of a generated table from Unicode + * rules. But some rules require context to decide, and so have to be + * implemented in code */ + + PERL_ARGS_ASSERT_ISLB; + + /* Rule numbers in the comments below are as of Unicode 8.0 */ + + redo: + before = prev; + switch (LB_table[before][after]) { + case LB_BREAKABLE: return TRUE; - /* Do not break between a CR and LF. - GB3. CR × LF */ - case GCBcase(GCB_CR, GCB_LF): + case LB_NOBREAK: + case LB_NOBREAK_EVEN_WITH_SP_BETWEEN: return FALSE; - /* Do not break Hangul syllable sequences. - GB6. L × ( L | V | LV | LVT ) */ - case GCBcase(GCB_L, GCB_L): - case GCBcase(GCB_L, GCB_V): - case GCBcase(GCB_L, GCB_LV): - case GCBcase(GCB_L, GCB_LVT): - return FALSE; + case LB_SP_foo + LB_BREAKABLE: + case LB_SP_foo + LB_NOBREAK: + case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN: - /* GB7. ( LV | V ) × ( V | T ) */ - case GCBcase(GCB_LV, GCB_V): - case GCBcase(GCB_LV, GCB_T): - case GCBcase(GCB_V, GCB_V): - case GCBcase(GCB_V, GCB_T): - return FALSE; + /* When we have something following a SP, we have to look at the + * context in order to know what to do. + * + * SP SP should not reach here because LB7: Do not break before + * spaces. (For two spaces in a row there is nothing that + * overrides that) */ + assert(after != LB_Space); + + /* Here we have a space followed by a non-space. Mostly this is a + * case of LB18: "Break after spaces". But there are complications + * as the handling of spaces is somewhat tricky. They are in a + * number of rules, which have to be applied in priority order, but + * something earlier in the string can cause a rule to be skipped + * and a lower priority rule invoked. A prime example is LB7 which + * says don't break before a space. But rule LB8 (lower priority) + * says that the first break opportunity after a ZW is after any + * span of spaces immediately after it. If a ZW comes before a SP + * in the input, rule LB8 applies, and not LB7. Other such rules + * involve combining marks which are rules 9 and 10, but they may + * override higher priority rules if they come earlier in the + * string. Since we're doing random access into the middle of the + * string, we have to look for rules that should get applied based + * on both string position and priority. Combining marks do not + * attach to either ZW nor SP, so we don't have to consider them + * until later. + * + * To check for LB8, we have to find the first non-space character + * before this span of spaces */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Space); + + /* LB8 Break before any character following a zero-width space, + * even if one or more spaces intervene. + * ZW SP* ÷ + * So if we have a ZW just before this span, and to get here this + * is the final space in the span. */ + if (prev == LB_ZWSpace) { + return TRUE; + } - /* GB8. ( LVT | T) × T */ - case GCBcase(GCB_LVT, GCB_T): - case GCBcase(GCB_T, GCB_T): - return FALSE; + /* Here, not ZW SP+. There are several rules that have higher + * priority than LB18 and can be resolved now, as they don't depend + * on anything earlier in the string (except ZW, which we have + * already handled). One of these rules is LB11 Do not break + * before Word joiner, but we have specially encoded that in the + * lookup table so it is caught by the single test below which + * catches the other ones. */ + if (LB_table[LB_Space][after] - LB_SP_foo + == LB_NOBREAK_EVEN_WITH_SP_BETWEEN) + { + return FALSE; + } - /* Do not break between regional indicator symbols. - GB8a. Regional_Indicator × Regional_Indicator */ - case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator): - return FALSE; + /* If we get here, we have to XXX consider combining marks. */ + if (prev == LB_Combining_Mark) { - /* Do not break before extending characters. - GB9. × Extend */ - case GCBcase(GCB_Other, GCB_Extend): - case GCBcase(GCB_Extend, GCB_Extend): - case GCBcase(GCB_L, GCB_Extend): - case GCBcase(GCB_LV, GCB_Extend): - case GCBcase(GCB_LVT, GCB_Extend): - case GCBcase(GCB_Prepend, GCB_Extend): - case GCBcase(GCB_Regional_Indicator, GCB_Extend): - case GCBcase(GCB_SpacingMark, GCB_Extend): - case GCBcase(GCB_T, GCB_Extend): - case GCBcase(GCB_V, GCB_Extend): - return FALSE; + /* What happens with these depends on the character they + * follow. */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Combining_Mark); - /* Do not break before SpacingMarks, or after Prepend characters. - GB9a. × SpacingMark */ - case GCBcase(GCB_Other, GCB_SpacingMark): - case GCBcase(GCB_Extend, GCB_SpacingMark): - case GCBcase(GCB_L, GCB_SpacingMark): - case GCBcase(GCB_LV, GCB_SpacingMark): - case GCBcase(GCB_LVT, GCB_SpacingMark): - case GCBcase(GCB_Prepend, GCB_SpacingMark): - case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark): - case GCBcase(GCB_SpacingMark, GCB_SpacingMark): - case GCBcase(GCB_T, GCB_SpacingMark): - case GCBcase(GCB_V, GCB_SpacingMark): - return FALSE; + /* Most times these attach to and inherit the characteristics + * of that character, but not always, and when not, they are to + * be treated as AL by rule LB10. */ + if (! LB_CM_ATTACHES_TO(prev)) { + prev = LB_Alphabetic; + } + } - /* GB9b. Prepend × */ - case GCBcase(GCB_Prepend, GCB_Other): - case GCBcase(GCB_Prepend, GCB_L): - case GCBcase(GCB_Prepend, GCB_LV): - case GCBcase(GCB_Prepend, GCB_LVT): - case GCBcase(GCB_Prepend, GCB_Prepend): - case GCBcase(GCB_Prepend, GCB_Regional_Indicator): - case GCBcase(GCB_Prepend, GCB_T): - case GCBcase(GCB_Prepend, GCB_V): - return FALSE; + /* Here, we have the character preceding the span of spaces all set + * up. We follow LB18: "Break after spaces" unless the table shows + * that is overriden */ + return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN; + + case LB_CM_foo: + + /* We don't know how to treat the CM except by looking at the first + * non-CM character preceding it */ + do { + prev = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (prev == LB_Combining_Mark); + + /* Here, 'prev' is that first earlier non-CM character. If the CM + * attatches to it, then it inherits the behavior of 'prev'. If it + * doesn't attach, it is to be treated as an AL */ + if (! LB_CM_ATTACHES_TO(prev)) { + prev = LB_Alphabetic; + } + + goto redo; + + case LB_HY_or_BA_then_foo + LB_BREAKABLE: + case LB_HY_or_BA_then_foo + LB_NOBREAK: + + /* LB21a Don't break after Hebrew + Hyphen. + * HL (HY | BA) × */ + + if (backup_one_LB(strbeg, &temp_pos, utf8_target) + == LB_Hebrew_Letter) + { + return FALSE; + } + + return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE; + + case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE: + case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK: + + /* LB25a (PR | PO) × ( OP | HY )? NU */ + if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) { + return FALSE; + } + + return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY + == LB_BREAKABLE; + + case LB_SY_or_IS_then_various + LB_BREAKABLE: + case LB_SY_or_IS_then_various + LB_NOBREAK: + { + /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */ + + LB_enum temp = prev; + do { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric); + if (temp == LB_Numeric) { + return FALSE; + } + + return LB_table[prev][after] - LB_SY_or_IS_then_various + == LB_BREAKABLE; + } + + case LB_various_then_PO_or_PR + LB_BREAKABLE: + case LB_various_then_PO_or_PR + LB_NOBREAK: + { + /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */ + + LB_enum temp = prev; + if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis) + { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) { + temp = backup_one_LB(strbeg, &temp_pos, utf8_target); + } + if (temp == LB_Numeric) { + return FALSE; + } + return LB_various_then_PO_or_PR; + } + + default: + break; } - NOT_REACHED; /* NOTREACHED */ +#ifdef DEBUGGING + PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n", + before, after, LB_table[before][after]); + assert(0); +#endif + return TRUE; +} + +STATIC LB_enum +S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) +{ + LB_enum lb; + + PERL_ARGS_ASSERT_ADVANCE_ONE_LB; + + if (*curpos >= strend) { + return LB_EDGE; + } + + if (utf8_target) { + *curpos += UTF8SKIP(*curpos); + if (*curpos >= strend) { + return LB_EDGE; + } + lb = getLB_VAL_UTF8(*curpos, strend); + } + else { + (*curpos)++; + if (*curpos >= strend) { + return LB_EDGE; + } + lb = getLB_VAL_CP(**curpos); + } + + return lb; } -#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after) +STATIC LB_enum +S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target) +{ + LB_enum lb; + + PERL_ARGS_ASSERT_BACKUP_ONE_LB; + + if (*curpos < strbeg) { + return LB_EDGE; + } + + if (utf8_target) { + U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg); + U8 * prev_prev_char_pos; + + if (! prev_char_pos) { + return LB_EDGE; + } + + if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) { + lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos); + *curpos = prev_char_pos; + prev_char_pos = prev_prev_char_pos; + } + else { + *curpos = (U8 *) strbeg; + return LB_EDGE; + } + } + else { + if (*curpos - 2 < strbeg) { + *curpos = (U8 *) strbeg; + return LB_EDGE; + } + (*curpos)--; + lb = getLB_VAL_CP(*(*curpos - 1)); + } + + return lb; +} STATIC bool S_isSB(pTHX_ SB_enum before, @@ -4292,16 +4566,17 @@ S_isSB(pTHX_ SB_enum before, * between the inputs. See http://www.unicode.org/reports/tr29/ */ U8 * lpos = (U8 *) curpos; - U8 * temp_pos; - SB_enum backup; + bool has_para_sep = FALSE; + bool has_sp = FALSE; PERL_ARGS_ASSERT_ISSB; /* Break at the start and end of text. SB1. sot ÷ - SB2. ÷ eot */ + SB2. ÷ eot + But unstated in Unicode is don't break if the text is empty */ if (before == SB_EDGE || after == SB_EDGE) { - return TRUE; + return before != after; } /* SB 3: Do not break within CRLF. */ @@ -4309,8 +4584,10 @@ S_isSB(pTHX_ SB_enum before, return FALSE; } - /* Break after paragraph separators. (though why CR and LF are considered - * so is beyond me (khw) + /* Break after paragraph separators. CR and LF are considered + * so because Unicode views text as like word processing text where there + * are no newlines except between paragraphs, and the word processor takes + * care of wrapping without there being hard line-breaks in the text *./ SB4. Sep | CR | LF ÷ */ if (before == SB_Sep || before == SB_CR || before == SB_LF) { return TRUE; @@ -4320,11 +4597,31 @@ S_isSB(pTHX_ SB_enum before, * (See Section 6.2, Replacing Ignore Rules.) SB5. X (Extend | Format)* → X */ if (after == SB_Extend || after == SB_Format) { + + /* Implied is that the these characters attach to everything + * immediately prior to them except for those separator-type + * characters. And the rules earlier have already handled the case + * when one of those immediately precedes the extend char */ return FALSE; } if (before == SB_Extend || before == SB_Format) { - before = backup_one_SB(strbeg, &lpos, utf8_target); + U8 * temp_pos = lpos; + const SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); + if ( backup != SB_EDGE + && backup != SB_Sep + && backup != SB_CR + && backup != SB_LF) + { + before = backup; + lpos = temp_pos; + } + + /* Here, both 'before' and 'backup' are these types; implied is that we + * don't break between them */ + if (backup == SB_Extend || backup == SB_Format) { + return FALSE; + } } /* Do not break after ambiguous terminators like period, if they are @@ -4342,97 +4639,107 @@ S_isSB(pTHX_ SB_enum before, /* SB7. (Upper | Lower) ATerm × Upper */ if (before == SB_ATerm && after == SB_Upper) { - temp_pos = lpos; - backup = backup_one_SB(strbeg, &temp_pos, utf8_target); + U8 * temp_pos = lpos; + SB_enum backup = backup_one_SB(strbeg, &temp_pos, utf8_target); if (backup == SB_Upper || backup == SB_Lower) { return FALSE; } } - /* SB8a. (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm) - * SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF ) */ - backup = before; - temp_pos = lpos; - while (backup == SB_Sp) { - backup = backup_one_SB(strbeg, &temp_pos, utf8_target); - } - while (backup == SB_Close) { - backup = backup_one_SB(strbeg, &temp_pos, utf8_target); - } - if ((backup == SB_STerm || backup == SB_ATerm) - && ( after == SB_SContinue - || after == SB_STerm - || after == SB_ATerm - || after == SB_Sp - || after == SB_Sep - || after == SB_CR - || after == SB_LF)) - { - return FALSE; + /* The remaining rules that aren't the final one, all require an STerm or + * an ATerm after having backed up over some Close* Sp*, and in one case an + * optional Paragraph separator, although one rule doesn't have any Sp's in it. + * So do that backup now, setting flags if either Sp or a paragraph + * separator are found */ + + if (before == SB_Sep || before == SB_CR || before == SB_LF) { + has_para_sep = TRUE; + before = backup_one_SB(strbeg, &lpos, utf8_target); } - /* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF | - * STerm | ATerm) )* Lower */ - if (backup == SB_ATerm) { - U8 * rpos = (U8 *) curpos; - SB_enum later = after; - - while ( later != SB_OLetter - && later != SB_Upper - && later != SB_Lower - && later != SB_Sep - && later != SB_CR - && later != SB_LF - && later != SB_STerm - && later != SB_ATerm - && later != SB_EDGE) - { - later = advance_one_SB(&rpos, strend, utf8_target); - } - if (later == SB_Lower) { - return FALSE; + if (before == SB_Sp) { + has_sp = TRUE; + do { + before = backup_one_SB(strbeg, &lpos, utf8_target); } + while (before == SB_Sp); } - /* Break after sentence terminators, but include closing punctuation, - * trailing spaces, and a paragraph separator (if present). [See note - * below.] - * SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF ) */ - backup = before; - temp_pos = lpos; - while (backup == SB_Close) { - backup = backup_one_SB(strbeg, &temp_pos, utf8_target); - } - if ((backup == SB_STerm || backup == SB_ATerm) - && ( after == SB_Close - || after == SB_Sp - || after == SB_Sep - || after == SB_CR - || after == SB_LF)) - { - return FALSE; + while (before == SB_Close) { + before = backup_one_SB(strbeg, &lpos, utf8_target); } + /* The next few rules apply only when the backed-up-to is an ATerm, and in + * most cases an STerm */ + if (before == SB_STerm || before == SB_ATerm) { - /* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷ */ - temp_pos = lpos; - backup = backup_one_SB(strbeg, &temp_pos, utf8_target); - if ( backup == SB_Sep - || backup == SB_CR - || backup == SB_LF) - { - lpos = temp_pos; - } - else { - backup = before; - } - while (backup == SB_Sp) { - backup = backup_one_SB(strbeg, &lpos, utf8_target); - } - while (backup == SB_Close) { - backup = backup_one_SB(strbeg, &lpos, utf8_target); - } - if (backup == SB_STerm || backup == SB_ATerm) { + /* So, here the lhs matches + * (STerm | ATerm) Close* Sp* (Sep | CR | LF)? + * and we have set flags if we found an Sp, or the optional Sep,CR,LF. + * The rules that apply here are: + * + * SB8 ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR + | LF | STerm | ATerm) )* Lower + SB8a (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm) + SB9 (STerm | ATerm) Close* × (Close | Sp | Sep | CR | LF) + SB10 (STerm | ATerm) Close* Sp* × (Sp | Sep | CR | LF) + SB11 (STerm | ATerm) Close* Sp* (Sep | CR | LF)? ÷ + */ + + /* And all but SB11 forbid having seen a paragraph separator */ + if (! has_para_sep) { + if (before == SB_ATerm) { /* SB8 */ + U8 * rpos = (U8 *) curpos; + SB_enum later = after; + + while ( later != SB_OLetter + && later != SB_Upper + && later != SB_Lower + && later != SB_Sep + && later != SB_CR + && later != SB_LF + && later != SB_STerm + && later != SB_ATerm + && later != SB_EDGE) + { + later = advance_one_SB(&rpos, strend, utf8_target); + } + if (later == SB_Lower) { + return FALSE; + } + } + + if ( after == SB_SContinue /* SB8a */ + || after == SB_STerm + || after == SB_ATerm) + { + return FALSE; + } + + if (! has_sp) { /* SB9 applies only if there was no Sp* */ + if ( after == SB_Close + || after == SB_Sp + || after == SB_Sep + || after == SB_CR + || after == SB_LF) + { + return FALSE; + } + } + + /* SB10. This and SB9 could probably be combined some way, but khw + * has decided to follow the Unicode rule book precisely for + * simplified maintenance */ + if ( after == SB_Sp + || after == SB_Sep + || after == SB_CR + || after == SB_LF) + { + return FALSE; + } + } + + /* SB11. */ return TRUE; } @@ -4523,8 +4830,6 @@ S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_tar return sb; } -#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after) - STATIC bool S_isWB(pTHX_ WB_enum previous, WB_enum before, @@ -4535,7 +4840,8 @@ S_isWB(pTHX_ WB_enum previous, const bool utf8_target) { /* Return a boolean as to if the boundary between 'before' and 'after' is - * a Unicode word break, using their published algorithm. Context may be + * a Unicode word break, using their published algorithm, but tailored for + * Perl by treating spans of white space as one unit. Context may be * needed to make this determination. If the value for the character * before 'before' is known, it is passed as 'previous'; otherwise that * should be set to WB_UNKNOWN. The other input parameters give the @@ -4545,160 +4851,151 @@ S_isWB(pTHX_ WB_enum previous, U8 * before_pos = (U8 *) curpos; U8 * after_pos = (U8 *) curpos; + WB_enum prev = before; + WB_enum next; PERL_ARGS_ASSERT_ISWB; - /* WB1 and WB2: Break at the start and end of text. */ - if (before == WB_EDGE || after == WB_EDGE) { - return TRUE; - } + /* Rule numbers in the comments below are as of Unicode 8.0 */ - /* WB 3: Do not break within CRLF. */ - if (before == WB_CR && after == WB_LF) { - return FALSE; - } + redo: + before = prev; + switch (WB_table[before][after]) { + case WB_BREAKABLE: + return TRUE; - /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR - * and LF) */ - if ( before == WB_CR || before == WB_LF || before == WB_Newline - || after == WB_CR || after == WB_LF || after == WB_Newline) - { - return TRUE; - } + case WB_NOBREAK: + return FALSE; - /* Ignore Format and Extend characters, except when they appear at the - * beginning of a region of text. - * WB4. X (Extend | Format)* → X. */ + case WB_hs_then_hs: /* 2 horizontal spaces in a row */ + next = advance_one_WB(&after_pos, strend, utf8_target, + FALSE /* Don't skip Extend nor Format */ ); + /* A space immediately preceeding an Extend or Format is attached + * to by them, and hence gets separated from previous spaces. + * Otherwise don't break between horizontal white space */ + return next == WB_Extend || next == WB_Format; + + /* WB4 Ignore Format and Extend characters, except when they appear at + * the beginning of a region of text. This code currently isn't + * general purpose, but it works as the rules are currently and likely + * to be laid out. The reason it works is that when 'they appear at + * the beginning of a region of text', the rule is to break before + * them, just like any other character. Therefore, the default rule + * applies and we don't have to look in more depth. Should this ever + * change, we would have to have 2 'case' statements, like in the + * rules below, and backup a single character (not spacing over the + * extend ones) and then see if that is one of the region-end + * characters and go from there */ + case WB_Ex_or_FO_then_foo: + prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); + goto redo; + + case WB_DQ_then_HL + WB_BREAKABLE: + case WB_DQ_then_HL + WB_NOBREAK: + + /* WB7c Hebrew_Letter Double_Quote × Hebrew_Letter */ + + if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) + == WB_Hebrew_Letter) + { + return FALSE; + } - if (after == WB_Extend || after == WB_Format) { - return FALSE; - } + return WB_table[before][after] - WB_DQ_then_HL == WB_BREAKABLE; - if (before == WB_Extend || before == WB_Format) { - before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); - } + case WB_HL_then_DQ + WB_BREAKABLE: + case WB_HL_then_DQ + WB_NOBREAK: - switch (WBcase(before, after)) { - /* Otherwise, break everywhere (including around ideographs). - WB14. Any ÷ Any */ - default: - return TRUE; + /* WB7b Hebrew_Letter × Double_Quote Hebrew_Letter */ - /* Do not break between most letters. - WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */ - case WBcase(WB_ALetter, WB_ALetter): - case WBcase(WB_ALetter, WB_Hebrew_Letter): - case WBcase(WB_Hebrew_Letter, WB_ALetter): - case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter): + if (advance_one_WB(&after_pos, strend, utf8_target, + TRUE /* Do skip Extend and Format */ ) + == WB_Hebrew_Letter) + { return FALSE; + } - /* Do not break letters across certain punctuation. - WB6. (ALetter | Hebrew_Letter) - × (MidLetter | MidNumLet | Single_Quote) (ALetter - | Hebrew_Letter) */ - case WBcase(WB_ALetter, WB_MidLetter): - case WBcase(WB_ALetter, WB_MidNumLet): - case WBcase(WB_ALetter, WB_Single_Quote): - case WBcase(WB_Hebrew_Letter, WB_MidLetter): - case WBcase(WB_Hebrew_Letter, WB_MidNumLet): - /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/ - after = advance_one_WB(&after_pos, strend, utf8_target); - return after != WB_ALetter && after != WB_Hebrew_Letter; - - /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet | - * Single_Quote) × (ALetter | Hebrew_Letter) */ - case WBcase(WB_MidLetter, WB_ALetter): - case WBcase(WB_MidLetter, WB_Hebrew_Letter): - case WBcase(WB_MidNumLet, WB_ALetter): - case WBcase(WB_MidNumLet, WB_Hebrew_Letter): - case WBcase(WB_Single_Quote, WB_ALetter): - case WBcase(WB_Single_Quote, WB_Hebrew_Letter): - before - = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); - return before != WB_ALetter && before != WB_Hebrew_Letter; - - /* WB7a. Hebrew_Letter × Single_Quote */ - case WBcase(WB_Hebrew_Letter, WB_Single_Quote): - return FALSE; + return WB_table[before][after] - WB_HL_then_DQ == WB_BREAKABLE; - /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */ - case WBcase(WB_Hebrew_Letter, WB_Double_Quote): - return advance_one_WB(&after_pos, strend, utf8_target) - != WB_Hebrew_Letter; + case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_NOBREAK: + case WB_LE_or_HL_then_MB_or_ML_or_SQ + WB_BREAKABLE: - /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */ - case WBcase(WB_Double_Quote, WB_Hebrew_Letter): - return backup_one_WB(&previous, strbeg, &before_pos, utf8_target) - != WB_Hebrew_Letter; + /* WB6 (ALetter | Hebrew_Letter) × (MidLetter | MidNumLet + * | Single_Quote) (ALetter | Hebrew_Letter) */ - /* Do not break within sequences of digits, or digits adjacent to - * letters (“3a”, or “A3”). - WB8. Numeric × Numeric */ - case WBcase(WB_Numeric, WB_Numeric): - return FALSE; + next = advance_one_WB(&after_pos, strend, utf8_target, + TRUE /* Do skip Extend and Format */ ); - /* WB9. (ALetter | Hebrew_Letter) × Numeric */ - case WBcase(WB_ALetter, WB_Numeric): - case WBcase(WB_Hebrew_Letter, WB_Numeric): + if (next == WB_ALetter || next == WB_Hebrew_Letter) + { return FALSE; + } - /* WB10. Numeric × (ALetter | Hebrew_Letter) */ - case WBcase(WB_Numeric, WB_ALetter): - case WBcase(WB_Numeric, WB_Hebrew_Letter): - return FALSE; + return WB_table[before][after] + - WB_LE_or_HL_then_MB_or_ML_or_SQ == WB_BREAKABLE; - /* Do not break within sequences, such as “3.2” or “3,456.789”. - WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric - */ - case WBcase(WB_MidNum, WB_Numeric): - case WBcase(WB_MidNumLet, WB_Numeric): - case WBcase(WB_Single_Quote, WB_Numeric): - return backup_one_WB(&previous, strbeg, &before_pos, utf8_target) - != WB_Numeric; - - /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric - * */ - case WBcase(WB_Numeric, WB_MidNum): - case WBcase(WB_Numeric, WB_MidNumLet): - case WBcase(WB_Numeric, WB_Single_Quote): - return advance_one_WB(&after_pos, strend, utf8_target) - != WB_Numeric; - - /* Do not break between Katakana. - WB13. Katakana × Katakana */ - case WBcase(WB_Katakana, WB_Katakana): - return FALSE; + case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_NOBREAK: + case WB_MB_or_ML_or_SQ_then_LE_or_HL + WB_BREAKABLE: - /* Do not break from extenders. - WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana | - ExtendNumLet) × ExtendNumLet */ - case WBcase(WB_ALetter, WB_ExtendNumLet): - case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet): - case WBcase(WB_Numeric, WB_ExtendNumLet): - case WBcase(WB_Katakana, WB_ExtendNumLet): - case WBcase(WB_ExtendNumLet, WB_ExtendNumLet): + /* WB7 (ALetter | Hebrew_Letter) (MidLetter | MidNumLet + * | Single_Quote) × (ALetter | Hebrew_Letter) */ + + prev = backup_one_WB(&previous, strbeg, &before_pos, utf8_target); + if (prev == WB_ALetter || prev == WB_Hebrew_Letter) + { return FALSE; + } + + return WB_table[before][after] + - WB_MB_or_ML_or_SQ_then_LE_or_HL == WB_BREAKABLE; + + case WB_MB_or_MN_or_SQ_then_NU + WB_NOBREAK: + case WB_MB_or_MN_or_SQ_then_NU + WB_BREAKABLE: + + /* WB11 Numeric (MidNum | (MidNumLet | Single_Quote)) × Numeric + * */ - /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric - * | Katakana) */ - case WBcase(WB_ExtendNumLet, WB_ALetter): - case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter): - case WBcase(WB_ExtendNumLet, WB_Numeric): - case WBcase(WB_ExtendNumLet, WB_Katakana): + if (backup_one_WB(&previous, strbeg, &before_pos, utf8_target) + == WB_Numeric) + { return FALSE; + } + + return WB_table[before][after] + - WB_MB_or_MN_or_SQ_then_NU == WB_BREAKABLE; - /* Do not break between regional indicator symbols. - WB13c. Regional_Indicator × Regional_Indicator */ - case WBcase(WB_Regional_Indicator, WB_Regional_Indicator): + case WB_NU_then_MB_or_MN_or_SQ + WB_NOBREAK: + case WB_NU_then_MB_or_MN_or_SQ + WB_BREAKABLE: + + /* WB12 Numeric × (MidNum | MidNumLet | Single_Quote) Numeric */ + + if (advance_one_WB(&after_pos, strend, utf8_target, + TRUE /* Do skip Extend and Format */ ) + == WB_Numeric) + { return FALSE; + } + + return WB_table[before][after] + - WB_NU_then_MB_or_MN_or_SQ == WB_BREAKABLE; + default: + break; } - NOT_REACHED; /* NOTREACHED */ +#ifdef DEBUGGING + PerlIO_printf(Perl_error_log, "Unhandled WB pair: WB_table[%d, %d] = %d\n", + before, after, WB_table[before][after]); + assert(0); +#endif + return TRUE; } STATIC WB_enum -S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target) +S_advance_one_WB(pTHX_ U8 ** curpos, + const U8 * const strend, + const bool utf8_target, + const bool skip_Extend_Format) { WB_enum wb; @@ -4717,7 +5014,8 @@ S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta return WB_EDGE; } wb = getWB_VAL_UTF8(*curpos, strend); - } while (wb == WB_Extend || wb == WB_Format); + } while ( skip_Extend_Format + && (wb == WB_Extend || wb == WB_Format)); } else { do { @@ -4726,7 +5024,8 @@ S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_ta return WB_EDGE; } wb = getWB_VAL_CP(**curpos); - } while (wb == WB_Extend || wb == WB_Format); + } while ( skip_Extend_Format + && (wb == WB_Extend || wb == WB_Format)); } return wb; @@ -4743,10 +5042,24 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, * to look it up */ if (*previous != WB_UNKNOWN) { wb = *previous; - *previous = WB_UNKNOWN; - /* XXX Note that doesn't change curpos, and maybe should */ - /* But we always back up over these two types */ + /* But we need to move backwards by one */ + if (utf8_target) { + *curpos = reghopmaybe3(*curpos, -1, strbeg); + if (! *curpos) { + *previous = WB_EDGE; + *curpos = (U8 *) strbeg; + } + else { + *previous = WB_UNKNOWN; + } + } + else { + (*curpos)--; + *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN; + } + + /* And we always back up over these two types */ if (wb != WB_Extend && wb != WB_Format) { return wb; } @@ -4798,6 +5111,7 @@ S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, STATIC SSize_t S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) { + #if PERL_VERSION < 9 && !defined(PERL_CORE) dMY_CXT; #endif @@ -4816,7 +5130,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) SSize_t ln = 0; /* len or last; init to avoid compiler warning */ char *locinput = startpos; char *pushinput; /* where to continue after a PUSH */ - I32 nextchr; /* is always set to UCHARAT(locinput) */ + I32 nextchr; /* is always set to UCHARAT(locinput), or -1 at EOS */ bool result = 0; /* return value of S_regmatch */ int depth = 0; /* depth of backtrack stack */ @@ -4858,7 +5172,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) */ PAD* last_pad = NULL; dMULTICALL; - I32 gimme = G_SCALAR; + U8 gimme = G_SCALAR; CV *caller_cv = NULL; /* who called us */ CV *last_pushed_cv = NULL; /* most recently called (?{}) CV */ CHECKPOINT runops_cp; /* savestack position before executing EVAL */ @@ -4878,11 +5192,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* shut up 'may be used uninitialized' compiler warnings for dMULTICALL */ multicall_oldcatch = 0; - multicall_cv = NULL; - cx = NULL; PERL_UNUSED_VAR(multicall_cop); - PERL_UNUSED_VAR(newsp); - PERL_ARGS_ASSERT_REGMATCH; @@ -5386,7 +5696,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) l++; } else { - if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) + if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s) { sayNO; } @@ -5410,7 +5720,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) s++; } else { - if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) + if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l) { sayNO; } @@ -5526,6 +5836,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case BOUNDL: /* /\b/l */ + { + bool b1, b2; _CHECK_AND_WARN_PROBLEMATIC_LOCALE; if (FLAGS(scan) != TRADITIONAL_BOUND) { @@ -5538,27 +5850,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) if (utf8_target) { if (locinput == reginfo->strbeg) - ln = isWORDCHAR_LC('\n'); + b1 = isWORDCHAR_LC('\n'); else { - ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1, + b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg))); } - n = (NEXTCHR_IS_EOS) + b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') : isWORDCHAR_LC_utf8((U8*)locinput); } else { /* Here the string isn't utf8 */ - ln = (locinput == reginfo->strbeg) + b1 = (locinput == reginfo->strbeg) ? isWORDCHAR_LC('\n') : isWORDCHAR_LC(UCHARAT(locinput - 1)); - n = (NEXTCHR_IS_EOS) + b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_LC('\n') : isWORDCHAR_LC(nextchr); } - if (to_complement ^ (ln == n)) { + if (to_complement ^ (b1 == b2)) { sayNO; } break; + } case NBOUND: /* /\B/ */ to_complement = 1; @@ -5575,6 +5888,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* FALLTHROUGH */ case BOUNDA: /* /\b/a */ + { + bool b1, b2; bound_ascii_match_only: /* Here the string isn't utf8, or is utf8 and only ascii characters @@ -5586,16 +5901,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * 2) it is a multi-byte character, in which case the final byte is * never mistakable for ASCII, and so the test will say it is * not a word character, which is the correct answer. */ - ln = (locinput == reginfo->strbeg) + b1 = (locinput == reginfo->strbeg) ? isWORDCHAR_A('\n') : isWORDCHAR_A(UCHARAT(locinput - 1)); - n = (NEXTCHR_IS_EOS) + b2 = (NEXTCHR_IS_EOS) ? isWORDCHAR_A('\n') : isWORDCHAR_A(nextchr); - if (to_complement ^ (ln == n)) { + if (to_complement ^ (b1 == b2)) { sayNO; } break; + } case NBOUNDU: /* /\B/u */ to_complement = 1; @@ -5604,20 +5920,25 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case BOUNDU: /* /\b/u */ boundu: - if (utf8_target) { - + if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) { + match = FALSE; + } + else if (utf8_target) { bound_utf8: switch((bound_type) FLAGS(scan)) { case TRADITIONAL_BOUND: - ln = (locinput == reginfo->strbeg) + { + bool b1, b2; + b1 = (locinput == reginfo->strbeg) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_utf8(reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg))); - n = (NEXTCHR_IS_EOS) + b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_utf8((U8*)locinput); - match = cBOOL(ln != n); + match = cBOOL(b1 != b2); break; + } case GCB_BOUND: if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { match = TRUE; /* GCB always matches at begin and @@ -5636,6 +5957,28 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case LB_BOUND: + if (locinput == reginfo->strbeg) { + match = FALSE; + } + else if (NEXTCHR_IS_EOS) { + match = TRUE; + } + else { + match = isLB(getLB_VAL_UTF8( + reghop3((U8*)locinput, + -1, + (U8*)(reginfo->strbeg)), + (U8*) reginfo->strend), + getLB_VAL_UTF8((U8*) locinput, + (U8*) reginfo->strend), + (U8*) reginfo->strbeg, + (U8*) locinput, + (U8*) reginfo->strend, + utf8_target); + } + break; + case SB_BOUND: /* Always matches at begin and end */ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { match = TRUE; @@ -5679,14 +6022,17 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) else { /* Not utf8 target */ switch((bound_type) FLAGS(scan)) { case TRADITIONAL_BOUND: - ln = (locinput == reginfo->strbeg) + { + bool b1, b2; + b1 = (locinput == reginfo->strbeg) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_L1(UCHARAT(locinput - 1)); - n = (NEXTCHR_IS_EOS) + b2 = (NEXTCHR_IS_EOS) ? 0 /* isWORDCHAR_L1('\n') */ : isWORDCHAR_L1(nextchr); - match = cBOOL(ln != n); + match = cBOOL(b1 != b2); break; + } case GCB_BOUND: if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { @@ -5700,6 +6046,23 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } break; + case LB_BOUND: + if (locinput == reginfo->strbeg) { + match = FALSE; + } + else if (NEXTCHR_IS_EOS) { + match = TRUE; + } + else { + match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)), + getLB_VAL_CP(UCHARAT(locinput)), + (U8*) reginfo->strbeg, + (U8*) locinput, + (U8*) reginfo->strend, + utf8_target); + } + break; + case SB_BOUND: /* Always matches at begin and end */ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) { match = TRUE; @@ -5739,7 +6102,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOFL: /* /[abc]/l */ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } @@ -5748,7 +6111,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) case ANYOF: /* /[abc]/ */ if (NEXTCHR_IS_EOS) sayNO; - if (utf8_target) { + if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) { if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend, utf8_target)) sayNO; @@ -5783,8 +6146,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan), - (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr, - *(locinput + 1)))))) + EIGHT_BIT_UTF8_TO_NATIVE(nextchr, + *(locinput + 1)))))) { sayNO; } @@ -5820,7 +6183,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } to_complement = 1; - /* FALLTHROUGH */ + goto join_nposixa; case POSIXA: /* \w or [:punct:] etc. under /a */ @@ -5829,9 +6192,14 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) * UTF-8, and also from NPOSIXA even in UTF-8 when the current * character is a single byte */ - if (NEXTCHR_IS_EOS - || ! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, - FLAGS(scan))))) + if (NEXTCHR_IS_EOS) { + sayNO; + } + + join_nposixa: + + if (! (to_complement ^ cBOOL(_generic_isCC_A(nextchr, + FLAGS(scan))))) { sayNO; } @@ -5864,7 +6232,7 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) } else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr, + ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr, *(locinput + 1)), FLAGS(scan))))) { @@ -6218,6 +6586,13 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) U8 flags = (CXp_SUB_RE | ((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0)); if (last_pushed_cv) { + /* PUSH/POP_MULTICALL save and restore the + * caller's PL_comppad; if we call multiple subs + * using the same CX block, we have to save and + * unwind the varying PL_comppad's ourselves, + * especially restoring the right PL_comppad on + * backtrack - so save it on the save stack */ + SAVECOMPPAD(); CHANGE_MULTICALL_FLAGS(newcv, flags); } else { @@ -6229,7 +6604,6 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* these assignments are just to silence compiler * warnings */ multicall_cop = NULL; - newsp = NULL; } last_pad = PL_comppad; @@ -6281,7 +6655,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) /* we don't use MULTICALL here as we want to call the * first op of the block of interest, rather than the - * first op of the sub */ + * first op of the sub. Also, we don't want to free + * the savestack frame */ before = (IV)(SP-PL_stack_base); PL_op = nop; CALLRUNOPS(aTHX); /* Scalar context. */ @@ -6529,7 +6904,9 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog) break; case ACCEPT: /* (*ACCEPT) */ - if (ARG(scan)){ + if (scan->flags) + sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + if (ARG2L(scan)){ regnode *cursor; for (cursor=scan; cursor && OP(cursor)!=END; @@ -7001,8 +7378,9 @@ NULL NOT_REACHED; /* NOTREACHED */ case CUTGROUP: /* /(*THEN)/ */ - sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL : - MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + sv_yes_mark = st->u.mark.mark_name = scan->flags + ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ]) + : NULL; PUSH_STATE_GOTO(CUTGROUP_next, next, locinput); /* NOTREACHED */ NOT_REACHED; /* NOTREACHED */ @@ -7699,7 +8077,7 @@ NULL /* FALLTHROUGH */ case PRUNE: /* (*PRUNE) */ - if (!scan->flags) + if (scan->flags) sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); PUSH_STATE_GOTO(COMMIT_next, next, locinput); /* NOTREACHED */ @@ -7708,9 +8086,21 @@ NULL case COMMIT_next_fail: no_final = 1; /* FALLTHROUGH */ + sayNO; + NOT_REACHED; /* NOTREACHED */ case OPFAIL: /* (*FAIL) */ - sayNO; + if (scan->flags) + sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]); + if (logical) { + /* deal with (?(?!)X|Y) properly, + * make sure we trigger the no branch + * of the trailing IFTHEN structure*/ + sw= 0; + break; + } else { + sayNO; + } /* NOTREACHED */ NOT_REACHED; /* NOTREACHED */ @@ -7754,7 +8144,7 @@ NULL NOT_REACHED; /* NOTREACHED */ case SKIP: /* (*SKIP) */ - if (scan->flags) { + if (!scan->flags) { /* (*SKIP) : if we fail we cut here*/ ST.mark_name = NULL; ST.mark_loc = locinput; @@ -8141,7 +8531,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, /* Target isn't utf8; convert the character in the UTF-8 * pattern to non-UTF8, and do a simple loop */ - c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); + c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1)); while (scan < loceol && UCHARAT(scan) == c) { scan++; } @@ -8259,7 +8649,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, case ANYOFL: _CHECK_AND_WARN_PROBLEMATIC_LOCALE; - if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) { + if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) { Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required); } /* FALLTHROUGH */ @@ -8385,7 +8775,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p, } else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) { if (! (to_complement - ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan, + ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan, *(scan + 1)), classnum)))) { @@ -8606,7 +8996,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const * UTF8_ALLOW_FFFF */ if (c_len == (STRLEN)-1) Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)"); - if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) { + if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) { _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c); } } @@ -8624,7 +9014,7 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const match = TRUE; } else if (flags & ANYOF_LOCALE_FLAGS) { - if ((flags & ANYOF_LOC_FOLD) + if ((flags & ANYOFL_FOLD) && c < 256 && ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) { @@ -8690,11 +9080,27 @@ S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const { match = TRUE; /* Everything above the bitmap matches */ } - else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES) - || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES)) - || ((flags & ANYOF_LOC_FOLD) - && IN_UTF8_CTYPE_LOCALE - && ARG(n) != ANYOF_ONLY_HAS_BITMAP)) + /* Here doesn't match everything above the bitmap. If there is + * some information available beyond the bitmap, we may find a + * match in it. If so, this is most likely because the code point + * is outside the bitmap range. But rarely, it could be because of + * some other reason. If so, various flags are set to indicate + * this possibility. On ANYOFD nodes, there may be matches that + * happen only when the target string is UTF-8; or for other node + * types, because runtime lookup is needed, regardless of the + * UTF-8ness of the target string. Finally, under /il, there may + * be some matches only possible if the locale is a UTF-8 one. */ + else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP + && ( c >= NUM_ANYOF_CODE_POINTS + || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP) + && ( UNLIKELY(OP(n) != ANYOFD) + || (utf8_target && ! isASCII_uni(c) +# if NUM_ANYOF_CODE_POINTS > 256 + && c < 256 +# endif + ))) + || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags) + && IN_UTF8_CTYPE_LOCALE))) { SV* only_utf8_locale = NULL; SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0, @@ -8761,6 +9167,9 @@ S_reghop3(U8 *s, SSize_t off, const U8* lim) if (UTF8_IS_CONTINUED(*s)) { while (s > lim && UTF8_IS_CONTINUATION(*s)) s--; + if (! UTF8_IS_START(*s)) { + Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); + } } /* XXX could check well-formedness here */ } @@ -8785,6 +9194,9 @@ S_reghop4(U8 *s, SSize_t off, const U8* llim, const U8* rlim) if (UTF8_IS_CONTINUED(*s)) { while (s > llim && UTF8_IS_CONTINUATION(*s)) s--; + if (! UTF8_IS_START(*s)) { + Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); + } } /* XXX could check well-formedness here */ } @@ -8814,6 +9226,9 @@ S_reghopmaybe3(U8* s, SSize_t off, const U8* lim) if (UTF8_IS_CONTINUED(*s)) { while (s > lim && UTF8_IS_CONTINUATION(*s)) s--; + if (! UTF8_IS_START(*s)) { + Perl_croak_nocontext("Malformed UTF-8 character (fatal)"); + } } /* XXX could check well-formedness here */ }