+ else { /* Here nothing above Latin1 can fold to the pattern
+ character */
+ switch (OP(text_node)) {
+
+ case EXACTFL: /* /l rules */
+ c2 = PL_fold_locale[c1];
+ break;
+
+ case EXACTF: /* This node only generated for non-utf8
+ patterns */
+ assert(! is_utf8_pat);
+ if (! utf8_target) { /* /d rules */
+ c2 = PL_fold[c1];
+ break;
+ }
+ /* FALLTHROUGH */
+ /* /u rules for all these. This happens to work for
+ * EXACTFA as nothing in Latin1 folds to ASCII */
+ case EXACTFA_NO_TRIE: /* This node only generated for
+ non-utf8 patterns */
+ assert(! is_utf8_pat);
+ /* FALLTHROUGH */
+ case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU:
+ c2 = PL_fold_latin1[c1];
+ break;
+
+ default:
+ Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
+ NOT_REACHED; /* NOTREACHED */
+ }
+ }
+ }
+ }
+
+ /* Here have figured things out. Set up the returns */
+ if (use_chrtest_void) {
+ *c2p = *c1p = CHRTEST_VOID;
+ }
+ else if (utf8_target) {
+ if (! utf8_has_been_setup) { /* Don't have the utf8; must get it */
+ uvchr_to_utf8(c1_utf8, c1);
+ uvchr_to_utf8(c2_utf8, c2);
+ }
+
+ /* Invariants are stored in both the utf8 and byte outputs; Use
+ * negative numbers otherwise for the byte ones. Make sure that the
+ * byte ones are the same iff the utf8 ones are the same */
+ *c1p = (UTF8_IS_INVARIANT(*c1_utf8)) ? *c1_utf8 : CHRTEST_NOT_A_CP_1;
+ *c2p = (UTF8_IS_INVARIANT(*c2_utf8))
+ ? *c2_utf8
+ : (c1 == c2)
+ ? CHRTEST_NOT_A_CP_1
+ : CHRTEST_NOT_A_CP_2;
+ }
+ else if (c1 > 255) {
+ if (c2 > 255) { /* both possibilities are above what a non-utf8 string
+ can represent */
+ return FALSE;
+ }
+
+ *c1p = *c2p = c2; /* c2 is the only representable value */
+ }
+ else { /* c1 is representable; see about c2 */
+ *c1p = c1;
+ *c2p = (c2 < 256) ? c2 : c1;
+ }
+
+ return TRUE;
+}
+
+ PERL_STATIC_INLINE bool
+ S_isGCB(const GCB_enum before, const GCB_enum after)
+ {
+     /* Is there a Grapheme Cluster Boundary between a character of class
+      * 'before' and an immediately following character of class 'after'?
+      * No further context is consulted: the answer is read directly from
+      * GCB_table, indexed first by 'before' and then by 'after'.
+      * See http://www.unicode.org/reports/tr29/ */
+
+     const bool is_boundary = GCB_table[before][after];
+
+     return is_boundary;
+ }
+
+ /* Combining marks attach to most classes that precede them, but this defines
+  * the exceptions (from TR14).  The macro is true when a combining mark does
+  * attach to a preceding character of class 'prev', and false for the listed
+  * exception classes.  'prev' is parenthesized so that an expression argument
+  * groups correctly; it is evaluated several times, so it must be free of side
+  * effects. */
+ #define LB_CM_ATTACHES_TO(prev) ( ! (   (prev) == LB_EDGE                 \
+                                      || (prev) == LB_Mandatory_Break      \
+                                      || (prev) == LB_Carriage_Return      \
+                                      || (prev) == LB_Line_Feed            \
+                                      || (prev) == LB_Next_Line            \
+                                      || (prev) == LB_Space                \
+                                      || (prev) == LB_ZWSpace))
+
+ /* Decide whether a line break is allowed between two adjacent characters.
+  *
+  *  before, after   the Line_Break classes of the characters on either side of
+  *                  the candidate boundary
+  *  strbeg, strend  limits passed to the helpers that scan backwards and
+  *                  forwards for context
+  *  curpos          starting point for those scans (presumably the position of
+  *                  the character whose class is 'after'; confirm against the
+  *                  callers)
+  *  utf8_target     passed through to the scanning helpers to select UTF-8 or
+  *                  single-byte stepping
+  *
+  * Returns TRUE if the boundary is breakable, FALSE if not. */
+ STATIC bool
+ S_isLB(pTHX_ LB_enum before,
+              LB_enum after,
+              const U8 * const strbeg,
+              const U8 * const curpos,
+              const U8 * const strend,
+              const bool utf8_target)
+ {
+     U8 * temp_pos = (U8 *) curpos;
+     LB_enum prev = before;
+
+     /* Is the boundary between 'before' and 'after' line-breakable?
+      * Most of this is just a table lookup of a generated table from Unicode
+      * rules.  But some rules require context to decide, and so have to be
+      * implemented in code */
+
+     PERL_ARGS_ASSERT_ISLB;
+
+     /* Rule numbers in the comments below are as of Unicode 8.0 */
+
+   redo:
+
+     /* On re-entry from the combining mark case, 'prev' holds the class the
+      * mark is to be treated as; use it as the effective 'before' */
+     before = prev;
+     switch (LB_table[before][after]) {
+         case LB_BREAKABLE:
+             return TRUE;
+
+         case LB_NOBREAK:
+         case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+             return FALSE;
+
+         case LB_SP_foo + LB_BREAKABLE:
+         case LB_SP_foo + LB_NOBREAK:
+         case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
+
+             /* When we have something following a SP, we have to look at the
+              * context in order to know what to do.
+              *
+              * SP SP should not reach here because LB7: Do not break before
+              * spaces.  (For two spaces in a row there is nothing that
+              * overrides that) */
+             assert(after != LB_Space);
+
+             /* Here we have a space followed by a non-space.  Mostly this is a
+              * case of LB18: "Break after spaces".  But there are complications
+              * as the handling of spaces is somewhat tricky.  They are in a
+              * number of rules, which have to be applied in priority order, but
+              * something earlier in the string can cause a rule to be skipped
+              * and a lower priority rule invoked.  A prime example is LB7 which
+              * says don't break before a space.  But rule LB8 (lower priority)
+              * says that the first break opportunity after a ZW is after any
+              * span of spaces immediately after it.  If a ZW comes before a SP
+              * in the input, rule LB8 applies, and not LB7.  Other such rules
+              * involve combining marks which are rules 9 and 10, but they may
+              * override higher priority rules if they come earlier in the
+              * string.  Since we're doing random access into the middle of the
+              * string, we have to look for rules that should get applied based
+              * on both string position and priority.  Combining marks do not
+              * attach to either ZW nor SP, so we don't have to consider them
+              * until later.
+              *
+              * To check for LB8, we have to find the first non-space character
+              * before this span of spaces */
+             do {
+                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+             }
+             while (prev == LB_Space);
+
+             /* LB8 Break before any character following a zero-width space,
+              * even if one or more spaces intervene.
+              *      ZW SP* ÷
+              * So if we have a ZW just before this span, and to get here this
+              * is the final space in the span. */
+             if (prev == LB_ZWSpace) {
+                 return TRUE;
+             }
+
+             /* Here, not ZW SP+.  There are several rules that have higher
+              * priority than LB18 and can be resolved now, as they don't depend
+              * on anything earlier in the string (except ZW, which we have
+              * already handled).  One of these rules is LB11 Do not break
+              * before Word joiner, but we have specially encoded that in the
+              * lookup table so it is caught by the single test below which
+              * catches the other ones. */
+             if (LB_table[LB_Space][after] - LB_SP_foo
+                                             == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
+             {
+                 return FALSE;
+             }
+
+             /* If we get here, we have to consider combining marks. */
+             if (prev == LB_Combining_Mark) {
+
+                 /* What happens with these depends on the character they
+                  * follow.  */
+                 do {
+                     prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+                 }
+                 while (prev == LB_Combining_Mark);
+
+                 /* Most times these attach to and inherit the characteristics
+                  * of that character, but not always, and when not, they are to
+                  * be treated as AL by rule LB10. */
+                 if (! LB_CM_ATTACHES_TO(prev)) {
+                     prev = LB_Alphabetic;
+                 }
+             }
+
+             /* Here, we have the character preceding the span of spaces all set
+              * up.  We follow LB18: "Break after spaces" unless the table shows
+              * that is overridden */
+             return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
+
+         case LB_CM_foo:
+
+             /* We don't know how to treat the CM except by looking at the first
+              * non-CM character preceding it */
+             do {
+                 prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+             }
+             while (prev == LB_Combining_Mark);
+
+             /* Here, 'prev' is that first earlier non-CM character.  If the CM
+              * attaches to it, then it inherits the behavior of 'prev'.  If it
+              * doesn't attach, it is to be treated as an AL */
+             if (! LB_CM_ATTACHES_TO(prev)) {
+                 prev = LB_Alphabetic;
+             }
+
+             goto redo;
+
+         case LB_HY_or_BA_then_foo + LB_BREAKABLE:
+         case LB_HY_or_BA_then_foo + LB_NOBREAK:
+
+             /* LB21a Don't break after Hebrew + Hyphen.
+              * HL (HY | BA) × */
+
+             if (backup_one_LB(strbeg, &temp_pos, utf8_target)
+                                                           == LB_Hebrew_Letter)
+             {
+                 return FALSE;
+             }
+
+             /* Otherwise use the fallback answer encoded in the table entry */
+             return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
+
+         case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
+         case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
+
+             /* LB25a (PR | PO) × ( OP | HY )? NU */
+             if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
+                 return FALSE;
+             }
+
+             /* Otherwise use the fallback answer encoded in the table entry */
+             return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
+                                                                 == LB_BREAKABLE;
+
+         case LB_SY_or_IS_then_various + LB_BREAKABLE:
+         case LB_SY_or_IS_then_various + LB_NOBREAK:
+             {
+                 /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
+
+                 LB_enum temp = prev;
+                 do {
+                     temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+                 }
+                 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
+                 if (temp == LB_Numeric) {
+                     return FALSE;
+                 }
+
+                 /* Otherwise use the fallback answer encoded in the table
+                  * entry */
+                 return LB_table[prev][after] - LB_SY_or_IS_then_various
+                                                                 == LB_BREAKABLE;
+             }
+
+         case LB_various_then_PO_or_PR + LB_BREAKABLE:
+         case LB_various_then_PO_or_PR + LB_NOBREAK:
+             {
+                 /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
+
+                 LB_enum temp = prev;
+                 if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
+                 {
+                     temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+                 }
+                 while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
+                     temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+                 }
+                 if (temp == LB_Numeric) {
+                     return FALSE;
+                 }
+
+                 /* Otherwise use the fallback answer encoded in the table
+                  * entry, as the other context-dependent cases do.  (Returning
+                  * the bare offset constant would always yield TRUE and so
+                  * ignore a LB_NOBREAK fallback.) */
+                 return LB_table[prev][after] - LB_various_then_PO_or_PR
+                                                                 == LB_BREAKABLE;
+             }
+
+         default:
+             break;
+     }
+
+     /* Only reached for a table value that none of the cases above handles */
+ #ifdef DEBUGGING
+     PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n",
+                                   before, after, LB_table[before][after]);
+     assert(0);
+ #endif
+     return TRUE;
+ }
+
+ STATIC LB_enum
+ S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+ {
+     /* Step *curpos forward over one character and return the Line_Break
+      * class of the character it then points at.  LB_EDGE is returned when
+      * *curpos is already at or beyond 'strend' on entry (in which case it is
+      * left unchanged), or when the step reaches 'strend'. */
+
+     PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
+
+     /* Nothing to step over */
+     if (*curpos >= strend) {
+         return LB_EDGE;
+     }
+
+     /* Move past the current character: its full UTF-8 length, or one byte */
+     if (utf8_target) {
+         *curpos += UTF8SKIP(*curpos);
+     }
+     else {
+         (*curpos)++;
+     }
+
+     /* Stepped off the end: there is no following character to classify */
+     if (*curpos >= strend) {
+         return LB_EDGE;
+     }
+
+     if (utf8_target) {
+         return getLB_VAL_UTF8(*curpos, strend);
+     }
+
+     return getLB_VAL_CP(**curpos);
+ }
+
+ /* Move *curpos back by one character and return the Line_Break class of the
+  * character that then immediately precedes *curpos.
+  *
+  *  strbeg       lower limit of the string; nothing before it is examined
+  *  curpos       in/out position; on success it is moved back one character
+  *  utf8_target  selects UTF-8 hopping (via reghopmaybe3) or single-byte
+  *               stepping
+  *
+  * Returns LB_EDGE when there are not two characters available before the
+  * incoming position.  In that case *curpos is set to 'strbeg', except on the
+  * two earliest exits (incoming position below 'strbeg', or reghopmaybe3
+  * yielding NULL for the first hop), which leave *curpos untouched. */
+ STATIC LB_enum
+ S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+ {
+     LB_enum lb;
+
+     PERL_ARGS_ASSERT_BACKUP_ONE_LB;
+
+     if (*curpos < strbeg) {
+         return LB_EDGE;
+     }
+
+     if (utf8_target) {
+         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+         U8 * prev_prev_char_pos;
+
+         if (! prev_char_pos) {
+             return LB_EDGE;
+         }
+
+         /* The value wanted is that of the character before the one being
+          * stepped over, so a second hop back is needed to find its start */
+         prev_prev_char_pos = reghopmaybe3(prev_char_pos, -1, strbeg);
+         if (! prev_prev_char_pos) {
+             *curpos = (U8 *) strbeg;
+             return LB_EDGE;
+         }
+
+         lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+         *curpos = prev_char_pos;
+     }
+     else {
+
+         /* Need two bytes before the current position.  Compare distances
+          * rather than computing '*curpos - 2', which could form a pointer
+          * before the start of the string (undefined behavior in C). */
+         if (*curpos - strbeg < 2) {
+             *curpos = (U8 *) strbeg;
+             return LB_EDGE;
+         }
+         (*curpos)--;
+         lb = getLB_VAL_CP(*(*curpos - 1));
+     }
+
+     return lb;
+ }
+
+ /* This creates a single number by combining two, with 'before' being like the
+  * 10's digit, but this isn't necessarily base 10; it is base however many
+  * elements of the enum there are.  Both arguments are parenthesized so that
+  * expression arguments (e.g. 'a + b') combine as intended. */
+ #define SBcase(before, after) ((SB_ENUM_COUNT * (before)) + (after))
+
+ /* Decide whether there is a sentence boundary between two adjacent
+  * characters.
+  *
+  *  before, after   the Sentence_Break classes of the characters on either
+  *                  side of the candidate boundary
+  *  strbeg, strend  limits handed to backup_one_SB() and advance_one_SB() when
+  *                  more context has to be examined
+  *  curpos          starting point for both the backward scans (via 'lpos')
+  *                  and the forward scan (via 'rpos')
+  *  utf8_target     passed through to the scanning helpers
+  *
+  * Returns TRUE if the boundary is a sentence break, FALSE otherwise.  The
+  * rules are tried in the order below, and the first one that applies decides
+  * the result. */
+ STATIC bool
+ S_isSB(pTHX_ SB_enum before,
+ SB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+ {
+ /* returns a boolean indicating if there is a Sentence Boundary Break
+ * between the inputs. See http://www.unicode.org/reports/tr29/ */
+
+ /* 'lpos' is the cursor for looking leftwards; 'temp_pos' is a scratch copy
+ * so that a tentative scan does not disturb 'lpos'; 'backup' holds the class
+ * most recently found by a leftwards scan */
+ U8 * lpos = (U8 *) curpos;
+ U8 * temp_pos;
+ SB_enum backup;
+
+ PERL_ARGS_ASSERT_ISSB;
+
+ /* Break at the start and end of text.
+ SB1. sot ÷
+ SB2. ÷ eot */
+ if (before == SB_EDGE || after == SB_EDGE) {
+ return TRUE;
+ }
+
+ /* SB 3: Do not break within CRLF. */
+ if (before == SB_CR && after == SB_LF) {
+ return FALSE;
+ }
+
+ /* Break after paragraph separators. (though why CR and LF are considered
+ * so is beyond me (khw))
+ SB4. Sep | CR | LF ÷ */
+ if (before == SB_Sep || before == SB_CR || before == SB_LF) {
+ return TRUE;
+ }
+
+ /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
+ * (See Section 6.2, Replacing Ignore Rules.)
+ SB5. X (Extend | Format)* → X */
+ if (after == SB_Extend || after == SB_Format) {
+ return FALSE;
+ }
+
+ /* If 'before' is itself Extend or Format, replace it with the class of the
+ * nearest earlier character that is neither (backup_one_SB() skips over
+ * those), moving 'lpos' to match */
+ if (before == SB_Extend || before == SB_Format) {
+ before = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+
+ /* Do not break after ambiguous terminators like period, if they are
+ * immediately followed by a number or lowercase letter, if they are
+ * between uppercase letters, if the first following letter (optionally
+ * after certain punctuation) is lowercase, or if they are followed by
+ * "continuation" punctuation such as comma, colon, or semicolon. For
+ * example, a period may be an abbreviation or numeric period, and thus may
+ * not mark the end of a sentence.
+
+ * SB6. ATerm × Numeric */
+ if (before == SB_ATerm && after == SB_Numeric) {
+ return FALSE;
+ }
+
+ /* SB7. (Upper | Lower) ATerm × Upper */
+ if (before == SB_ATerm && after == SB_Upper) {
+ /* Peek at what precedes the ATerm, using a copy so 'lpos' stays put */
+ temp_pos = lpos;
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if (backup == SB_Upper || backup == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ /* SB8a. (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
+ * SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF ) */
+ /* Scan leftwards over any Sp*, then any Close*, to see what they follow */
+ backup = before;
+ temp_pos = lpos;
+ while (backup == SB_Sp) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_SContinue
+ || after == SB_STerm
+ || after == SB_ATerm
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
+ {
+ return FALSE;
+ }
+
+ /* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
+ * STerm | ATerm) )* Lower */
+ /* 'backup' still holds the result of the Close* Sp* scan just above */
+ if (backup == SB_ATerm) {
+ U8 * rpos = (U8 *) curpos;
+ SB_enum later = after;
+
+ /* Scan rightwards, starting with 'after', until reaching one of the
+ * classes that ends the optional run, or the end of the string */
+ while ( later != SB_OLetter
+ && later != SB_Upper
+ && later != SB_Lower
+ && later != SB_Sep
+ && later != SB_CR
+ && later != SB_LF
+ && later != SB_STerm
+ && later != SB_ATerm
+ && later != SB_EDGE)
+ {
+ later = advance_one_SB(&rpos, strend, utf8_target);
+ }
+ if (later == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ /* Break after sentence terminators, but include closing punctuation,
+ * trailing spaces, and a paragraph separator (if present). [See note
+ * below.]
+ * SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF ) */
+ /* Restart the leftwards scan from 'before'; this time only Close* may
+ * intervene */
+ backup = before;
+ temp_pos = lpos;
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_Close
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
+ {
+ return FALSE;
+ }
+
+
+ /* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷ */
+ /* Look one position further left.  If that yields Sep, CR, or LF, keep the
+ * moved cursor and continue from there; otherwise continue from 'before'
+ * with 'lpos' unmoved */
+ temp_pos = lpos;
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if ( backup == SB_Sep
+ || backup == SB_CR
+ || backup == SB_LF)
+ {
+ lpos = temp_pos;
+ }
+ else {
+ backup = before;
+ }
+ while (backup == SB_Sp) {
+ backup = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+ if (backup == SB_STerm || backup == SB_ATerm) {
+ return TRUE;
+ }
+
+ /* Otherwise, do not break.
+ SB12. Any × Any */
+
+ return FALSE;
+ }
+
+ STATIC SB_enum
+ S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+ {
+     /* Step *curpos forward, one character at a time, until it points at a
+      * character whose Sentence_Break class is neither Extend nor Format, and
+      * return that class.  At least one character is always stepped over.
+      * SB_EDGE is returned when *curpos is already at or beyond 'strend' on
+      * entry, or when stepping reaches 'strend'. */
+
+     SB_enum sb;
+
+     PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
+
+     if (*curpos >= strend) {
+         return SB_EDGE;
+     }
+
+     for (;;) {
+
+         /* Move past the current character: its full UTF-8 length, or one
+          * byte */
+         if (utf8_target) {
+             *curpos += UTF8SKIP(*curpos);
+         }
+         else {
+             (*curpos)++;
+         }
+
+         if (*curpos >= strend) {
+             return SB_EDGE;
+         }
+
+         if (utf8_target) {
+             sb = getSB_VAL_UTF8(*curpos, strend);
+         }
+         else {
+             sb = getSB_VAL_CP(**curpos);
+         }
+
+         /* Extend and Format are skipped; anything else is the answer */
+         if (sb != SB_Extend && sb != SB_Format) {
+             return sb;
+         }
+     }
+ }
+
+ /* Move *curpos backwards and return the Sentence_Break class of the nearest
+  * earlier character that is neither Extend nor Format.  On success *curpos is
+  * left just to the right of the character whose class is returned.
+  *
+  *  strbeg       lower limit of the string; nothing before it is examined
+  *  curpos       in/out position
+  *  utf8_target  selects UTF-8 hopping (via reghopmaybe3) or single-byte
+  *               stepping
+  *
+  * Returns SB_EDGE when the start of the string is reached first.  In that case
+  * *curpos is set to 'strbeg', except on the two earliest exits (incoming
+  * position below 'strbeg', or reghopmaybe3 yielding NULL for the first hop),
+  * which leave *curpos untouched. */
+ STATIC SB_enum
+ S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+ {
+     SB_enum sb;
+
+     PERL_ARGS_ASSERT_BACKUP_ONE_SB;
+
+     if (*curpos < strbeg) {
+         return SB_EDGE;
+     }
+
+     if (utf8_target) {
+         U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+         if (! prev_char_pos) {
+             return SB_EDGE;
+         }
+
+         /* Back up over Extend and Format.  curpos is always just to the right
+          * of the character whose value we are getting */
+         do {
+             U8 * const prev_prev_char_pos
+                                 = reghopmaybe3(prev_char_pos, -1, strbeg);
+
+             if (! prev_prev_char_pos) {
+                 *curpos = (U8 *) strbeg;
+                 return SB_EDGE;
+             }
+
+             sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+             *curpos = prev_char_pos;
+             prev_char_pos = prev_prev_char_pos;
+         } while (sb == SB_Extend || sb == SB_Format);
+     }
+     else {
+         do {
+
+             /* Need two bytes before the current position.  Compare distances
+              * rather than computing '*curpos - 2', which could form a
+              * pointer before the start of the string (undefined behavior in
+              * C). */
+             if (*curpos - strbeg < 2) {
+                 *curpos = (U8 *) strbeg;
+                 return SB_EDGE;
+             }
+             (*curpos)--;
+             sb = getSB_VAL_CP(*(*curpos - 1));
+         } while (sb == SB_Extend || sb == SB_Format);
+     }
+
+     return sb;
+ }
+
+ /* Combine two WB_enum values into a single number usable as a 'case' label:
+  * 'before' is scaled by the number of enum elements and 'after' is added.
+  * Both arguments are parenthesized so that expression arguments combine as
+  * intended. */
+ #define WBcase(before, after) ((WB_ENUM_COUNT * (before)) + (after))
+
+ /* Decide whether there is a word boundary between two adjacent characters.
+  *
+  *  previous        Word_Break class of the character before 'before' if the
+  *                  caller knows it, else WB_UNKNOWN; it is handed (by
+  *                  address) to backup_one_WB()
+  *  before, after   Word_Break classes of the characters on either side of the
+  *                  candidate boundary
+  *  strbeg, strend  limits handed to backup_one_WB() and advance_one_WB() when
+  *                  more context has to be examined
+  *  curpos          where the character whose class is 'after' begins
+  *  utf8_target     passed through to the scanning helpers
+  *
+  * Returns TRUE if the boundary is a word break, FALSE otherwise. */
+ STATIC bool
+ S_isWB(pTHX_ WB_enum previous,
+ WB_enum before,
+ WB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+ {
+ /* Return a boolean as to if the boundary between 'before' and 'after' is
+ * a Unicode word break, using their published algorithm, but tailored for
+ * Perl by treating spans of white space as one unit. Context may be
+ * needed to make this determination. If the value for the character
+ * before 'before' is known, it is passed as 'previous'; otherwise that
+ * should be set to WB_UNKNOWN. The other input parameters give the
+ * boundaries and current position in the matching of the string. That
+ * is, 'curpos' marks the position where the character whose wb value is
+ * 'after' begins. See http://www.unicode.org/reports/tr29/ */
+
+ /* Separate cursors for looking leftwards and rightwards from the boundary,
+ * so that a scan in one direction does not disturb the other */
+ U8 * before_pos = (U8 *) curpos;
+ U8 * after_pos = (U8 *) curpos;
+
+ PERL_ARGS_ASSERT_ISWB;
+
+ /* WB1 and WB2: Break at the start and end of text. */
+ if (before == WB_EDGE || after == WB_EDGE) {
+ return TRUE;
+ }
+
+ /* WB 3 is: "Do not break within CRLF." Perl extends this so that all
+ * white space sequences ending in a vertical space are treated as one
+ * unit. */
+
+ if (after == WB_CR || after == WB_LF || after == WB_Newline) {
+ if (before == WB_CR || before == WB_LF || before == WB_Newline
+ || before == WB_Perl_Tailored_HSpace)
+ {
+ return FALSE;
+ }
+
+ /* WB 3a: Otherwise break before Newlines (including CR and LF) */
+ return TRUE;
+ }
+
+ /* Here, we know that 'after' is not a vertical space character, but
+ * 'before' could be. WB 3b is: "Otherwise break after Newlines (including
+ * CR and LF)." Perl changes that to not break-up spans of white space,
+ * except when horizontal space is followed by an Extend or Format
+ * character. These apply just to the final white space character in the
+ * span, so it is broken away from the rest. (If the Extend or Format
+ * character follows a vertical space character, it is treated as beginning
+ * a line, and doesn't modify the preceding character.) */
+ if ( before == WB_CR || before == WB_LF || before == WB_Newline
+ || before == WB_Perl_Tailored_HSpace)
+ {
+ if (after == WB_Perl_Tailored_HSpace) {
+ /* White space followed by horizontal space: break here only if the
+ * character after 'after' is an Extend or Format */
+ U8 * temp_pos = (U8 *) curpos;
+ const WB_enum next
+ = advance_one_WB(&temp_pos, strend, utf8_target,
+ FALSE /* Don't skip Extend nor Format */ );
+ return next == WB_Extend || next == WB_Format;
+ }
+ else if (before != WB_Perl_Tailored_HSpace) {
+
+ /* Here, 'before' must be one of the vertical space characters, and
+ * after is not any type of white-space. Follow WB 3b. */
+ return TRUE;
+ }
+
+ /* Here, 'before' is horizontal space, and 'after' is not any kind of
+ * space. Normal rules apply */
+ }
+
+ /* Ignore Format and Extend characters, except when they appear at the
+ * beginning of a region of text.
+ * WB4. X (Extend | Format)* → X. */
+
+ if (after == WB_Extend || after == WB_Format) {
+ return FALSE;
+ }
+
+ /* If 'before' is itself Extend or Format, replace it with the class that
+ * backup_one_WB() finds by looking further left, moving 'before_pos' to
+ * match */
+ if (before == WB_Extend || before == WB_Format) {
+ before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ }
+
+ /* Dispatch on the (before, after) pair.  Pairs not listed fall to
+ * 'default', which breaks. */
+ switch (WBcase(before, after)) {
+ /* Otherwise, break everywhere (including around ideographs).
+ WB14. Any ÷ Any */
+ default:
+ return TRUE;
+
+ /* Do not break between most letters.
+ WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
+ case WBcase(WB_ALetter, WB_ALetter):
+ case WBcase(WB_ALetter, WB_Hebrew_Letter):
+ case WBcase(WB_Hebrew_Letter, WB_ALetter):
+ case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
+ return FALSE;
+
+ /* Do not break letters across certain punctuation.
+ WB6. (ALetter | Hebrew_Letter)
+ × (MidLetter | MidNumLet | Single_Quote) (ALetter
+ | Hebrew_Letter) */
+ case WBcase(WB_ALetter, WB_MidLetter):
+ case WBcase(WB_ALetter, WB_MidNumLet):
+ case WBcase(WB_ALetter, WB_Single_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_MidLetter):
+ case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
+ /* (Hebrew_Letter, Single_Quote) is deliberately not listed here: the
+ * WB7a case further down handles that pair and never breaks */
+ /* No break only if the character following the punctuation is a
+ * letter */
+ after = advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ );
+ return after != WB_ALetter && after != WB_Hebrew_Letter;
+
+ /* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
+ * Single_Quote) × (ALetter | Hebrew_Letter) */
+ case WBcase(WB_MidLetter, WB_ALetter):
+ case WBcase(WB_MidLetter, WB_Hebrew_Letter):
+ case WBcase(WB_MidNumLet, WB_ALetter):
+ case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_Single_Quote, WB_ALetter):
+ case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
+ /* No break only if the character preceding the punctuation is a
+ * letter */
+ before
+ = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
+ return before != WB_ALetter && before != WB_Hebrew_Letter;
+
+ /* WB7a. Hebrew_Letter × Single_Quote */
+ case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
+ return FALSE;
+
+ /* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
+ case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
+ return advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ != WB_Hebrew_Letter;
+
+ /* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
+ case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
+ return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ != WB_Hebrew_Letter;
+
+ /* Do not break within sequences of digits, or digits adjacent to
+ * letters (“3a”, or “A3”).
+ WB8. Numeric × Numeric */
+ case WBcase(WB_Numeric, WB_Numeric):
+ return FALSE;
+
+ /* WB9. (ALetter | Hebrew_Letter) × Numeric */
+ case WBcase(WB_ALetter, WB_Numeric):
+ case WBcase(WB_Hebrew_Letter, WB_Numeric):
+ return FALSE;
+
+ /* WB10. Numeric × (ALetter | Hebrew_Letter) */
+ case WBcase(WB_Numeric, WB_ALetter):
+ case WBcase(WB_Numeric, WB_Hebrew_Letter):
+ return FALSE;
+
+ /* Do not break within sequences, such as “3.2” or “3,456.789”.
+ WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
+ */
+ case WBcase(WB_MidNum, WB_Numeric):
+ case WBcase(WB_MidNumLet, WB_Numeric):
+ case WBcase(WB_Single_Quote, WB_Numeric):
+ return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
+ != WB_Numeric;
+
+ /* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
+ * */
+ case WBcase(WB_Numeric, WB_MidNum):
+ case WBcase(WB_Numeric, WB_MidNumLet):
+ case WBcase(WB_Numeric, WB_Single_Quote):
+ return advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ != WB_Numeric;
+
+ /* Do not break between Katakana.
+ WB13. Katakana × Katakana */
+ case WBcase(WB_Katakana, WB_Katakana):
+ return FALSE;
+
+ /* Do not break from extenders.
+ WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
+ ExtendNumLet) × ExtendNumLet */
+ case WBcase(WB_ALetter, WB_ExtendNumLet):
+ case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
+ case WBcase(WB_Numeric, WB_ExtendNumLet):
+ case WBcase(WB_Katakana, WB_ExtendNumLet):
+ case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
+ return FALSE;
+
+ /* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
+ * | Katakana) */
+ case WBcase(WB_ExtendNumLet, WB_ALetter):
+ case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_ExtendNumLet, WB_Numeric):
+ case WBcase(WB_ExtendNumLet, WB_Katakana):
+ return FALSE;
+
+ /* Do not break between regional indicator symbols.
+ WB13c. Regional_Indicator × Regional_Indicator */
+ case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
+ return FALSE;
+
+ }
+
+ /* Every path through the switch above returns */
+ NOT_REACHED; /* NOTREACHED */
+ }
+
+STATIC WB_enum
+S_advance_one_WB(pTHX_ U8 ** curpos,
+ const U8 * const strend,
+ const bool utf8_target,
+ const bool skip_Extend_Format)
+{
+ WB_enum wb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
+
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+
+ if (utf8_target) {
+
+ /* Advance over Extend and Format */
+ do {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+ wb = getWB_VAL_UTF8(*curpos, strend);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
+ }
+ else {
+ do {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return WB_EDGE;
+ }
+ wb = getWB_VAL_CP(**curpos);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
+ }