#define getGCB_VAL_UTF8(pos, strend) \
_generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+/* Returns the LB value for the input code point */
+#define getLB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_LB_invlist, \
+ _Perl_LB_invmap, \
+ (cp))
+
+/* Returns the LB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getLB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
+
/* Returns the SB value for the input code point */
#define getSB_VAL_CP(cp) \
}
break;
+ case LB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ LB_enum before = getLB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ LB_enum after = getLB_VAL_CP((U8) *s);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
+ }
+ }
+
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+
+ break;
+
case SB_BOUND:
if (s == reginfo->strbeg) {
if (reginfo->intuit || regtry(reginfo, &s)) {
return TRUE;
}
-/* This creates a single number by combining two, with 'before' being like the
- * 10's digit, but this isn't necessarily base 10; it is base however many
- * elements of the enum there are */
-#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
-
-STATIC bool
+PERL_STATIC_INLINE bool
S_isGCB(const GCB_enum before, const GCB_enum after)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
* between the inputs. See http://www.unicode.org/reports/tr29/ */
- switch (GCBcase(before, after)) {
+ return GCB_table[before][after];
+}
+
+/* Combining marks attach to most classes that precede them, but this defines
+ * the exceptions (from TR14) */
+#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
+ || prev == LB_Mandatory_Break \
+ || prev == LB_Carriage_Return \
+ || prev == LB_Line_Feed \
+ || prev == LB_Next_Line \
+ || prev == LB_Space \
+ || prev == LB_ZWSpace))
+
+STATIC bool
+S_isLB(pTHX_ LB_enum before,
+ LB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ U8 * temp_pos = (U8 *) curpos;
+ LB_enum prev = before;
+
+ /* Is the boundary between 'before' and 'after' line-breakable?
+ * Most of this is just a table lookup of a generated table from Unicode
+ * rules. But some rules require context to decide, and so have to be
+ * implemented in code */
- /* Break at the start and end of text.
- GB1. sot ÷
- GB2. ÷ eot
+ PERL_ARGS_ASSERT_ISLB;
- Break before and after controls except between CR and LF
- GB4. ( Control | CR | LF ) ÷
- GB5. ÷ ( Control | CR | LF )
+ /* Rule numbers in the comments below are as of Unicode 8.0 */
- Otherwise, break everywhere.
- GB10. Any ÷ Any */
- default:
+ redo:
+ before = prev;
+ switch (LB_table[before][after]) {
+ case LB_BREAKABLE:
return TRUE;
- /* Do not break between a CR and LF.
- GB3. CR × LF */
- case GCBcase(GCB_CR, GCB_LF):
+ case LB_NOBREAK:
+ case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
return FALSE;
- /* Do not break Hangul syllable sequences.
- GB6. L × ( L | V | LV | LVT ) */
- case GCBcase(GCB_L, GCB_L):
- case GCBcase(GCB_L, GCB_V):
- case GCBcase(GCB_L, GCB_LV):
- case GCBcase(GCB_L, GCB_LVT):
- return FALSE;
+ case LB_SP_foo + LB_BREAKABLE:
+ case LB_SP_foo + LB_NOBREAK:
+ case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
- /* GB7. ( LV | V ) × ( V | T ) */
- case GCBcase(GCB_LV, GCB_V):
- case GCBcase(GCB_LV, GCB_T):
- case GCBcase(GCB_V, GCB_V):
- case GCBcase(GCB_V, GCB_T):
- return FALSE;
+ /* When we have something following a SP, we have to look at the
+ * context in order to know what to do.
+ *
+ * SP SP should not reach here because LB7: Do not break before
+ * spaces. (For two spaces in a row there is nothing that
+ * overrides that) */
+ assert(after != LB_Space);
+
+ /* Here we have a space followed by a non-space. Mostly this is a
+ * case of LB18: "Break after spaces". But there are complications
+ * as the handling of spaces is somewhat tricky. They are in a
+ * number of rules, which have to be applied in priority order, but
+ * something earlier in the string can cause a rule to be skipped
+ * and a lower priority rule invoked. A prime example is LB7 which
+ * says don't break before a space. But rule LB8 (lower priority)
+ * says that the first break opportunity after a ZW is after any
+ * span of spaces immediately after it. If a ZW comes before a SP
+ * in the input, rule LB8 applies, and not LB7. Other such rules
+ * involve combining marks which are rules 9 and 10, but they may
+ * override higher priority rules if they come earlier in the
+ * string. Since we're doing random access into the middle of the
+ * string, we have to look for rules that should get applied based
+ * on both string position and priority. Combining marks do not
+ * attach to either ZW nor SP, so we don't have to consider them
+ * until later.
+ *
+ * To check for LB8, we have to find the first non-space character
+ * before this span of spaces */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Space);
+
+ /* LB8 Break before any character following a zero-width space,
+ * even if one or more spaces intervene.
+ * ZW SP* ÷
+ * So if we have a ZW just before this span, and to get here this
+ * is the final space in the span. */
+ if (prev == LB_ZWSpace) {
+ return TRUE;
+ }
- /* GB8. ( LVT | T) × T */
- case GCBcase(GCB_LVT, GCB_T):
- case GCBcase(GCB_T, GCB_T):
- return FALSE;
+ /* Here, not ZW SP+. There are several rules that have higher
+ * priority than LB18 and can be resolved now, as they don't depend
+ * on anything earlier in the string (except ZW, which we have
+ * already handled). One of these rules is LB11 Do not break
+ * before Word joiner, but we have specially encoded that in the
+ * lookup table so it is caught by the single test below which
+ * catches the other ones. */
+ if (LB_table[LB_Space][after] - LB_SP_foo
+ == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
+ {
+ return FALSE;
+ }
- /* Do not break between regional indicator symbols.
- GB8a. Regional_Indicator × Regional_Indicator */
- case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
- return FALSE;
+ /* If we get here, we have to XXX consider combining marks. */
+ if (prev == LB_Combining_Mark) {
- /* Do not break before extending characters.
- GB9. × Extend */
- case GCBcase(GCB_Other, GCB_Extend):
- case GCBcase(GCB_Extend, GCB_Extend):
- case GCBcase(GCB_L, GCB_Extend):
- case GCBcase(GCB_LV, GCB_Extend):
- case GCBcase(GCB_LVT, GCB_Extend):
- case GCBcase(GCB_Prepend, GCB_Extend):
- case GCBcase(GCB_Regional_Indicator, GCB_Extend):
- case GCBcase(GCB_SpacingMark, GCB_Extend):
- case GCBcase(GCB_T, GCB_Extend):
- case GCBcase(GCB_V, GCB_Extend):
- return FALSE;
+ /* What happens with these depends on the character they
+ * follow. */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark);
- /* Do not break before SpacingMarks, or after Prepend characters.
- GB9a. × SpacingMark */
- case GCBcase(GCB_Other, GCB_SpacingMark):
- case GCBcase(GCB_Extend, GCB_SpacingMark):
- case GCBcase(GCB_L, GCB_SpacingMark):
- case GCBcase(GCB_LV, GCB_SpacingMark):
- case GCBcase(GCB_LVT, GCB_SpacingMark):
- case GCBcase(GCB_Prepend, GCB_SpacingMark):
- case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
- case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
- case GCBcase(GCB_T, GCB_SpacingMark):
- case GCBcase(GCB_V, GCB_SpacingMark):
- return FALSE;
+ /* Most times these attach to and inherit the characteristics
+ * of that character, but not always, and when not, they are to
+ * be treated as AL by rule LB10. */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+ }
- /* GB9b. Prepend × */
- case GCBcase(GCB_Prepend, GCB_Other):
- case GCBcase(GCB_Prepend, GCB_L):
- case GCBcase(GCB_Prepend, GCB_LV):
- case GCBcase(GCB_Prepend, GCB_LVT):
- case GCBcase(GCB_Prepend, GCB_Prepend):
- case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
- case GCBcase(GCB_Prepend, GCB_T):
- case GCBcase(GCB_Prepend, GCB_V):
- return FALSE;
+ /* Here, we have the character preceding the span of spaces all set
+ * up. We follow LB18: "Break after spaces" unless the table shows
+ * that is overriden */
+ return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
+
+ case LB_CM_foo:
+
+ /* We don't know how to treat the CM except by looking at the first
+ * non-CM character preceding it */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark);
+
+ /* Here, 'prev' is that first earlier non-CM character. If the CM
+ * attatches to it, then it inherits the behavior of 'prev'. If it
+ * doesn't attach, it is to be treated as an AL */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+
+ goto redo;
+
+ case LB_HY_or_BA_then_foo + LB_BREAKABLE:
+ case LB_HY_or_BA_then_foo + LB_NOBREAK:
+
+ /* LB21a Don't break after Hebrew + Hyphen.
+ * HL (HY | BA) × */
+
+ if (backup_one_LB(strbeg, &temp_pos, utf8_target)
+ == LB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
+
+ case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
+ case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
+
+ /* LB25a (PR | PO) × ( OP | HY )? NU */
+ if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
+ == LB_BREAKABLE;
+
+ case LB_SY_or_IS_then_various + LB_BREAKABLE:
+ case LB_SY_or_IS_then_various + LB_NOBREAK:
+ {
+ /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
+
+ LB_enum temp = prev;
+ do {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_SY_or_IS_then_various
+ == LB_BREAKABLE;
+ }
+
+ case LB_various_then_PO_or_PR + LB_BREAKABLE:
+ case LB_various_then_PO_or_PR + LB_NOBREAK:
+ {
+ /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
+
+ LB_enum temp = prev;
+ if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
+ {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+ return LB_various_then_PO_or_PR;
+ }
+
+ default:
+ break;
}
- NOT_REACHED; /* NOTREACHED */
+#ifdef DEBUGGING
+ PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n",
+ before, after, LB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
}
+STATIC LB_enum
+S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
+
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_UTF8(*curpos, strend);
+ }
+ else {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_CP(**curpos);
+ }
+
+ return lb;
+}
+
+STATIC LB_enum
+S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_LB;
+
+ if (*curpos < strbeg) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ U8 * prev_prev_char_pos;
+
+ if (! prev_char_pos) {
+ return LB_EDGE;
+ }
+
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
+ lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ }
+ else {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ (*curpos)--;
+ lb = getLB_VAL_CP(*(*curpos - 1));
+ }
+
+ return lb;
+}
+
+/* This creates a single number by combining two, with 'before' being like the
+ * 10's digit, but this isn't necessarily base 10; it is base however many
+ * elements of the enum there are */
#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
STATIC bool
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getLB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
+ getLB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
- if (utf8_target && ! UTF8_IS_INVARIANT(locinput)) {
+ if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
utf8_target))
sayNO;
U8 flags = (CXp_SUB_RE |
((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
if (last_pushed_cv) {
+ /* PUSH/POP_MULTICALL save and restore the
+ * caller's PL_comppad; if we call multiple subs
+ * using the same CX block, we have to save and
+ * unwind the varying PL_comppad's ourselves,
+ * especially restoring the right PL_comppad on
+ * backtrack - so save it on the save stack */
+ SAVECOMPPAD();
CHANGE_MULTICALL_FLAGS(newcv, flags);
}
else {