#include "re_top.h"
#endif
-#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
- "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
-
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
#include "invlist_inline.h"
#include "unicode_constants.h"
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
+static const char utf8_locale_required[] =
+ "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+ EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
}
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
"Intuit: trying to determine minimum start position...\n"));
/* for now, assume that all substr offsets are positive. If at some point
- * in the future someone wants to do clever things with look-behind and
+ * in the future someone wants to do clever things with lookbehind and
* -ve offsets, they'll need to fix up any code in this function
* which uses these offsets. See the thread beginning
* <20140113145929.GF27210@iabyn.com>
* didn't contradict, so just retry the anchored "other"
* substr */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
+ " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strbeg + prog->anchored_offset),
- (long)(rx_origin - strbeg)
+ (IV)(rx_origin - strbeg + prog->anchored_offset),
+ (IV)(rx_origin - strbeg)
));
goto do_other_substr;
}
} else { \
uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
len = UTF8SKIP(uc); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
} else { \
len = 1; \
uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
#define getGCB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_GCB_invlist, \
- Grapheme_Cluster_Break_invmap, \
+ _Perl_GCB_invmap, \
(cp))
/* Returns the GCB value for the first code point in the UTF-8 encoded string
#define getGCB_VAL_UTF8(pos, strend) \
_generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+/* Returns the LB value for the input code point */
+#define getLB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_LB_invlist, \
+ _Perl_LB_invmap, \
+ (cp))
+
+/* Returns the LB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getLB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getLB_VAL_CP, pos, strend)
+
/* Returns the SB value for the input code point */
#define getSB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_SB_invlist, \
- Sentence_Break_invmap, \
+ _Perl_SB_invmap, \
(cp))
/* Returns the SB value for the first code point in the UTF-8 encoded string
#define getWB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_WB_invlist, \
- Word_Break_invmap, \
+ _Perl_WB_invmap, \
(cp))
/* Returns the WB value for the first code point in the UTF-8 encoded string
switch (OP(c)) {
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(c)) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
FBC_BOUND(isWORDCHAR_L1, isWORDCHAR_uni, isWORDCHAR_utf8);
break;
case GCB_BOUND:
- if (s == reginfo->strbeg) { /* GCB always matches at begin and
- end */
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s))
{
goto got_it;
}
+
+ /* Didn't match. Try at the next position (if there is one) */
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
while (s < strend) {
GCB_enum after = getGCB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isGCB(before, after)) {
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ if ( (to_complement ^ isGCB(before, after))
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
}
+ before = after;
s += UTF8SKIP(s);
}
}
else { /* Not utf8. Everything is a GCB except between CR and
LF */
while (s < strend) {
- if (to_complement ^ (UCHARAT(s - 1) != '\r'
- || UCHARAT(s) != '\n'))
+ if ((to_complement ^ ( UCHARAT(s - 1) != '\r'
+ || UCHARAT(s) != '\n'))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- s++;
+ goto got_it;
}
+ s++;
}
}
- if (to_complement ^ cBOOL(reginfo->intuit || regtry(reginfo, &s))) {
+ /* And, since this is a bound, it can match after the final
+ * character in the string */
+ if ((reginfo->intuit || regtry(reginfo, &s))) {
goto got_it;
}
break;
- case SB_BOUND:
- if (s == reginfo->strbeg) { /* SB always matches at beginning */
- if (to_complement
- ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
- {
+ case LB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
+ }
+
+ if (utf8_target) {
+ LB_enum before = getLB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ LB_enum after = getLB_VAL_UTF8((U8*) s, (U8*) reginfo->strend);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ LB_enum before = getLB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ LB_enum after = getLB_VAL_CP((U8) *s);
+ if (to_complement ^ isLB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target)
+ && (reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+ before = after;
+ s++;
+ }
+ }
- /* Didn't match. Go try at the next position */
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+
+ break;
+
+ case SB_BOUND:
+ if (s == reginfo->strbeg) {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
while (s < strend) {
SB_enum after = getSB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isSB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ goto got_it;
}
+ before = after;
s += UTF8SKIP(s);
}
}
SB_enum before = getSB_VAL_CP((U8) *(s -1));
while (s < strend) {
SB_enum after = getSB_VAL_CP((U8) *s);
- if (to_complement ^ isSB(before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- before = after;
+ goto got_it;
}
+ before = after;
s++;
}
}
/* Here are at the final position in the target string. The SB
* value is always true here, so matches, depending on other
* constraints */
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
case WB_BOUND:
if (s == reginfo->strbeg) {
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
s += (utf8_target) ? UTF8SKIP(s) : 1;
+ if (UNLIKELY(s >= reginfo->strend)) {
+ break;
+ }
}
if (utf8_target) {
while (s < strend) {
WB_enum after = getWB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
- if (to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- previous = before;
- before = after;
+ goto got_it;
}
+ previous = before;
+ before = after;
s += UTF8SKIP(s);
}
}
WB_enum before = getWB_VAL_CP((U8) *(s -1));
while (s < strend) {
WB_enum after = getWB_VAL_CP((U8) *s);
- if (to_complement ^ isWB(previous,
- before,
- after,
- (U8*) reginfo->strbeg,
- (U8*) s,
- (U8*) reginfo->strend,
- utf8_target))
+ if ((to_complement ^ isWB(previous,
+ before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ && (reginfo->intuit || regtry(reginfo, &s)))
{
- if (reginfo->intuit || regtry(reginfo, &s)) {
- goto got_it;
- }
- previous = before;
- before = after;
+ goto got_it;
}
+ previous = before;
+ before = after;
s++;
}
}
- if (to_complement ^ cBOOL(reginfo->intuit
- || regtry(reginfo, &s)))
- {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
goto got_it;
}
-
- break;
}
break;
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
classnum))))
{
U32 n = 0;
max = -1;
/* calculate the right-most part of the string covered
- * by a capture. Due to look-ahead, this may be to
+ * by a capture. Due to lookahead, this may be to
* the right of $&, so we have to scan all captures */
while (n <= prog->lastparen) {
if (prog->offs[n].end > max)
U32 n = 0;
min = max;
/* calculate the left-most part of the string covered
- * by a capture. Due to look-behind, this may be to
+ * by a capture. Due to lookbehind, this may be to
* the left of $&, so we have to scan all captures */
while (min && n <= prog->lastparen) {
if ( prog->offs[n].start != -1
startpos = stringarg;
+ /* set these early as they may be used by the HOP macros below */
+ reginfo->strbeg = strbeg;
+ reginfo->strend = strend;
+ reginfo->is_utf8_target = cBOOL(utf8_target);
+
if (prog->intflags & PREGf_GPOS_SEEN) {
MAGIC *mg;
*/
if (prog->intflags & PREGf_ANCH_GPOS) {
- startpos = reginfo->ganch - prog->gofs;
- if (startpos <
- ((flags & REXEC_FAIL_ON_UNDERFLOW) ? stringarg : strbeg))
- {
- DEBUG_r(PerlIO_printf(Perl_debug_log,
- "fail: ganch-gofs before earliest possible start\n"));
- return 0;
+ if (prog->gofs) {
+ startpos = HOPBACKc(reginfo->ganch, prog->gofs);
+ if (!startpos ||
+ ((flags & REXEC_FAIL_ON_UNDERFLOW) && startpos < stringarg))
+ {
+ DEBUG_r(PerlIO_printf(Perl_debug_log,
+ "fail: ganch-gofs before earliest possible start\n"));
+ return 0;
+ }
}
+ else
+ startpos = reginfo->ganch;
}
else if (prog->gofs) {
- if (startpos - prog->gofs < strbeg)
+ startpos = HOPBACKc(startpos, prog->gofs);
+ if (!startpos)
startpos = strbeg;
- else
- startpos -= prog->gofs;
}
else if (prog->intflags & PREGf_GPOS_FLOAT)
startpos = strbeg;
reginfo->prog = rx; /* Yes, sorry that this is confusing. */
reginfo->intuit = 0;
- reginfo->is_utf8_target = cBOOL(utf8_target);
reginfo->is_utf8_pat = cBOOL(RX_UTF8(rx));
reginfo->warned = FALSE;
- reginfo->strbeg = strbeg;
reginfo->sv = sv;
reginfo->poscache_maxiter = 0; /* not yet started a countdown */
- reginfo->strend = strend;
/* see how far we have to get to not match where we matched before */
reginfo->till = stringarg + minend;
/* For anchored \G, the only position it can match from is
* (ganch-gofs); we already set startpos to this above; if intuit
* moved us on from there, we can't possibly succeed */
- assert(startpos == reginfo->ganch - prog->gofs);
+ assert(startpos == HOPBACKc(reginfo->ganch, prog->gofs));
if (s == startpos && regtry(reginfo, &s))
goto got_it;
goto phooey;
/*
- regtry - try match at specific point
*/
-STATIC I32 /* 0 failure, 1 success */
+STATIC bool /* 0 failure, 1 success */
S_regtry(pTHX_ regmatch_info *reginfo, char **startposp)
{
CHECKPOINT lastcp;
return TRUE;
}
-/* This creates a single number by combining two, with 'before' being like the
- * 10's digit, but this isn't necessarily base 10; it is base however many
- * elements of the enum there are */
-#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
-
-STATIC bool
+PERL_STATIC_INLINE bool
S_isGCB(const GCB_enum before, const GCB_enum after)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
* between the inputs. See http://www.unicode.org/reports/tr29/ */
- switch (GCBcase(before, after)) {
+ return GCB_table[before][after];
+}
- /* Break at the start and end of text.
- GB1. sot ÷
- GB2. ÷ eot
+/* Combining marks attach to most classes that precede them, but this defines
+ * the exceptions (from TR14) */
+#define LB_CM_ATTACHES_TO(prev) ( ! ( prev == LB_EDGE \
+ || prev == LB_Mandatory_Break \
+ || prev == LB_Carriage_Return \
+ || prev == LB_Line_Feed \
+ || prev == LB_Next_Line \
+ || prev == LB_Space \
+ || prev == LB_ZWSpace))
- Break before and after controls except between CR and LF
- GB4. ( Control | CR | LF ) ÷
- GB5. ÷ ( Control | CR | LF )
+STATIC bool
+S_isLB(pTHX_ LB_enum before,
+ LB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ U8 * temp_pos = (U8 *) curpos;
+ LB_enum prev = before;
- Otherwise, break everywhere.
- GB10. Any ÷ Any */
- default:
+ /* Is the boundary between 'before' and 'after' line-breakable?
+ * Most of this is just a table lookup of a generated table from Unicode
+ * rules. But some rules require context to decide, and so have to be
+ * implemented in code */
+
+ PERL_ARGS_ASSERT_ISLB;
+
+ /* Rule numbers in the comments below are as of Unicode 8.0 */
+
+ redo:
+ before = prev;
+ switch (LB_table[before][after]) {
+ case LB_BREAKABLE:
return TRUE;
- /* Do not break between a CR and LF.
- GB3. CR × LF */
- case GCBcase(GCB_CR, GCB_LF):
+ case LB_NOBREAK:
+ case LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
return FALSE;
- /* Do not break Hangul syllable sequences.
- GB6. L × ( L | V | LV | LVT ) */
- case GCBcase(GCB_L, GCB_L):
- case GCBcase(GCB_L, GCB_V):
- case GCBcase(GCB_L, GCB_LV):
- case GCBcase(GCB_L, GCB_LVT):
- return FALSE;
+ case LB_SP_foo + LB_BREAKABLE:
+ case LB_SP_foo + LB_NOBREAK:
+ case LB_SP_foo + LB_NOBREAK_EVEN_WITH_SP_BETWEEN:
- /* GB7. ( LV | V ) × ( V | T ) */
- case GCBcase(GCB_LV, GCB_V):
- case GCBcase(GCB_LV, GCB_T):
- case GCBcase(GCB_V, GCB_V):
- case GCBcase(GCB_V, GCB_T):
- return FALSE;
+ /* When we have something following a SP, we have to look at the
+ * context in order to know what to do.
+ *
+ * SP SP should not reach here because LB7: Do not break before
+ * spaces. (For two spaces in a row there is nothing that
+ * overrides that) */
+ assert(after != LB_Space);
+
+ /* Here we have a space followed by a non-space. Mostly this is a
+ * case of LB18: "Break after spaces". But there are complications
+ * as the handling of spaces is somewhat tricky. They are in a
+ * number of rules, which have to be applied in priority order, but
+ * something earlier in the string can cause a rule to be skipped
+ * and a lower priority rule invoked. A prime example is LB7 which
+ * says don't break before a space. But rule LB8 (lower priority)
+ * says that the first break opportunity after a ZW is after any
+ * span of spaces immediately after it. If a ZW comes before a SP
+ * in the input, rule LB8 applies, and not LB7. Other such rules
+ * involve combining marks which are rules 9 and 10, but they may
+ * override higher priority rules if they come earlier in the
+ * string. Since we're doing random access into the middle of the
+ * string, we have to look for rules that should get applied based
+ * on both string position and priority. Combining marks do not
+ * attach to either ZW nor SP, so we don't have to consider them
+ * until later.
+ *
+ * To check for LB8, we have to find the first non-space character
+ * before this span of spaces */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Space);
+
+ /* LB8 Break before any character following a zero-width space,
+ * even if one or more spaces intervene.
+ * ZW SP* ÷
+ * So if we have a ZW just before this span, and to get here this
+ * is the final space in the span. */
+ if (prev == LB_ZWSpace) {
+ return TRUE;
+ }
- /* GB8. ( LVT | T) × T */
- case GCBcase(GCB_LVT, GCB_T):
- case GCBcase(GCB_T, GCB_T):
- return FALSE;
+ /* Here, not ZW SP+. There are several rules that have higher
+ * priority than LB18 and can be resolved now, as they don't depend
+ * on anything earlier in the string (except ZW, which we have
+ * already handled). One of these rules is LB11 Do not break
+ * before Word joiner, but we have specially encoded that in the
+ * lookup table so it is caught by the single test below which
+ * catches the other ones. */
+ if (LB_table[LB_Space][after] - LB_SP_foo
+ == LB_NOBREAK_EVEN_WITH_SP_BETWEEN)
+ {
+ return FALSE;
+ }
- /* Do not break between regional indicator symbols.
- GB8a. Regional_Indicator × Regional_Indicator */
- case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
- return FALSE;
+ /* If we get here, we have to XXX consider combining marks. */
+ if (prev == LB_Combining_Mark) {
- /* Do not break before extending characters.
- GB9. × Extend */
- case GCBcase(GCB_Other, GCB_Extend):
- case GCBcase(GCB_Extend, GCB_Extend):
- case GCBcase(GCB_L, GCB_Extend):
- case GCBcase(GCB_LV, GCB_Extend):
- case GCBcase(GCB_LVT, GCB_Extend):
- case GCBcase(GCB_Prepend, GCB_Extend):
- case GCBcase(GCB_Regional_Indicator, GCB_Extend):
- case GCBcase(GCB_SpacingMark, GCB_Extend):
- case GCBcase(GCB_T, GCB_Extend):
- case GCBcase(GCB_V, GCB_Extend):
- return FALSE;
+ /* What happens with these depends on the character they
+ * follow. */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark);
- /* Do not break before SpacingMarks, or after Prepend characters.
- GB9a. × SpacingMark */
- case GCBcase(GCB_Other, GCB_SpacingMark):
- case GCBcase(GCB_Extend, GCB_SpacingMark):
- case GCBcase(GCB_L, GCB_SpacingMark):
- case GCBcase(GCB_LV, GCB_SpacingMark):
- case GCBcase(GCB_LVT, GCB_SpacingMark):
- case GCBcase(GCB_Prepend, GCB_SpacingMark):
- case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
- case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
- case GCBcase(GCB_T, GCB_SpacingMark):
- case GCBcase(GCB_V, GCB_SpacingMark):
- return FALSE;
+ /* Most times these attach to and inherit the characteristics
+ * of that character, but not always, and when not, they are to
+ * be treated as AL by rule LB10. */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+ }
- /* GB9b. Prepend × */
- case GCBcase(GCB_Prepend, GCB_Other):
- case GCBcase(GCB_Prepend, GCB_L):
- case GCBcase(GCB_Prepend, GCB_LV):
- case GCBcase(GCB_Prepend, GCB_LVT):
- case GCBcase(GCB_Prepend, GCB_Prepend):
- case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
- case GCBcase(GCB_Prepend, GCB_T):
- case GCBcase(GCB_Prepend, GCB_V):
- return FALSE;
+ /* Here, we have the character preceding the span of spaces all set
+ * up. We follow LB18: "Break after spaces" unless the table shows
+ * that is overriden */
+ return LB_table[prev][after] != LB_NOBREAK_EVEN_WITH_SP_BETWEEN;
+
+ case LB_CM_foo:
+
+ /* We don't know how to treat the CM except by looking at the first
+ * non-CM character preceding it */
+ do {
+ prev = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (prev == LB_Combining_Mark);
+
+ /* Here, 'prev' is that first earlier non-CM character. If the CM
+ * attatches to it, then it inherits the behavior of 'prev'. If it
+ * doesn't attach, it is to be treated as an AL */
+ if (! LB_CM_ATTACHES_TO(prev)) {
+ prev = LB_Alphabetic;
+ }
+
+ goto redo;
+
+ case LB_HY_or_BA_then_foo + LB_BREAKABLE:
+ case LB_HY_or_BA_then_foo + LB_NOBREAK:
+
+ /* LB21a Don't break after Hebrew + Hyphen.
+ * HL (HY | BA) × */
+
+ if (backup_one_LB(strbeg, &temp_pos, utf8_target)
+ == LB_Hebrew_Letter)
+ {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_HY_or_BA_then_foo == LB_BREAKABLE;
+
+ case LB_PR_or_PO_then_OP_or_HY + LB_BREAKABLE:
+ case LB_PR_or_PO_then_OP_or_HY + LB_NOBREAK:
+
+ /* LB25a (PR | PO) × ( OP | HY )? NU */
+ if (advance_one_LB(&temp_pos, strend, utf8_target) == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_PR_or_PO_then_OP_or_HY
+ == LB_BREAKABLE;
+
+ case LB_SY_or_IS_then_various + LB_BREAKABLE:
+ case LB_SY_or_IS_then_various + LB_NOBREAK:
+ {
+ /* LB25d NU (SY | IS)* × (NU | SY | IS | CL | CP ) */
+
+ LB_enum temp = prev;
+ do {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric);
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+
+ return LB_table[prev][after] - LB_SY_or_IS_then_various
+ == LB_BREAKABLE;
+ }
+
+ case LB_various_then_PO_or_PR + LB_BREAKABLE:
+ case LB_various_then_PO_or_PR + LB_NOBREAK:
+ {
+ /* LB25e NU (SY | IS)* (CL | CP)? × (PO | PR) */
+
+ LB_enum temp = prev;
+ if (temp == LB_Close_Punctuation || temp == LB_Close_Parenthesis)
+ {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ while (temp == LB_Break_Symbols || temp == LB_Infix_Numeric) {
+ temp = backup_one_LB(strbeg, &temp_pos, utf8_target);
+ }
+ if (temp == LB_Numeric) {
+ return FALSE;
+ }
+ return LB_various_then_PO_or_PR;
+ }
+
+ default:
+ break;
}
- NOT_REACHED; /* NOTREACHED */
+#ifdef DEBUGGING
+ PerlIO_printf(Perl_error_log, "Unhandled LB pair: LB_table[%d, %d] = %d\n",
+ before, after, LB_table[before][after]);
+ assert(0);
+#endif
+ return TRUE;
+}
+
+STATIC LB_enum
+S_advance_one_LB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_LB;
+
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_UTF8(*curpos, strend);
+ }
+ else {
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return LB_EDGE;
+ }
+ lb = getLB_VAL_CP(**curpos);
+ }
+
+ return lb;
}
+STATIC LB_enum
+S_backup_one_LB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{
+ LB_enum lb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_LB;
+
+ if (*curpos < strbeg) {
+ return LB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ U8 * prev_prev_char_pos;
+
+ if (! prev_char_pos) {
+ return LB_EDGE;
+ }
+
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1, strbeg))) {
+ lb = getLB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ }
+ else {
+ if (*curpos - 2 < strbeg) {
+ *curpos = (U8 *) strbeg;
+ return LB_EDGE;
+ }
+ (*curpos)--;
+ lb = getLB_VAL_CP(*(*curpos - 1));
+ }
+
+ return lb;
+}
+
+/* This creates a single number by combining two, with 'before' being like the
+ * 10's digit, but this isn't necessarily base 10; it is base however many
+ * elements of the enum there are */
#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
STATIC bool
const bool utf8_target)
{
/* Return a boolean as to if the boundary between 'before' and 'after' is
- * a Unicode word break, using their published algorithm. Context may be
+ * a Unicode word break, using their published algorithm, but tailored for
+ * Perl by treating spans of white space as one unit. Context may be
* needed to make this determination. If the value for the character
* before 'before' is known, it is passed as 'previous'; otherwise that
* should be set to WB_UNKNOWN. The other input parameters give the
return TRUE;
}
- /* WB 3: Do not break within CRLF. */
- if (before == WB_CR && after == WB_LF) {
- return FALSE;
+ /* WB 3 is: "Do not break within CRLF." Perl extends this so that all
+ * white space sequences ending in a vertical space are treated as one
+ * unit. */
+
+ if (after == WB_CR || after == WB_LF || after == WB_Newline) {
+ if (before == WB_CR || before == WB_LF || before == WB_Newline
+ || before == WB_Perl_Tailored_HSpace)
+ {
+ return FALSE;
+ }
+
+ /* WB 3a: Otherwise break before Newlines (including CR and LF) */
+ return TRUE;
}
- /* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
- * and LF) */
+ /* Here, we know that 'after' is not a vertical space character, but
+ * 'before' could be. WB 3b is: "Otherwise break after Newlines (including
+ * CR and LF)." Perl changes that to not break-up spans of white space,
+ * except when horizontal space is followed by an Extend or Format
+ * character. These apply just to the final white space character in the
+ * span, so it is broken away from the rest. (If the Extend or Format
+ * character follows a vertical space character, it is treated as beginning
+ * a line, and doesn't modify the preceeding character.) */
if ( before == WB_CR || before == WB_LF || before == WB_Newline
- || after == WB_CR || after == WB_LF || after == WB_Newline)
+ || before == WB_Perl_Tailored_HSpace)
{
- return TRUE;
+ if (after == WB_Perl_Tailored_HSpace) {
+ U8 * temp_pos = (U8 *) curpos;
+ const WB_enum next
+ = advance_one_WB(&temp_pos, strend, utf8_target,
+ FALSE /* Don't skip Extend nor Format */ );
+ return next == WB_Extend || next == WB_Format;
+ }
+ else if (before != WB_Perl_Tailored_HSpace) {
+
+ /* Here, 'before' must be one of the vertical space characters, and
+ * after is not any type of white-space. Follow WB 3b. */
+ return TRUE;
+ }
+
+ /* Here, 'before' is horizontal space, and 'after' is not any kind of
+ * space. Normal rules apply */
}
/* Ignore Format and Extend characters, except when they appear at the
case WBcase(WB_Hebrew_Letter, WB_MidLetter):
case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
/*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
- after = advance_one_WB(&after_pos, strend, utf8_target);
+ after = advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ );
return after != WB_ALetter && after != WB_Hebrew_Letter;
/* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
/* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
- return advance_one_WB(&after_pos, strend, utf8_target)
- != WB_Hebrew_Letter;
+ return advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ != WB_Hebrew_Letter;
/* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
case WBcase(WB_Numeric, WB_MidNum):
case WBcase(WB_Numeric, WB_MidNumLet):
case WBcase(WB_Numeric, WB_Single_Quote):
- return advance_one_WB(&after_pos, strend, utf8_target)
- != WB_Numeric;
+ return advance_one_WB(&after_pos, strend, utf8_target,
+ TRUE /* Do skip Extend and Format */ )
+ != WB_Numeric;
/* Do not break between Katakana.
WB13. Katakana × Katakana */
}
STATIC WB_enum
-S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+S_advance_one_WB(pTHX_ U8 ** curpos,
+ const U8 * const strend,
+ const bool utf8_target,
+ const bool skip_Extend_Format)
{
WB_enum wb;
return WB_EDGE;
}
wb = getWB_VAL_UTF8(*curpos, strend);
- } while (wb == WB_Extend || wb == WB_Format);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
}
else {
do {
return WB_EDGE;
}
wb = getWB_VAL_CP(**curpos);
- } while (wb == WB_Extend || wb == WB_Format);
+ } while ( skip_Extend_Format
+ && (wb == WB_Extend || wb == WB_Format));
}
return wb;
* to look it up */
if (*previous != WB_UNKNOWN) {
wb = *previous;
- *previous = WB_UNKNOWN;
- /* XXX Note that doesn't change curpos, and maybe should */
- /* But we always back up over these two types */
+ /* But we need to move backwards by one */
+ if (utf8_target) {
+ *curpos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! *curpos) {
+ *previous = WB_EDGE;
+ *curpos = (U8 *) strbeg;
+ }
+ else {
+ *previous = WB_UNKNOWN;
+ }
+ }
+ else {
+ (*curpos)--;
+ *previous = (*curpos <= strbeg) ? WB_EDGE : WB_UNKNOWN;
+ }
+
+ /* And we always back up over these two types */
if (wb != WB_Extend && wb != WB_Format) {
return wb;
}
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
{
sayNO;
}
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
{
sayNO;
}
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
+ {
+ bool b1, b2;
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(scan) != TRADITIONAL_BOUND) {
if (utf8_target) {
if (locinput == reginfo->strbeg)
- ln = isWORDCHAR_LC('\n');
+ b1 = isWORDCHAR_LC('\n');
else {
- ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
+ b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
}
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC_utf8((U8*)locinput);
}
else { /* Here the string isn't utf8 */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(nextchr);
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUND: /* /\B/ */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDA: /* /\b/a */
+ {
+ bool b1, b2;
bound_ascii_match_only:
/* Here the string isn't utf8, or is utf8 and only ascii characters
* 2) it is a multi-byte character, in which case the final byte is
* never mistakable for ASCII, and so the test will say it is
* not a word character, which is the correct answer. */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(nextchr);
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUNDU: /* /\B/u */
to_complement = 1;
case BOUNDU: /* /\b/u */
boundu:
- if (utf8_target) {
-
+ if (UNLIKELY(reginfo->strbeg >= reginfo->strend)) {
+ match = FALSE;
+ }
+ else if (utf8_target) {
bound_utf8:
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8((U8*)locinput);
- match = cBOOL(ln != n);
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE; /* GCB always matches at begin and
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getLB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
else { /* Not utf8 target */
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(nextchr);
- match = cBOOL(ln != n);
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
}
break;
+ case LB_BOUND:
+ if (locinput == reginfo->strbeg) {
+ match = FALSE;
+ }
+ else if (NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isLB(getLB_VAL_CP(UCHARAT(locinput -1)),
+ getLB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case SB_BOUND: /* Always matches at begin and end */
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
case ANYOFL: /* /[abc]/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(scan)) && ! IN_UTF8_CTYPE_LOCALE)
+ {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD: /* /[abc]/d */
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
- if (utf8_target) {
+ if (utf8_target && ! UTF8_IS_INVARIANT(*locinput)) {
if (!reginclass(rex, scan, (U8*)locinput, (U8*)reginfo->strend,
utf8_target))
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
- *(locinput + 1))))))
+ EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
+ *(locinput + 1))))))
{
sayNO;
}
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
FLAGS(scan)))))
{
U8 flags = (CXp_SUB_RE |
((newcv == caller_cv) ? CXp_SUB_RE_FAKE : 0));
if (last_pushed_cv) {
+ /* PUSH/POP_MULTICALL save and restore the
+ * caller's PL_comppad; if we call multiple subs
+ * using the same CX block, we have to save and
+ * unwind the varying PL_comppad's ourselves,
+ * especially restoring the right PL_comppad on
+ * backtrack - so save it on the save stack */
+ SAVECOMPPAD();
CHANGE_MULTICALL_FLAGS(newcv, flags);
}
else {
break;
case ACCEPT: /* (*ACCEPT) */
- if (ARG(scan)){
+ if (scan->flags)
+ sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (ARG2L(scan)){
regnode *cursor;
for (cursor=scan;
cursor && OP(cursor)!=END;
NOT_REACHED; /* NOTREACHED */
case CUTGROUP: /* /(*THEN)/ */
- sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
- MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ sv_yes_mark = st->u.mark.mark_name = scan->flags
+ ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
+ : NULL;
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
/* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
- if (!scan->flags)
+ if (scan->flags)
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(COMMIT_next, next, locinput);
/* NOTREACHED */
case COMMIT_next_fail:
no_final = 1;
/* FALLTHROUGH */
+ sayNO;
+ NOT_REACHED; /* NOTREACHED */
case OPFAIL: /* (*FAIL) */
- sayNO;
+ if (scan->flags)
+ sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (logical) {
+ /* deal with (?(?!)X|Y) properly,
+ * make sure we trigger the no branch
+ * of the trailing IFTHEN structure*/
+ sw= 0;
+ break;
+ } else {
+ sayNO;
+ }
/* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
case SKIP: /* (*SKIP) */
- if (scan->flags) {
+ if (!scan->flags) {
/* (*SKIP) : if we fail we cut here*/
ST.mark_name = NULL;
ST.mark_loc = locinput;
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
}
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if (ANYOFL_UTF8_LOCALE_REQD(FLAGS(p)) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
while (hardcount < max
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
*(scan + 1)),
classnum))))
{
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
- if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ if (c > 255 && OP(n) == ANYOFL && ! ANYOFL_UTF8_LOCALE_REQD(flags)) {
_CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
}
}
if (c < NUM_ANYOF_CODE_POINTS) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
+ else if ((flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) == ANYOFD
&& ! utf8_target
&& ! isASCII(c))
{
match = TRUE;
}
else if (flags & ANYOF_LOCALE_FLAGS) {
- if ((flags & ANYOF_LOC_FOLD)
+ if ((flags & ANYOFL_FOLD)
&& c < 256
&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
{
{
match = TRUE; /* Everything above the bitmap matches */
}
- else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
- || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
- || ((flags & ANYOF_LOC_FOLD)
- && IN_UTF8_CTYPE_LOCALE
- && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
+ /* Here doesn't match everything above the bitmap. If there is
+ * some information available beyond the bitmap, we may find a
+ * match in it. If so, this is most likely because the code point
+ * is outside the bitmap range. But rarely, it could be because of
+ * some other reason. If so, various flags are set to indicate
+ * this possibility. On ANYOFD nodes, there may be matches that
+ * happen only when the target string is UTF-8; or for other node
+ * types, because runtime lookup is needed, regardless of the
+ * UTF-8ness of the target string. Finally, under /il, there may
+ * be some matches only possible if the locale is a UTF-8 one. */
+ else if ( ARG(n) != ANYOF_ONLY_HAS_BITMAP
+ && ( c >= NUM_ANYOF_CODE_POINTS
+ || ( (flags & ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)
+ && ( UNLIKELY(OP(n) != ANYOFD)
+ || (utf8_target && ! isASCII_uni(c)
+# if NUM_ANYOF_CODE_POINTS > 256
+ && c < 256
+# endif
+ )))
+ || ( ANYOFL_SOME_FOLDS_ONLY_IN_UTF8_LOCALE(flags)
+ && IN_UTF8_CTYPE_LOCALE)))
{
SV* only_utf8_locale = NULL;
SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
}
if (UNICODE_IS_SUPER(c)
- && (flags & ANYOF_WARN_SUPER)
+ && (flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) != ANYOFD
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > llim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}