*/
/*
- * One Ring to rule them all, One Ring to find them
- &
+ * One Ring to rule them all, One Ring to find them
+ *
* [p.v of _The Lord of the Rings_, opening poem]
* [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
* [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
#include "re_top.h"
#endif
-#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
- "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
-
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
# include "regcomp.h"
#endif
-#include "inline_invlist.c"
+#include "invlist_inline.h"
#include "unicode_constants.h"
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
+static const char utf8_locale_required[] =
+ "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
case _CC_ENUM_LOWER: return isLOWER_LC(character);
case _CC_ENUM_PRINT: return isPRINT_LC(character);
- case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
case _CC_ENUM_SPACE: return isSPACE_LC(character);
case _CC_ENUM_UPPER: return isUPPER_LC(character);
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+ EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
}
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
}
switch ((_char_class_number) classnum) {
- case _CC_ENUM_SPACE:
- case _CC_ENUM_PSXSPC: return is_XPERLSPACE_high(character);
-
+ case _CC_ENUM_SPACE: return is_XPERLSPACE_high(character);
case _CC_ENUM_BLANK: return is_HORIZWS_high(character);
case _CC_ENUM_XDIGIT: return is_XDIGIT_high(character);
case _CC_ENUM_VERTSPACE: return is_VERTWS_high(character);
* caller will have set strpos=pos()-4; we look for the substr
* at position pos()-4+1, which lines up with the "a" */
- if (prog->check_offset_min == prog->check_offset_max
- && !(prog->intflags & PREGf_CANY_SEEN))
- {
+ if (prog->check_offset_min == prog->check_offset_max) {
/* Substring at constant offset from beg-of-str... */
SSize_t slen = SvCUR(check);
char *s = HOP3c(strpos, prog->check_offset_min, strend);
" At restart: rx_origin=%"IVdf" Check offset min: %"IVdf
" Start shift: %"IVdf" End shift %"IVdf
" Real end Shift: %"IVdf"\n",
- (IV)(rx_origin - strpos),
+ (IV)(rx_origin - strbeg),
(IV)prog->check_offset_min,
(IV)start_shift,
(IV)end_shift,
(IV)prog->check_end_shift);
});
- if (prog->intflags & PREGf_CANY_SEEN) {
- start_point= (U8*)(rx_origin + start_shift);
- end_point= (U8*)(strend - end_shift);
- if (start_point > end_point)
- goto fail_finish;
- } else {
- end_point = HOP3(strend, -end_shift, strbeg);
- start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
- if (!start_point)
- goto fail_finish;
- }
+ end_point = HOP3(strend, -end_shift, strbeg);
+ start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
+ if (!start_point)
+ goto fail_finish;
/* If the regex is absolutely anchored to either the start of the
}
}
- DEBUG_OPTIMISE_MORE_r({
- PerlIO_printf(Perl_debug_log, " fbm_instr len=%d str=<%.*s>\n",
- (int)(end_point - start_point),
- (int)(end_point - start_point) > 20 ? 20 : (int)(end_point - start_point),
- start_point);
- });
-
check_at = fbm_instr( start_point, end_point,
check, multiline ? FBMrf_MULTILINE : 0);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " doing 'check' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
+ (IV)((char*)start_point - strbeg),
+ (IV)((char*)end_point - strbeg),
+ (IV)(check_at ? check_at - strbeg : -1)
+ ));
+
/* Update the count-of-usability, remove useless subpatterns,
unshift s. */
if (!check_at)
goto fail_finish;
- /* Finish the diagnostic message */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%ld...\n", (long)(check_at - strpos)) );
-
/* set rx_origin to the minimum position where the regex could start
* matching, given the constraint of the just-matched check substring.
* But don't set it lower than previously.
if (check_at - rx_origin > prog->check_offset_max)
rx_origin = HOP3c(check_at, -prog->check_offset_max, rx_origin);
+ /* Finish the diagnostic message */
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "%ld (rx_origin now %"IVdf")...\n",
+ (long)(check_at - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
}
must = utf8_target ? other->utf8_substr : other->substr;
assert(SvPOK(must));
- s = fbm_instr(
- (unsigned char*)s,
- (unsigned char*)last + SvCUR(must) - (SvTAIL(must)!=0),
- must,
- multiline ? FBMrf_MULTILINE : 0
- );
+ {
+ char *from = s;
+ char *to = last + SvCUR(must) - (SvTAIL(must)!=0);
+
+ if (from > to) {
+ s = NULL;
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " skipping 'other' fbm scan: %"IVdf" > %"IVdf"\n",
+ (IV)(from - strbeg),
+ (IV)(to - strbeg)
+ ));
+ }
+ else {
+ s = fbm_instr(
+ (unsigned char*)from,
+ (unsigned char*)to,
+ must,
+ multiline ? FBMrf_MULTILINE : 0
+ );
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " doing 'other' fbm scan, [%"IVdf"..%"IVdf"] gave %"IVdf"\n",
+ (IV)(from - strbeg),
+ (IV)(to - strbeg),
+ (IV)(s ? s - strbeg : -1)
+ ));
+ }
+ }
+
DEBUG_EXECUTE_r({
RE_PV_QUOTED_DECL(quoted, utf8_target, PERL_DEBUG_PAD_ZERO(0),
SvPVX_const(must), RE_SV_DUMPLEN(must), 30);
* find it before there, we never will */
if (last >= last1) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", giving up...\n"));
+ "; giving up...\n"));
goto fail_finish;
}
/* try to find the check substr again at a later
* position. Maybe next time we'll find the "other" substr
* in range too */
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- ", trying %s at offset %ld...\n",
- (other_ix ? "floating" : "anchored"),
- (long)(HOP3c(check_at, 1, strend) - strpos)));
-
other_last = HOP3c(last, 1, strend) /* highest failure */;
rx_origin =
other_ix /* i.e. if other-is-float */
? HOP3c(rx_origin, 1, strend)
: HOP4c(last, 1 - other->min_offset, strbeg, strend);
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ "; about to retry %s at offset %ld (rx_origin now %"IVdf")...\n",
+ (other_ix ? "floating" : "anchored"),
+ (long)(HOP3c(check_at, 1, strend) - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto restart;
}
else {
- DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, " at offset %ld...\n",
- (long)(s - strpos)));
-
if (other_ix) { /* if (other-is-float) */
/* other_last is set to s, not s+1, since its possible for
* a floating substr to fail first time, then succeed
rx_origin = HOP3c(s, -other->min_offset, strbeg);
other_last = HOP3c(s, 1, strend);
}
+ DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
+ " at offset %ld (rx_origin now %"IVdf")...\n",
+ (long)(s - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
+
}
}
else {
PerlIO_printf(Perl_debug_log,
" Check-only match: offset min:%"IVdf" max:%"IVdf
" check_at:%"IVdf" rx_origin:%"IVdf" rx_origin-check_at:%"IVdf
- " strend-strpos:%"IVdf"\n",
+ " strend:%"IVdf"\n",
(IV)prog->check_offset_min,
(IV)prog->check_offset_max,
- (IV)(check_at-strpos),
- (IV)(rx_origin-strpos),
+ (IV)(check_at-strbeg),
+ (IV)(rx_origin-strbeg),
(IV)(rx_origin-check_at),
- (IV)(strend-strpos)
+ (IV)(strend-strbeg)
)
);
}
* scanning ahead for the next \n or the next substr is debatable.
* On the one hand you'd expect rare substrings to appear less
* often than \n's. On the other hand, searching for \n means
- * we're effectively flipping been check_substr and "\n" on each
+ * we're effectively flipping between check_substr and "\n" on each
* iteration as the current "rarest" string candidate, which
* means for example that we'll quickly reject the whole string if
* hasn't got a \n, rather than trying every substr position
* check was anchored (and thus has no wiggle room),
* or check was float and rx_origin is above the float range */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m, restarting lookup for check-string at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ " Found /%s^%s/m, about to restart lookup for check-string with rx_origin %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
goto restart;
}
* didn't contradict, so just retry the anchored "other"
* substr */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m at offset %ld, rescanning for anchored from offset %ld...\n",
+ " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strpos),
- (long)(rx_origin - strpos + prog->anchored_offset)));
+ (IV)(rx_origin - strbeg + prog->anchored_offset),
+ (IV)(rx_origin - strbeg)
+ ));
goto do_other_substr;
}
/* success: we don't contradict the found floating substring
* (and there's no anchored substr). */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m at offset %ld...\n",
- PL_colors[0], PL_colors[1], (long)(rx_origin - strpos)));
+ " Found /%s^%s/m with rx_origin %ld...\n",
+ PL_colors[0], PL_colors[1], (long)(rx_origin - strbeg)));
}
else {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
* The condition above is in bytes rather than
* chars for efficiency. It's conservative, in
* that it errs on the side of doing 'goto
- * do_other_substr', where a more accurate
- * char-based calculation will be done */
+ * do_other_substr'. In this case, at worst,
+ * an extra anchored search may get done, but in
+ * practice the extra fbm_instr() is likely to
+ * get skipped anyway. */
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for anchored substr starting at offset %ld...\n",
- (long)(other_last - strpos)) );
+ " about to retry anchored at offset %ld (rx_origin now %"IVdf")...\n",
+ (long)(other_last - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto do_other_substr;
}
}
* search for the next \n if any, its safe here */
rx_origin++;
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for /%s^%s/m starting at offset %ld...\n",
+ " about to look for /%s^%s/m starting at rx_origin %ld...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strpos)) );
+ (long)(rx_origin - strbeg)) );
goto postprocess_substr_matches;
}
goto fail;
}
DEBUG_EXECUTE_r( PerlIO_printf(Perl_debug_log,
- " Looking for %s substr starting at offset %ld...\n",
+ " about to look for %s substr starting at offset %ld (rx_origin now %"IVdf")...\n",
(prog->substrs->check_ix ? "floating" : "anchored"),
- (long)(rx_origin + start_shift - strpos)) );
+ (long)(rx_origin + start_shift - strbeg),
+ (IV)(rx_origin - strbeg)
+ ));
goto restart;
}
if (rx_origin != s) {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
" By STCLASS: moving %ld --> %ld\n",
- (long)(rx_origin - strpos), (long)(s - strpos))
+ (long)(rx_origin - strbeg), (long)(s - strbeg))
);
}
else {
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
"Intuit: %sSuccessfully guessed:%s match at offset %ld\n",
- PL_colors[4], PL_colors[5], (long)(rx_origin - strpos)) );
+ PL_colors[4], PL_colors[5], (long)(rx_origin - strbeg)) );
return rx_origin;
} else { \
uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
len = UTF8SKIP(uc); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
} else { \
len = 1; \
uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
#define getGCB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_GCB_invlist, \
- Grapheme_Cluster_Break_invmap, \
+ _Perl_GCB_invmap, \
(cp))
/* Returns the GCB value for the first code point in the UTF-8 encoded string
#define getGCB_VAL_UTF8(pos, strend) \
_generic_GET_BREAK_VAL_UTF8(getGCB_VAL_CP, pos, strend)
+
+/* Returns the SB value for the input code point */
+#define getSB_VAL_CP(cp) \
+ _generic_GET_BREAK_VAL_CP( \
+ PL_SB_invlist, \
+ _Perl_SB_invmap, \
+ (cp))
+
+/* Returns the SB value for the first code point in the UTF-8 encoded string
+ * bounded by pos and strend */
+#define getSB_VAL_UTF8(pos, strend) \
+ _generic_GET_BREAK_VAL_UTF8(getSB_VAL_CP, pos, strend)
+
/* Returns the WB value for the input code point */
#define getWB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_WB_invlist, \
- Word_Break_invmap, \
+ _Perl_WB_invmap, \
(cp))
/* Returns the WB value for the first code point in the UTF-8 encoded string
switch (OP(c)) {
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
}
break;
- case CANY:
- REXEC_FBC_SCAN(
- if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
- goto got_it;
- else
- tmp = doevery;
- );
- break;
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
case BOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(c) != TRADITIONAL_BOUND) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
goto do_boundu;
}
case NBOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(c) != TRADITIONAL_BOUND) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
goto do_nboundu;
}
}
if (utf8_target) {
- PL_GCB_enum before = getGCB_VAL_UTF8(
+ GCB_enum before = getGCB_VAL_UTF8(
reghop3((U8*)s, -1,
(U8*)(reginfo->strbeg)),
(U8*) reginfo->strend);
while (s < strend) {
- PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ GCB_enum after = getGCB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
if (to_complement ^ isGCB(before, after)) {
if (reginfo->intuit || regtry(reginfo, &s)) {
}
break;
+ case SB_BOUND:
+ if (s == reginfo->strbeg) { /* SB always matches at beginning */
+ if (to_complement
+ ^ cBOOL(reginfo->intuit || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+
+ /* Didn't match. Go try at the next position */
+ s += (utf8_target) ? UTF8SKIP(s) : 1;
+ }
+
+ if (utf8_target) {
+ SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend);
+ while (s < strend) {
+ SB_enum after = getSB_VAL_UTF8((U8*) s,
+ (U8*) reginfo->strend);
+ if (to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ before = after;
+ }
+ s += UTF8SKIP(s);
+ }
+ }
+ else { /* Not utf8. */
+ SB_enum before = getSB_VAL_CP((U8) *(s -1));
+ while (s < strend) {
+ SB_enum after = getSB_VAL_CP((U8) *s);
+ if (to_complement ^ isSB(before,
+ after,
+ (U8*) reginfo->strbeg,
+ (U8*) s,
+ (U8*) reginfo->strend,
+ utf8_target))
+ {
+ if (reginfo->intuit || regtry(reginfo, &s)) {
+ goto got_it;
+ }
+ before = after;
+ }
+ s++;
+ }
+ }
+
+ /* Here we are at the final position in the target string. The SB
+ * value is always true here, so matches, depending on other
+ * constraints */
+ if (to_complement ^ cBOOL(reginfo->intuit
+ || regtry(reginfo, &s)))
+ {
+ goto got_it;
+ }
+
+ break;
+
case WB_BOUND:
if (s == reginfo->strbeg) {
if (to_complement ^ cBOOL(reginfo->intuit
* loop through the line. Context may be needed to make a
* determination, and if so, this can save having to
* recalculate it */
- PL_WB_enum previous = PL_WB_UNKNOWN;
- PL_WB_enum before = getWB_VAL_UTF8(
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_UTF8(
reghop3((U8*)s,
-1,
(U8*)(reginfo->strbeg)),
(U8*) reginfo->strend);
while (s < strend) {
- PL_WB_enum after = getWB_VAL_UTF8((U8*) s,
+ WB_enum after = getWB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
if (to_complement ^ isWB(previous,
before,
}
}
else { /* Not utf8. */
- PL_WB_enum previous = PL_WB_UNKNOWN;
- PL_WB_enum before = getWB_VAL_CP((U8) *(s -1));
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_CP((U8) *(s -1));
while (s < strend) {
- PL_WB_enum after = getWB_VAL_CP((U8) *s);
+ WB_enum after = getWB_VAL_CP((U8) *s);
if (to_complement ^ isWB(previous,
before,
after,
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
classnum))))
{
}
else switch (classnum) { /* These classes are implemented as
macros */
- case _CC_ENUM_SPACE: /* XXX would require separate code if we
- revert the change of \v matching this */
- /* FALLTHROUGH */
-
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
REXEC_FBC_UTF8_CLASS_SCAN(
to_complement ^ cBOOL(isSPACE_utf8(s)));
break;
PERL_UNUSED_ARG(data);
/* Be paranoid... */
- if (prog == NULL || stringarg == NULL) {
+ if (prog == NULL) {
Perl_croak(aTHX_ "NULL regexp parameter");
}
reginfo->ganch =
(flags & REXEC_IGNOREPOS)
? stringarg /* use start pos rather than pos() */
- : (sv && (mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
+ : ((mg = mg_find_mglob(sv)) && mg->mg_len >= 0)
/* Defined pos(): */
? strbeg + MgBYTEPOS(mg, sv, strbeg, strend-strbeg)
: strbeg; /* pos() not defined; use start of string */
if (minlen) {
const OPCODE op = OP(progi->regstclass);
/* don't bother with what can't match */
- if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
+ if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
strend = HOPc(strend, -(minlen - 1));
}
DEBUG_EXECUTE_r({
if (pref0_len > pref_len)
pref0_len = pref_len;
{
- const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
+ const int is_uni = utf8_target ? 1 : 0;
RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
(locinput - pref_len),pref0_len, 60, 4, 5);
/* This creates a single number by combining two, with 'before' being like the
* 10's digit, but this isn't necessarily base 10; it is base however many
* elements of the enum there are */
-#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
+#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
STATIC bool
-S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+S_isGCB(const GCB_enum before, const GCB_enum after)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
* between the inputs. See http://www.unicode.org/reports/tr29/ */
/* Do not break between a CR and LF.
GB3. CR × LF */
- case GCBcase(PL_GCB_CR, PL_GCB_LF):
+ case GCBcase(GCB_CR, GCB_LF):
return FALSE;
/* Do not break Hangul syllable sequences.
GB6. L × ( L | V | LV | LVT ) */
- case GCBcase(PL_GCB_L, PL_GCB_L):
- case GCBcase(PL_GCB_L, PL_GCB_V):
- case GCBcase(PL_GCB_L, PL_GCB_LV):
- case GCBcase(PL_GCB_L, PL_GCB_LVT):
+ case GCBcase(GCB_L, GCB_L):
+ case GCBcase(GCB_L, GCB_V):
+ case GCBcase(GCB_L, GCB_LV):
+ case GCBcase(GCB_L, GCB_LVT):
return FALSE;
/* GB7. ( LV | V ) × ( V | T ) */
- case GCBcase(PL_GCB_LV, PL_GCB_V):
- case GCBcase(PL_GCB_LV, PL_GCB_T):
- case GCBcase(PL_GCB_V, PL_GCB_V):
- case GCBcase(PL_GCB_V, PL_GCB_T):
+ case GCBcase(GCB_LV, GCB_V):
+ case GCBcase(GCB_LV, GCB_T):
+ case GCBcase(GCB_V, GCB_V):
+ case GCBcase(GCB_V, GCB_T):
return FALSE;
/* GB8. ( LVT | T) × T */
- case GCBcase(PL_GCB_LVT, PL_GCB_T):
- case GCBcase(PL_GCB_T, PL_GCB_T):
+ case GCBcase(GCB_LVT, GCB_T):
+ case GCBcase(GCB_T, GCB_T):
return FALSE;
/* Do not break between regional indicator symbols.
GB8a. Regional_Indicator × Regional_Indicator */
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
+ case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
return FALSE;
/* Do not break before extending characters.
GB9. × Extend */
- case GCBcase(PL_GCB_Other, PL_GCB_Extend):
- case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
- case GCBcase(PL_GCB_L, PL_GCB_Extend):
- case GCBcase(PL_GCB_LV, PL_GCB_Extend):
- case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
- case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
- case GCBcase(PL_GCB_T, PL_GCB_Extend):
- case GCBcase(PL_GCB_V, PL_GCB_Extend):
+ case GCBcase(GCB_Other, GCB_Extend):
+ case GCBcase(GCB_Extend, GCB_Extend):
+ case GCBcase(GCB_L, GCB_Extend):
+ case GCBcase(GCB_LV, GCB_Extend):
+ case GCBcase(GCB_LVT, GCB_Extend):
+ case GCBcase(GCB_Prepend, GCB_Extend):
+ case GCBcase(GCB_Regional_Indicator, GCB_Extend):
+ case GCBcase(GCB_SpacingMark, GCB_Extend):
+ case GCBcase(GCB_T, GCB_Extend):
+ case GCBcase(GCB_V, GCB_Extend):
return FALSE;
/* Do not break before SpacingMarks, or after Prepend characters.
GB9a. × SpacingMark */
- case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
+ case GCBcase(GCB_Other, GCB_SpacingMark):
+ case GCBcase(GCB_Extend, GCB_SpacingMark):
+ case GCBcase(GCB_L, GCB_SpacingMark):
+ case GCBcase(GCB_LV, GCB_SpacingMark):
+ case GCBcase(GCB_LVT, GCB_SpacingMark):
+ case GCBcase(GCB_Prepend, GCB_SpacingMark):
+ case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
+ case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
+ case GCBcase(GCB_T, GCB_SpacingMark):
+ case GCBcase(GCB_V, GCB_SpacingMark):
return FALSE;
/* GB9b. Prepend × */
- case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
- case GCBcase(PL_GCB_Prepend, PL_GCB_L):
- case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
- case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
- case GCBcase(PL_GCB_Prepend, PL_GCB_T):
- case GCBcase(PL_GCB_Prepend, PL_GCB_V):
+ case GCBcase(GCB_Prepend, GCB_Other):
+ case GCBcase(GCB_Prepend, GCB_L):
+ case GCBcase(GCB_Prepend, GCB_LV):
+ case GCBcase(GCB_Prepend, GCB_LVT):
+ case GCBcase(GCB_Prepend, GCB_Prepend):
+ case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
+ case GCBcase(GCB_Prepend, GCB_T):
+ case GCBcase(GCB_Prepend, GCB_V):
return FALSE;
}
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
-#define WBcase(before, after) ((PL_WB_ENUM_COUNT * before) + after)
+#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
STATIC bool
-S_isWB(pTHX_ PL_WB_enum previous,
- PL_WB_enum before,
- PL_WB_enum after,
+S_isSB(pTHX_ SB_enum before,
+ SB_enum after,
+ const U8 * const strbeg,
+ const U8 * const curpos,
+ const U8 * const strend,
+ const bool utf8_target)
+{
+ /* returns a boolean indicating if there is a Sentence Boundary Break
+ * between the inputs. See http://www.unicode.org/reports/tr29/
+ *
+ * 'before' and 'after' are the SB values on either side of the position
+ * being tested, and 'curpos' is that position: every backwards scan
+ * below starts from it (via 'lpos'), as does the forwards scan for SB8
+ * (via 'rpos'). 'strbeg' and 'strend' are handed to backup_one_SB() and
+ * advance_one_SB() respectively as the limits of those scans, and
+ * 'utf8_target' is passed through to both helpers.
+ *
+ * The rules are tested in the order given in the standard, and the first
+ * one that applies determines the return value, so the order of the
+ * tests below matters. */
+
+ U8 * lpos = (U8 *) curpos; /* position used by the backwards scans */
+ U8 * temp_pos; /* scratch copy, so look-behinds can leave lpos alone */
+ SB_enum backup; /* value found by the most recent backwards scan */
+
+ PERL_ARGS_ASSERT_ISSB;
+
+ /* Break at the start and end of text.
+ SB1. sot ÷
+ SB2. ÷ eot */
+ if (before == SB_EDGE || after == SB_EDGE) {
+ return TRUE;
+ }
+
+ /* SB 3: Do not break within CRLF. */
+ if (before == SB_CR && after == SB_LF) {
+ return FALSE;
+ }
+
+ /* Break after paragraph separators. (though why CR and LF are considered
+ * so is beyond me (khw))
+ SB4. Sep | CR | LF ÷ */
+ if (before == SB_Sep || before == SB_CR || before == SB_LF) {
+ return TRUE;
+ }
+
+ /* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
+ * (See Section 6.2, Replacing Ignore Rules.)
+ SB5. X (Extend | Format)* → X
+ *
+ * If it is 'before' that is Extend or Format, it is replaced by whatever
+ * backup_one_SB() returns, and lpos is moved back by that call, so that
+ * the rules below work from that value and position instead. */
+ if (after == SB_Extend || after == SB_Format) {
+ return FALSE;
+ }
+
+ if (before == SB_Extend || before == SB_Format) {
+ before = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+
+ /* Do not break after ambiguous terminators like period, if they are
+ * immediately followed by a number or lowercase letter, if they are
+ * between uppercase letters, if the first following letter (optionally
+ * after certain punctuation) is lowercase, or if they are followed by
+ * "continuation" punctuation such as comma, colon, or semicolon. For
+ * example, a period may be an abbreviation or numeric period, and thus may
+ * not mark the end of a sentence.
+
+ * SB6. ATerm × Numeric */
+ if (before == SB_ATerm && after == SB_Numeric) {
+ return FALSE;
+ }
+
+ /* SB7. (Upper | Lower) ATerm × Upper
+ *
+ * The look-behind is done on a copy of lpos so that lpos itself is left
+ * where it is for the rules that follow. */
+ if (before == SB_ATerm && after == SB_Upper) {
+ temp_pos = lpos;
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if (backup == SB_Upper || backup == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ /* SB8a. (STerm | ATerm) Close* Sp* × (SContinue | STerm | ATerm)
+ * SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF )
+ *
+ * Starting from 'before', step backwards (again on a copy of lpos) over
+ * any run of Sp and then any run of Close; 'backup' ends up as the first
+ * value that is neither. */
+ backup = before;
+ temp_pos = lpos;
+ while (backup == SB_Sp) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_SContinue
+ || after == SB_STerm
+ || after == SB_ATerm
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
+ {
+ return FALSE;
+ }
+
+ /* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
+ * STerm | ATerm) )* Lower
+ *
+ * 'backup' still holds the result of the backwards scan just above.
+ * Starting with 'after', keep calling advance_one_SB() from curpos until
+ * the value is one of those listed in the rule, or SB_EDGE; the rule
+ * applies only if the scan stopped on Lower. */
+ if (backup == SB_ATerm) {
+ U8 * rpos = (U8 *) curpos;
+ SB_enum later = after;
+
+ while ( later != SB_OLetter
+ && later != SB_Upper
+ && later != SB_Lower
+ && later != SB_Sep
+ && later != SB_CR
+ && later != SB_LF
+ && later != SB_STerm
+ && later != SB_ATerm
+ && later != SB_EDGE)
+ {
+ later = advance_one_SB(&rpos, strend, utf8_target);
+ }
+ if (later == SB_Lower) {
+ return FALSE;
+ }
+ }
+
+ /* Break after sentence terminators, but include closing punctuation,
+ * trailing spaces, and a paragraph separator (if present). [See note
+ * below.]
+ * SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF )
+ *
+ * The backwards scan is restarted from 'before', this time stepping over
+ * Close only. */
+ backup = before;
+ temp_pos = lpos;
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ }
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_Close
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
+ {
+ return FALSE;
+ }
+
+
+ /* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷
+ *
+ * backup_one_SB() is first called once on a copy of lpos. If that
+ * returns Sep, CR, or LF, lpos is moved to the copy and the scan below
+ * continues from there; otherwise the scan restarts from 'before' with
+ * lpos unchanged. Unlike the earlier rules, the scans here move lpos
+ * itself, which is fine as nothing after this uses it. */
+ temp_pos = lpos;
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if ( backup == SB_Sep
+ || backup == SB_CR
+ || backup == SB_LF)
+ {
+ lpos = temp_pos;
+ }
+ else {
+ backup = before;
+ }
+ while (backup == SB_Sp) {
+ backup = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+ while (backup == SB_Close) {
+ backup = backup_one_SB(strbeg, &lpos, utf8_target);
+ }
+ if (backup == SB_STerm || backup == SB_ATerm) {
+ return TRUE;
+ }
+
+ /* Otherwise, do not break.
+ SB12. Any × Any */
+
+ return FALSE;
+}
+
+STATIC SB_enum
+S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
+{ /* Moves *curpos forward by one character, and then on past any
+ * characters whose SB value is Extend or Format, and returns the SB
+ * value of the character *curpos is left pointing at.
+ *
+ * Returns SB_EDGE, without looking up any value, if *curpos is already
+ * at or beyond 'strend' on entry (in which case *curpos is not moved),
+ * or if it gets there while advancing.
+ *
+ * 'utf8_target' selects between stepping by UTF8SKIP() and looking the
+ * value up from the UTF-8 sequence, or stepping by single bytes and
+ * looking the value up from the byte itself. */
+ SB_enum sb;
+
+ PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
+
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+
+ if (utf8_target) {
+ do { /* Always steps at least once before looking at a value */
+ *curpos += UTF8SKIP(*curpos);
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+ sb = getSB_VAL_UTF8(*curpos, strend);
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+ else {
+ do { /* Same as above, one byte at a time */
+ (*curpos)++;
+ if (*curpos >= strend) {
+ return SB_EDGE;
+ }
+ sb = getSB_VAL_CP(**curpos);
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+
+ return sb;
+}
+
+STATIC SB_enum
+S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+{ /* Moves *curpos backwards by one character, and then on back over any
+ * characters whose SB value is Extend or Format, and returns the SB
+ * value of the character that ends where *curpos is left pointing (that
+ * is, the one immediately to its left).
+ *
+ * Returns SB_EDGE if *curpos is before 'strbeg' on entry, or if there is
+ * no character to return the value of. When either loop below runs out
+ * of string, it also sets *curpos to 'strbeg' before returning. */
+ SB_enum sb;
+
+ PERL_ARGS_ASSERT_BACKUP_ONE_SB;
+
+ if (*curpos < strbeg) {
+ return SB_EDGE;
+ }
+
+ if (utf8_target) {
+ U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
+ if (! prev_char_pos) {
+ return SB_EDGE;
+ }
+
+ /* Back up over Extend and Format. curpos is always just to the right
+ * of the character whose value we are getting. Each iteration looks
+ * for the start of the character before prev_char_pos; if there is
+ * one, its value is fetched (bounded on the right by prev_char_pos)
+ * and both positions are shifted back by one character. */
+ do {
+ U8 * prev_prev_char_pos;
+ if ((prev_prev_char_pos = reghopmaybe3((U8 *) prev_char_pos, -1,
+ strbeg)))
+ {
+ sb = getSB_VAL_UTF8(prev_prev_char_pos, prev_char_pos);
+ *curpos = prev_char_pos;
+ prev_char_pos = prev_prev_char_pos;
+ }
+ else {
+ *curpos = (U8 *) strbeg;
+ return SB_EDGE;
+ }
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+ else {
+ do {
+ if (*curpos - 2 < strbeg) { /* Fewer than 2 bytes precede *curpos */
+ *curpos = (U8 *) strbeg;
+ return SB_EDGE;
+ }
+ (*curpos)--;
+ sb = getSB_VAL_CP(*(*curpos - 1)); /* The byte left of new *curpos */
+ } while (sb == SB_Extend || sb == SB_Format);
+ }
+
+ return sb;
+}
+
+#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
+
+STATIC bool
+S_isWB(pTHX_ WB_enum previous,
+ WB_enum before,
+ WB_enum after,
const U8 * const strbeg,
const U8 * const curpos,
const U8 * const strend,
* a Unicode word break, using their published algorithm. Context may be
* needed to make this determination. If the value for the character
* before 'before' is known, it is passed as 'previous'; otherwise that
- * should be set to PL_WB_UNKNOWN. The other input parameters give the
+ * should be set to WB_UNKNOWN. The other input parameters give the
* boundaries and current position in the matching of the string. That
* is, 'curpos' marks the position where the character whose wb value is
* 'after' begins. See http://www.unicode.org/reports/tr29/ */
PERL_ARGS_ASSERT_ISWB;
/* WB1 and WB2: Break at the start and end of text. */
- if (before == PL_WB_EDGE || after == PL_WB_EDGE) {
+ if (before == WB_EDGE || after == WB_EDGE) {
return TRUE;
}
/* WB 3: Do not break within CRLF. */
- if (before == PL_WB_CR && after == PL_WB_LF) {
+ if (before == WB_CR && after == WB_LF) {
return FALSE;
}
/* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
* and LF) */
- if ( before == PL_WB_CR || before == PL_WB_LF || before == PL_WB_Newline
- || after == PL_WB_CR || after == PL_WB_LF || after == PL_WB_Newline)
+ if ( before == WB_CR || before == WB_LF || before == WB_Newline
+ || after == WB_CR || after == WB_LF || after == WB_Newline)
{
return TRUE;
}
* beginning of a region of text.
* WB4. X (Extend | Format)* → X. */
- if (after == PL_WB_Extend || after == PL_WB_Format) {
+ if (after == WB_Extend || after == WB_Format) {
return FALSE;
}
- if (before == PL_WB_Extend || before == PL_WB_Format) {
+ if (before == WB_Extend || before == WB_Format) {
before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
}
/* Do not break between most letters.
WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_ALetter, PL_WB_ALetter):
- case WBcase(PL_WB_ALetter, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_ALetter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Hebrew_Letter):
+ case WBcase(WB_ALetter, WB_ALetter):
+ case WBcase(WB_ALetter, WB_Hebrew_Letter):
+ case WBcase(WB_Hebrew_Letter, WB_ALetter):
+ case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
return FALSE;
/* Do not break letters across certain punctuation.
WB6. (ALetter | Hebrew_Letter)
× (MidLetter | MidNumLet | Single_Quote) (ALetter
| Hebrew_Letter) */
- case WBcase(PL_WB_ALetter, PL_WB_MidLetter):
- case WBcase(PL_WB_ALetter, PL_WB_MidNumLet):
- case WBcase(PL_WB_ALetter, PL_WB_Single_Quote):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidLetter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidNumLet):
- /*case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):*/
+ case WBcase(WB_ALetter, WB_MidLetter):
+ case WBcase(WB_ALetter, WB_MidNumLet):
+ case WBcase(WB_ALetter, WB_Single_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_MidLetter):
+ case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
+ /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
after = advance_one_WB(&after_pos, strend, utf8_target);
- return after != PL_WB_ALetter && after != PL_WB_Hebrew_Letter;
+ return after != WB_ALetter && after != WB_Hebrew_Letter;
/* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
* Single_Quote) × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_MidLetter, PL_WB_ALetter):
- case WBcase(PL_WB_MidLetter, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_MidNumLet, PL_WB_ALetter):
- case WBcase(PL_WB_MidNumLet, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_Single_Quote, PL_WB_ALetter):
- case WBcase(PL_WB_Single_Quote, PL_WB_Hebrew_Letter):
+ case WBcase(WB_MidLetter, WB_ALetter):
+ case WBcase(WB_MidLetter, WB_Hebrew_Letter):
+ case WBcase(WB_MidNumLet, WB_ALetter):
+ case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_Single_Quote, WB_ALetter):
+ case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
before
= backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
- return before != PL_WB_ALetter && before != PL_WB_Hebrew_Letter;
+ return before != WB_ALetter && before != WB_Hebrew_Letter;
/* WB7a. Hebrew_Letter × Single_Quote */
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
return FALSE;
/* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Double_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
return advance_one_WB(&after_pos, strend, utf8_target)
- != PL_WB_Hebrew_Letter;
+ != WB_Hebrew_Letter;
/* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
- case WBcase(PL_WB_Double_Quote, PL_WB_Hebrew_Letter):
+ case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != PL_WB_Hebrew_Letter;
+ != WB_Hebrew_Letter;
/* Do not break within sequences of digits, or digits adjacent to
* letters (“3a”, or “A3”).
WB8. Numeric × Numeric */
- case WBcase(PL_WB_Numeric, PL_WB_Numeric):
+ case WBcase(WB_Numeric, WB_Numeric):
return FALSE;
/* WB9. (ALetter | Hebrew_Letter) × Numeric */
- case WBcase(PL_WB_ALetter, PL_WB_Numeric):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Numeric):
+ case WBcase(WB_ALetter, WB_Numeric):
+ case WBcase(WB_Hebrew_Letter, WB_Numeric):
return FALSE;
/* WB10. Numeric × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_Numeric, PL_WB_ALetter):
- case WBcase(PL_WB_Numeric, PL_WB_Hebrew_Letter):
+ case WBcase(WB_Numeric, WB_ALetter):
+ case WBcase(WB_Numeric, WB_Hebrew_Letter):
return FALSE;
/* Do not break within sequences, such as “3.2” or “3,456.789”.
WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
*/
- case WBcase(PL_WB_MidNum, PL_WB_Numeric):
- case WBcase(PL_WB_MidNumLet, PL_WB_Numeric):
- case WBcase(PL_WB_Single_Quote, PL_WB_Numeric):
+ case WBcase(WB_MidNum, WB_Numeric):
+ case WBcase(WB_MidNumLet, WB_Numeric):
+ case WBcase(WB_Single_Quote, WB_Numeric):
return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != PL_WB_Numeric;
+ != WB_Numeric;
/* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
* */
- case WBcase(PL_WB_Numeric, PL_WB_MidNum):
- case WBcase(PL_WB_Numeric, PL_WB_MidNumLet):
- case WBcase(PL_WB_Numeric, PL_WB_Single_Quote):
+ case WBcase(WB_Numeric, WB_MidNum):
+ case WBcase(WB_Numeric, WB_MidNumLet):
+ case WBcase(WB_Numeric, WB_Single_Quote):
return advance_one_WB(&after_pos, strend, utf8_target)
- != PL_WB_Numeric;
+ != WB_Numeric;
/* Do not break between Katakana.
WB13. Katakana × Katakana */
- case WBcase(PL_WB_Katakana, PL_WB_Katakana):
+ case WBcase(WB_Katakana, WB_Katakana):
return FALSE;
/* Do not break from extenders.
WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
ExtendNumLet) × ExtendNumLet */
- case WBcase(PL_WB_ALetter, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Numeric, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Katakana, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_ExtendNumLet):
+ case WBcase(WB_ALetter, WB_ExtendNumLet):
+ case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
+ case WBcase(WB_Numeric, WB_ExtendNumLet):
+ case WBcase(WB_Katakana, WB_ExtendNumLet):
+ case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
return FALSE;
/* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
* | Katakana) */
- case WBcase(PL_WB_ExtendNumLet, PL_WB_ALetter):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Numeric):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Katakana):
+ case WBcase(WB_ExtendNumLet, WB_ALetter):
+ case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_ExtendNumLet, WB_Numeric):
+ case WBcase(WB_ExtendNumLet, WB_Katakana):
return FALSE;
/* Do not break between regional indicator symbols.
WB13c. Regional_Indicator × Regional_Indicator */
- case WBcase(PL_WB_Regional_Indicator, PL_WB_Regional_Indicator):
+ case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
return FALSE;
}
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
-STATIC PL_WB_enum
+STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
{
- PL_WB_enum wb;
+ WB_enum wb; /* Steps *curpos forward past the character it currently
+ * points at, and then past any following characters whose WB value is
+ * WB_Extend or WB_Format, and returns the WB value of the character it
+ * stops on; *curpos is updated in place.  Returns WB_EDGE if *curpos is
+ * at or beyond 'strend' on entry or reaches it while stepping.
+ * 'utf8_target' selects between stepping by UTF8SKIP() and stepping one
+ * byte at a time. */
PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE; /* Nothing to the right to advance over */
}
if (utf8_target) {
do {
*curpos += UTF8SKIP(*curpos);
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE; /* Ran off the end while skipping */
}
wb = getWB_VAL_UTF8(*curpos, strend);
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format); /* Keep going over Extend/Format */
}
else {
do {
(*curpos)++;
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE; /* Ran off the end while skipping */
}
wb = getWB_VAL_CP(**curpos);
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format); /* Keep going over Extend/Format */
}
return wb;
}
-STATIC PL_WB_enum
-S_backup_one_WB(pTHX_ PL_WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+STATIC WB_enum
+S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- PL_WB_enum wb;
+ WB_enum wb;
PERL_ARGS_ASSERT_BACKUP_ONE_WB;
/* If we know what the previous character's break value is, don't have
* to look it up */
- if (*previous != PL_WB_UNKNOWN) {
+ if (*previous != WB_UNKNOWN) {
wb = *previous;
- *previous = PL_WB_UNKNOWN;
+ *previous = WB_UNKNOWN;
/* XXX Note that doesn't change curpos, and maybe should */
/* But we always back up over these two types */
- if (wb != PL_WB_Extend && wb != PL_WB_Format) {
+ if (wb != WB_Extend && wb != WB_Format) {
return wb;
}
}
if (*curpos < strbeg) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
if (utf8_target) {
U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
if (! prev_char_pos) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
/* Back up over Extend and Format. curpos is always just to the right
}
else {
*curpos = (U8 *) strbeg;
- return PL_WB_EDGE;
+ return WB_EDGE;
}
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
else {
do {
if (*curpos - 2 < strbeg) {
*curpos = (U8 *) strbeg;
- return PL_WB_EDGE;
+ return WB_EDGE;
}
(*curpos)--;
wb = getWB_VAL_CP(*(*curpos - 1));
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
return wb;
rex->offs[0].start = locinput - reginfo->strbeg;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case KEEPS_next_fail:
/* rollback the start point change */
rex->offs[0].start = st->u.keeper.val;
sayNO_SILENT;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case MEOL: /* /..$/m */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
goto increment_locinput;
- case CANY: /* \C */
- if (NEXTCHR_IS_EOS)
- sayNO;
- locinput++;
- break;
-
case REG_ANY: /* /./ */
if ((NEXTCHR_IS_EOS) || nextchr == '\n')
sayNO;
);
sayNO_SILENT;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
/* FALLTHROUGH */
case TRIE: /* (ab|cd) */
goto trie_first_try; /* jump into the fail handler */
}}
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case TRIE_next_fail: /* we failed - try next alternative */
{
if (ST.accepted > 1 || has_cutgroup) {
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
/* only one choice left - just continue */
DEBUG_EXECUTE_r({
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
{
sayNO;
}
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
{
sayNO;
}
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
+ {
+ bool b1, b2;
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(scan) != TRADITIONAL_BOUND) {
if (utf8_target) {
if (locinput == reginfo->strbeg)
- ln = isWORDCHAR_LC('\n');
+ b1 = isWORDCHAR_LC('\n');
else {
- ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
+ b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
}
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC_utf8((U8*)locinput);
}
else { /* Here the string isn't utf8 */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(nextchr);
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUND: /* /\B/ */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDA: /* /\b/a */
+ {
+ bool b1, b2;
bound_ascii_match_only:
/* Here the string isn't utf8, or is utf8 and only ascii characters
* 2) it is a multi-byte character, in which case the final byte is
* never mistakable for ASCII, and so the test will say it is
* not a word character, which is the correct answer. */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(nextchr);
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUNDU: /* /\B/u */
to_complement = 1;
bound_utf8:
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
- ? isWORDCHAR_L1('\n')
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
- n = (NEXTCHR_IS_EOS)
- ? isWORDCHAR_L1('\n')
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8((U8*)locinput);
- match = ln != n;
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE; /* GCB always matches at begin and
(U8*) reginfo->strend));
}
break;
+
+ case SB_BOUND: /* Always matches at begin and end */
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isSB(getSB_VAL_UTF8(
+ reghop3((U8*)locinput,
+ -1,
+ (U8*)(reginfo->strbeg)),
+ (U8*) reginfo->strend),
+ getSB_VAL_UTF8((U8*) locinput,
+ (U8*) reginfo->strend),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case WB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
}
else {
- match = isWB(PL_WB_UNKNOWN,
+ match = isWB(WB_UNKNOWN,
getWB_VAL_UTF8(
reghop3((U8*)locinput,
-1,
else { /* Not utf8 target */
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
- ? isWORDCHAR_L1('\n')
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
- ? isWORDCHAR_L1('\n')
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(nextchr);
- match = ln != n;
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
|| UCHARAT(locinput) != '\n';
}
break;
+
+ case SB_BOUND: /* Always matches at begin and end */
+ if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
+ match = TRUE;
+ }
+ else {
+ match = isSB(getSB_VAL_CP(UCHARAT(locinput -1)),
+ getSB_VAL_CP(UCHARAT(locinput)),
+ (U8*) reginfo->strbeg,
+ (U8*) locinput,
+ (U8*) reginfo->strend,
+ utf8_target);
+ }
+ break;
+
case WB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
}
else {
- match = isWB(PL_WB_UNKNOWN,
+ match = isWB(WB_UNKNOWN,
getWB_VAL_CP(UCHARAT(locinput -1)),
getWB_VAL_CP(UCHARAT(locinput)),
(U8*) reginfo->strbeg,
case ANYOFL: /* /[abc]/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
+ {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD: /* /[abc]/d */
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1))))))
{
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
FLAGS(scan)))))
{
}
else { /* Here, uses macros to find above Latin-1 code points */
switch (classnum) {
- case _CC_ENUM_SPACE: /* XXX would require separate
- code if we revert the change
- of \v matching this */
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
if (! (to_complement
^ cBOOL(is_XPERLSPACE_high(locinput))))
{
else {
/* Get the gcb type for the current character */
- PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
(U8*) reginfo->strend);
/* Then scan through the input until we get to the first
* end-of-input) */
locinput += UTF8SKIP(locinput);
while (locinput < reginfo->strend) {
- PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
(U8*) reginfo->strend);
if (isGCB(prev_gcb, cur_gcb)) {
break;
/* now continue from first node in postoned RE */
PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
case EVAL_AB: /* cleanup after a successful (??{A})B */
break;
case ACCEPT: /* (*ACCEPT) */
- if (ARG(scan)){
+ if (scan->flags)
+ sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (ARG2L(scan)){
regnode *cursor;
for (cursor=scan;
cursor && OP(cursor)!=END;
PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
case CURLYX_end: /* just finished matching all of A*B */
cur_curlyx = ST.prev_curlyx;
sayYES;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CURLYX_end_fail: /* just failed to match all of A*B */
regcpblow(ST.cp);
cur_curlyx = ST.prev_curlyx;
sayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
#undef ST
PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
/* If degenerate A matches "", assume A done. */
PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
/* Prefer A over B for maximal matching. */
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
goto do_whilem_B_max;
}
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_min: /* just matched B in a minimal match */
case WHILEM_B_max: /* just matched B in a maximal match */
cur_curlyx = ST.save_curlyx;
sayYES;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
cur_curlyx = ST.save_curlyx;
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
/* FALLTHROUGH */
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
REGCP_UNWIND(ST.lastcp);
PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
cur_curlyx = ST.save_curlyx;
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
#undef ST
#define ST st->u.branch
PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
}
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CUTGROUP: /* /(*THEN)/ */
- sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
- MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ sv_yes_mark = st->u.mark.mark_name = scan->flags
+ ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
+ : NULL;
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CUTGROUP_next_fail:
do_cutgroup = 1;
sv_commit = st->u.mark.mark_name;
sayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case BRANCH_next:
sayYES;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case BRANCH_next_fail: /* that branch failed; try the next, if any */
if (do_cutgroup) {
curlym_do_A: /* execute the A in /A{m,n}B/ */
PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CURLYM_A: /* we've just matched an A */
ST.count++;
PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CURLYM_B_fail: /* just failed to match a B */
REGCP_UNWIND(ST.cp);
goto curly_try_B_max;
}
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CURLY_B_min_known_fail:
/* failed to find B in a non-greedy match where c1,c2 valid */
PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case CURLY_B_min_fail:
/* failed to find B in a non-greedy match where c1,c2 invalid */
}
sayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
curly_try_B_max:
/* a successful greedy match: now try to match B */
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
}
/* FALLTHROUGH */
/* execute body of (?...A) */
PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
case IFMATCH_A_fail: /* body of (?...A) failed */
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
- if (!scan->flags)
+ if (scan->flags)
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(COMMIT_next, next, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case COMMIT_next_fail:
no_final = 1;
/* FALLTHROUGH */
+ sayNO;
+ NOT_REACHED; /* NOTREACHED */
case OPFAIL: /* (*FAIL) */
- sayNO;
+ if (scan->flags)
+ sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (logical) {
+ /* deal with (?(?!)X|Y) properly,
+ * make sure we trigger the no branch
+ * of the trailing IFTHEN structure*/
+ sw= 0;
+ break;
+ } else {
+ sayNO;
+ }
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
#define ST st->u.mark
case MARKPOINT: /* (*MARK:foo) */
ST.mark_loc = locinput;
PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case MARKPOINT_next:
mark_state = ST.prev_mark;
sayYES;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case MARKPOINT_next_fail:
if (popmark && sv_eq(ST.mark_name,popmark))
mark_state->u.mark.mark_name : NULL;
sayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
case SKIP: /* (*SKIP) */
- if (scan->flags) {
+ if (!scan->flags) {
/* (*SKIP) : if we fail we cut here*/
ST.mark_name = NULL;
ST.mark_loc = locinput;
no_final = 1;
sayNO;
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
#undef ST
case LNBREAK: /* \R */
Perl_croak(aTHX_ "corrupted regexp pointers");
/* NOTREACHED */
sayNO;
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
yes:
if (yes_state) {
char *loceol = reginfo->strend; /* local version */
I32 hardcount = 0; /* How many matches so far */
bool utf8_target = reginfo->is_utf8_target;
- int to_complement = 0; /* Invert the result? */
+ unsigned int to_complement = 0; /* Invert the result? */
UV utf8_flags;
_char_class_number classnum;
#ifndef DEBUGGING
else
scan = loceol;
break;
- case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
- if (utf8_target && loceol - scan > max) {
-
- /* <loceol> hadn't been adjusted in the UTF-8 case */
- scan += max;
- }
- else {
- scan = loceol;
- }
- break;
case EXACTL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
}
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
while (hardcount < max
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
*(scan + 1)),
classnum))))
{
* code is written for making the loops as tight as possible.
* It could be refactored to save space instead */
switch (classnum) {
- case _CC_ENUM_SPACE: /* XXX would require separate code
- if we revert the change of \v
- matching this */
- /* FALLTHROUGH */
- case _CC_ENUM_PSXSPC:
+ case _CC_ENUM_SPACE:
while (hardcount < max
&& scan < loceol
&& (to_complement ^ cBOOL(isSPACE_utf8(scan))))
default:
Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
/* NOTREACHED */
- NOT_REACHED;
+ NOT_REACHED; /* NOTREACHED */
}
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
- if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
_CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
}
}
if (c < NUM_ANYOF_CODE_POINTS) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
+ else if ((flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) == ANYOFD
&& ! utf8_target
&& ! isASCII(c))
{
}
if (UNICODE_IS_SUPER(c)
- && (flags & ANYOF_WARN_SUPER)
+ && (flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) != ANYOFD
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > llim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/