*/
/*
- * One Ring to rule them all, One Ring to find them
- &
+ * One Ring to rule them all, One Ring to find them
+ *
* [p.v of _The Lord of the Rings_, opening poem]
* [p.50 of _The Lord of the Rings_, I/iii: "The Shadow of the Past"]
* [p.254 of _The Lord of the Rings_, II/ii: "The Council of Elrond"]
#include "re_top.h"
#endif
-#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
- "Use of \\b{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
-
/*
* pregcomp and pregexec -- regsub and regerror are not used in perl
*
# include "regcomp.h"
#endif
-#include "inline_invlist.c"
+#include "invlist_inline.h"
#include "unicode_constants.h"
/* Warning text for \b{} or \B{} used under a locale that is not UTF-8.  It is
 * passed to Perl_ck_warner() with packWARN(WARN_LOCALE) in the BOUNDL and
 * NBOUNDL cases further down, and only when IN_UTF8_CTYPE_LOCALE is false. */
+#define B_ON_NON_UTF8_LOCALE_IS_WRONG \
+ "Use of \\b{} or \\B{} for non-UTF-8 locale is wrong. Assuming a UTF-8 locale"
+
/* Warning text for (?[ ]) used under a locale that is not UTF-8.  It is passed
 * to Perl_ck_warner() with packWARN(WARN_LOCALE) in the ANYOFL cases further
 * down, when the node has ANYOF_LOC_REQ_UTF8 set and IN_UTF8_CTYPE_LOCALE is
 * false. */
+static const char utf8_locale_required[] =
+ "Use of (?[ ]) for non-UTF-8 locale is wrong. Assuming a UTF-8 locale";
+
#ifdef DEBUGGING
/* At least one required character in the target string is expressible only in
* UTF-8. */
PL_utf8_swash_ptrs[_CC_WORDCHAR], \
"", \
PL_XPosix_ptrs[_CC_WORDCHAR], \
- LATIN_CAPITAL_LETTER_SHARP_S_UTF8);
+ LATIN_SMALL_LIGATURE_LONG_S_T_UTF8);
#define PLACEHOLDER /* Something for the preprocessor to grab onto */
/* TODO: Combine JUMPABLE and HAS_TEXT to cache OP(rn) */
}
else if (UTF8_IS_DOWNGRADEABLE_START(*character)) {
return isFOO_lc(classnum,
- TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
+ EIGHT_BIT_UTF8_TO_NATIVE(*character, *(character + 1)));
}
_CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
* caller will have set strpos=pos()-4; we look for the substr
* at position pos()-4+1, which lines up with the "a" */
- if (prog->check_offset_min == prog->check_offset_max
- && !(prog->intflags & PREGf_CANY_SEEN))
- {
+ if (prog->check_offset_min == prog->check_offset_max) {
/* Substring at constant offset from beg-of-str... */
SSize_t slen = SvCUR(check);
char *s = HOP3c(strpos, prog->check_offset_min, strend);
(IV)prog->check_end_shift);
});
- if (prog->intflags & PREGf_CANY_SEEN) {
- start_point= (U8*)(rx_origin + start_shift);
- end_point= (U8*)(strend - end_shift);
- if (start_point > end_point)
- goto fail_finish;
- } else {
- end_point = HOP3(strend, -end_shift, strbeg);
- start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
- if (!start_point)
- goto fail_finish;
- }
+ end_point = HOP3(strend, -end_shift, strbeg);
+ start_point = HOPMAYBE3(rx_origin, start_shift, end_point);
+ if (!start_point)
+ goto fail_finish;
/* If the regex is absolutely anchored to either the start of the
* didn't contradict, so just retry the anchored "other"
* substr */
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
- " Found /%s^%s/m, rescanning for anchored from offset %ld (rx_origin now %"IVdf")...\n",
+ " Found /%s^%s/m, rescanning for anchored from offset %"IVdf" (rx_origin now %"IVdf")...\n",
PL_colors[0], PL_colors[1],
- (long)(rx_origin - strbeg + prog->anchored_offset),
- (long)(rx_origin - strbeg)
+ (IV)(rx_origin - strbeg + prog->anchored_offset),
+ (IV)(rx_origin - strbeg)
));
goto do_other_substr;
}
} else { \
uvc = _to_utf8_fold_flags( (const U8*) uc, foldbuf, &foldlen, flags); \
len = UTF8SKIP(uc); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
} else { \
len = 1; \
uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, flags); \
- skiplen = UNISKIP( uvc ); \
+ skiplen = UVCHR_SKIP( uvc ); \
foldlen -= skiplen; \
uscan = foldbuf + skiplen; \
} \
/* Expands to _generic_GET_BREAK_VAL_CP() for code point 'cp', using
 * PL_GCB_invlist together with the GCB invmap table (named
 * Grapheme_Cluster_Break_invmap before this change, _Perl_GCB_invmap after).
 * NOTE(review): presumably yields the GCB enum value of 'cp'; the generic
 * macro is defined outside this excerpt -- confirm there. */
#define getGCB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_GCB_invlist, \
- Grapheme_Cluster_Break_invmap, \
+ _Perl_GCB_invmap, \
(cp))
/* Returns the GCB value for the first code point in the UTF-8 encoded string
/* Expands to _generic_GET_BREAK_VAL_CP() for code point 'cp', using
 * PL_SB_invlist together with the SB invmap table (named
 * Sentence_Break_invmap before this change, _Perl_SB_invmap after).  Callers
 * in this file store the result in an SB enum variable.  The generic macro
 * itself is defined outside this excerpt. */
#define getSB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_SB_invlist, \
- Sentence_Break_invmap, \
+ _Perl_SB_invmap, \
(cp))
/* Returns the SB value for the first code point in the UTF-8 encoded string
/* Expands to _generic_GET_BREAK_VAL_CP() for code point 'cp', using
 * PL_WB_invlist together with the WB invmap table (named Word_Break_invmap
 * before this change, _Perl_WB_invmap after).  Callers in this file store the
 * result in a WB enum variable.  The generic macro itself is defined outside
 * this excerpt. */
#define getWB_VAL_CP(cp) \
_generic_GET_BREAK_VAL_CP( \
PL_WB_invlist, \
- Word_Break_invmap, \
+ _Perl_WB_invmap, \
(cp))
/* Returns the WB value for the first code point in the UTF-8 encoded string
switch (OP(c)) {
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(c) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
+
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
REXEC_FBC_CLASS_SCAN(REGINCLASS(prog, c, (U8*)s));
}
break;
- case CANY:
- REXEC_FBC_SCAN(
- if (tmp && (reginfo->intuit || regtry(reginfo, &s)))
- goto got_it;
- else
- tmp = doevery;
- );
- break;
case EXACTFA_NO_TRIE: /* This node only generated for non-utf8 patterns */
assert(! is_utf8_pat);
case BOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(c) != TRADITIONAL_BOUND) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
goto do_boundu;
}
case NBOUNDL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(c) != TRADITIONAL_BOUND) {
- Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
+ if (! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE),
B_ON_NON_UTF8_LOCALE_IS_WRONG);
+ }
goto do_nboundu;
}
}
if (utf8_target) {
- PL_GCB_enum before = getGCB_VAL_UTF8(
+ GCB_enum before = getGCB_VAL_UTF8(
reghop3((U8*)s, -1,
(U8*)(reginfo->strbeg)),
(U8*) reginfo->strend);
while (s < strend) {
- PL_GCB_enum after = getGCB_VAL_UTF8((U8*) s,
+ GCB_enum after = getGCB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
if (to_complement ^ isGCB(before, after)) {
if (reginfo->intuit || regtry(reginfo, &s)) {
}
if (utf8_target) {
- PL_SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
+ SB_enum before = getSB_VAL_UTF8(reghop3((U8*)s,
-1,
(U8*)(reginfo->strbeg)),
(U8*) reginfo->strend);
while (s < strend) {
- PL_SB_enum after = getSB_VAL_UTF8((U8*) s,
+ SB_enum after = getSB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
if (to_complement ^ isSB(before,
after,
}
}
else { /* Not utf8. */
- PL_SB_enum before = getSB_VAL_CP((U8) *(s -1));
+ SB_enum before = getSB_VAL_CP((U8) *(s -1));
while (s < strend) {
- PL_SB_enum after = getSB_VAL_CP((U8) *s);
+ SB_enum after = getSB_VAL_CP((U8) *s);
if (to_complement ^ isSB(before,
after,
(U8*) reginfo->strbeg,
* loop through the line. Context may be needed to make a
* determination, and if so, this can save having to
* recalculate it */
- PL_WB_enum previous = PL_WB_UNKNOWN;
- PL_WB_enum before = getWB_VAL_UTF8(
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_UTF8(
reghop3((U8*)s,
-1,
(U8*)(reginfo->strbeg)),
(U8*) reginfo->strend);
while (s < strend) {
- PL_WB_enum after = getWB_VAL_UTF8((U8*) s,
+ WB_enum after = getWB_VAL_UTF8((U8*) s,
(U8*) reginfo->strend);
if (to_complement ^ isWB(previous,
before,
}
}
else { /* Not utf8. */
- PL_WB_enum previous = PL_WB_UNKNOWN;
- PL_WB_enum before = getWB_VAL_CP((U8) *(s -1));
+ WB_enum previous = WB_UNKNOWN;
+ WB_enum before = getWB_VAL_CP((U8) *(s -1));
while (s < strend) {
- PL_WB_enum after = getWB_VAL_CP((U8) *s);
+ WB_enum after = getWB_VAL_CP((U8) *s);
if (to_complement ^ isWB(previous,
before,
after,
classnum)))
|| (UTF8_IS_DOWNGRADEABLE_START(*s)
&& to_complement ^ cBOOL(
- _generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*s,
+ _generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*s,
*(s + 1)),
classnum))))
{
if (minlen) {
const OPCODE op = OP(progi->regstclass);
/* don't bother with what can't match */
- if (PL_regkind[op] != EXACT && op != CANY && PL_regkind[op] != TRIE)
+ if (PL_regkind[op] != EXACT && PL_regkind[op] != TRIE)
strend = HOPc(strend, -(minlen - 1));
}
DEBUG_EXECUTE_r({
if (pref0_len > pref_len)
pref0_len = pref_len;
{
- const int is_uni = (utf8_target && OP(scan) != CANY) ? 1 : 0;
+ const int is_uni = utf8_target ? 1 : 0;
RE_PV_COLOR_DECL(s0,len0,is_uni,PERL_DEBUG_PAD(0),
(locinput - pref_len),pref0_len, 60, 4, 5);
/* This creates a single number by combining two, with 'before' being like the
* 10's digit, but this isn't necessarily base 10; it is base however many
* elements of the enum there are */
-#define GCBcase(before, after) ((PL_GCB_ENUM_COUNT * before) + after)
+#define GCBcase(before, after) ((GCB_ENUM_COUNT * before) + after)
STATIC bool
-S_isGCB(const PL_GCB_enum before, const PL_GCB_enum after)
+S_isGCB(const GCB_enum before, const GCB_enum after)
{
/* returns a boolean indicating if there is a Grapheme Cluster Boundary
* between the inputs. See http://www.unicode.org/reports/tr29/ */
/* Do not break between a CR and LF.
GB3. CR × LF */
- case GCBcase(PL_GCB_CR, PL_GCB_LF):
+ case GCBcase(GCB_CR, GCB_LF):
return FALSE;
/* Do not break Hangul syllable sequences.
GB6. L × ( L | V | LV | LVT ) */
- case GCBcase(PL_GCB_L, PL_GCB_L):
- case GCBcase(PL_GCB_L, PL_GCB_V):
- case GCBcase(PL_GCB_L, PL_GCB_LV):
- case GCBcase(PL_GCB_L, PL_GCB_LVT):
+ case GCBcase(GCB_L, GCB_L):
+ case GCBcase(GCB_L, GCB_V):
+ case GCBcase(GCB_L, GCB_LV):
+ case GCBcase(GCB_L, GCB_LVT):
return FALSE;
/* GB7. ( LV | V ) × ( V | T ) */
- case GCBcase(PL_GCB_LV, PL_GCB_V):
- case GCBcase(PL_GCB_LV, PL_GCB_T):
- case GCBcase(PL_GCB_V, PL_GCB_V):
- case GCBcase(PL_GCB_V, PL_GCB_T):
+ case GCBcase(GCB_LV, GCB_V):
+ case GCBcase(GCB_LV, GCB_T):
+ case GCBcase(GCB_V, GCB_V):
+ case GCBcase(GCB_V, GCB_T):
return FALSE;
/* GB8. ( LVT | T) × T */
- case GCBcase(PL_GCB_LVT, PL_GCB_T):
- case GCBcase(PL_GCB_T, PL_GCB_T):
+ case GCBcase(GCB_LVT, GCB_T):
+ case GCBcase(GCB_T, GCB_T):
return FALSE;
/* Do not break between regional indicator symbols.
GB8a. Regional_Indicator × Regional_Indicator */
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Regional_Indicator):
+ case GCBcase(GCB_Regional_Indicator, GCB_Regional_Indicator):
return FALSE;
/* Do not break before extending characters.
GB9. × Extend */
- case GCBcase(PL_GCB_Other, PL_GCB_Extend):
- case GCBcase(PL_GCB_Extend, PL_GCB_Extend):
- case GCBcase(PL_GCB_L, PL_GCB_Extend):
- case GCBcase(PL_GCB_LV, PL_GCB_Extend):
- case GCBcase(PL_GCB_LVT, PL_GCB_Extend):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Extend):
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_Extend):
- case GCBcase(PL_GCB_SpacingMark, PL_GCB_Extend):
- case GCBcase(PL_GCB_T, PL_GCB_Extend):
- case GCBcase(PL_GCB_V, PL_GCB_Extend):
+ case GCBcase(GCB_Other, GCB_Extend):
+ case GCBcase(GCB_Extend, GCB_Extend):
+ case GCBcase(GCB_L, GCB_Extend):
+ case GCBcase(GCB_LV, GCB_Extend):
+ case GCBcase(GCB_LVT, GCB_Extend):
+ case GCBcase(GCB_Prepend, GCB_Extend):
+ case GCBcase(GCB_Regional_Indicator, GCB_Extend):
+ case GCBcase(GCB_SpacingMark, GCB_Extend):
+ case GCBcase(GCB_T, GCB_Extend):
+ case GCBcase(GCB_V, GCB_Extend):
return FALSE;
/* Do not break before SpacingMarks, or after Prepend characters.
GB9a. × SpacingMark */
- case GCBcase(PL_GCB_Other, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Extend, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_L, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_LV, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_LVT, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Prepend, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_Regional_Indicator, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_SpacingMark, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_T, PL_GCB_SpacingMark):
- case GCBcase(PL_GCB_V, PL_GCB_SpacingMark):
+ case GCBcase(GCB_Other, GCB_SpacingMark):
+ case GCBcase(GCB_Extend, GCB_SpacingMark):
+ case GCBcase(GCB_L, GCB_SpacingMark):
+ case GCBcase(GCB_LV, GCB_SpacingMark):
+ case GCBcase(GCB_LVT, GCB_SpacingMark):
+ case GCBcase(GCB_Prepend, GCB_SpacingMark):
+ case GCBcase(GCB_Regional_Indicator, GCB_SpacingMark):
+ case GCBcase(GCB_SpacingMark, GCB_SpacingMark):
+ case GCBcase(GCB_T, GCB_SpacingMark):
+ case GCBcase(GCB_V, GCB_SpacingMark):
return FALSE;
/* GB9b. Prepend × */
- case GCBcase(PL_GCB_Prepend, PL_GCB_Other):
- case GCBcase(PL_GCB_Prepend, PL_GCB_L):
- case GCBcase(PL_GCB_Prepend, PL_GCB_LV):
- case GCBcase(PL_GCB_Prepend, PL_GCB_LVT):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Prepend):
- case GCBcase(PL_GCB_Prepend, PL_GCB_Regional_Indicator):
- case GCBcase(PL_GCB_Prepend, PL_GCB_T):
- case GCBcase(PL_GCB_Prepend, PL_GCB_V):
+ case GCBcase(GCB_Prepend, GCB_Other):
+ case GCBcase(GCB_Prepend, GCB_L):
+ case GCBcase(GCB_Prepend, GCB_LV):
+ case GCBcase(GCB_Prepend, GCB_LVT):
+ case GCBcase(GCB_Prepend, GCB_Prepend):
+ case GCBcase(GCB_Prepend, GCB_Regional_Indicator):
+ case GCBcase(GCB_Prepend, GCB_T):
+ case GCBcase(GCB_Prepend, GCB_V):
return FALSE;
}
/* Folds two SB enum values into one integer: 'before' multiplied by
 * SB_ENUM_COUNT, plus 'after'.  The arguments are not parenthesized in the
 * expansion, so pass only simple expressions such as enum constants. */
#define SBcase(before, after) ((SB_ENUM_COUNT * before) + after)
STATIC bool
-S_isSB(pTHX_ PL_SB_enum before,
- PL_SB_enum after,
+S_isSB(pTHX_ SB_enum before,
+ SB_enum after,
const U8 * const strbeg,
const U8 * const curpos,
const U8 * const strend,
U8 * lpos = (U8 *) curpos;
U8 * temp_pos;
- PL_SB_enum backup;
+ SB_enum backup;
PERL_ARGS_ASSERT_ISSB;
/* Break at the start and end of text.
SB1. sot ÷
SB2. ÷ eot */
- if (before == PL_SB_EDGE || after == PL_SB_EDGE) {
+ if (before == SB_EDGE || after == SB_EDGE) {
return TRUE;
}
/* SB 3: Do not break within CRLF. */
- if (before == PL_SB_CR && after == PL_SB_LF) {
+ if (before == SB_CR && after == SB_LF) {
return FALSE;
}
/* Break after paragraph separators. (though why CR and LF are considered
* so is beyond me (khw))
SB4. Sep | CR | LF ÷ */
- if (before == PL_SB_Sep || before == PL_SB_CR || before == PL_SB_LF) {
+ if (before == SB_Sep || before == SB_CR || before == SB_LF) {
return TRUE;
}
/* Ignore Format and Extend characters, except after sot, Sep, CR, or LF.
* (See Section 6.2, Replacing Ignore Rules.)
SB5. X (Extend | Format)* → X */
- if (after == PL_SB_Extend || after == PL_SB_Format) {
+ if (after == SB_Extend || after == SB_Format) {
return FALSE;
}
- if (before == PL_SB_Extend || before == PL_SB_Format) {
+ if (before == SB_Extend || before == SB_Format) {
before = backup_one_SB(strbeg, &lpos, utf8_target);
}
* not mark the end of a sentence.
* SB6. ATerm × Numeric */
- if (before == PL_SB_ATerm && after == PL_SB_Numeric) {
+ if (before == SB_ATerm && after == SB_Numeric) {
return FALSE;
}
- /* SB7. Upper ATerm × Upper */
- if (before == PL_SB_ATerm && after == PL_SB_Upper) {
+ /* SB7. (Upper | Lower) ATerm × Upper */
+ if (before == SB_ATerm && after == SB_Upper) {
temp_pos = lpos;
- if (PL_SB_Upper == backup_one_SB(strbeg, &temp_pos, utf8_target)) {
+ backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
+ if (backup == SB_Upper || backup == SB_Lower) {
return FALSE;
}
}
* SB10. (STerm | ATerm) Close* Sp* × ( Sp | Sep | CR | LF ) */
backup = before;
temp_pos = lpos;
- while (backup == PL_SB_Sp) {
+ while (backup == SB_Sp) {
backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
}
- while (backup == PL_SB_Close) {
+ while (backup == SB_Close) {
backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
}
- if ((backup == PL_SB_STerm || backup == PL_SB_ATerm)
- && ( after == PL_SB_SContinue
- || after == PL_SB_STerm
- || after == PL_SB_ATerm
- || after == PL_SB_Sp
- || after == PL_SB_Sep
- || after == PL_SB_CR
- || after == PL_SB_LF))
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_SContinue
+ || after == SB_STerm
+ || after == SB_ATerm
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
{
return FALSE;
}
/* SB8. ATerm Close* Sp* × ( ¬(OLetter | Upper | Lower | Sep | CR | LF |
* STerm | ATerm) )* Lower */
- if (backup == PL_SB_ATerm) {
+ if (backup == SB_ATerm) {
U8 * rpos = (U8 *) curpos;
- PL_SB_enum later = after;
-
- while ( later != PL_SB_OLetter
- && later != PL_SB_Upper
- && later != PL_SB_Lower
- && later != PL_SB_Sep
- && later != PL_SB_CR
- && later != PL_SB_LF
- && later != PL_SB_STerm
- && later != PL_SB_ATerm
- && later != PL_SB_EDGE)
+ SB_enum later = after;
+
+ while ( later != SB_OLetter
+ && later != SB_Upper
+ && later != SB_Lower
+ && later != SB_Sep
+ && later != SB_CR
+ && later != SB_LF
+ && later != SB_STerm
+ && later != SB_ATerm
+ && later != SB_EDGE)
{
later = advance_one_SB(&rpos, strend, utf8_target);
}
- if (later == PL_SB_Lower) {
+ if (later == SB_Lower) {
return FALSE;
}
}
* SB9. ( STerm | ATerm ) Close* × ( Close | Sp | Sep | CR | LF ) */
backup = before;
temp_pos = lpos;
- while (backup == PL_SB_Close) {
+ while (backup == SB_Close) {
backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
}
- if ((backup == PL_SB_STerm || backup == PL_SB_ATerm)
- && ( after == PL_SB_Close
- || after == PL_SB_Sp
- || after == PL_SB_Sep
- || after == PL_SB_CR
- || after == PL_SB_LF))
+ if ((backup == SB_STerm || backup == SB_ATerm)
+ && ( after == SB_Close
+ || after == SB_Sp
+ || after == SB_Sep
+ || after == SB_CR
+ || after == SB_LF))
{
return FALSE;
}
/* SB11. ( STerm | ATerm ) Close* Sp* ( Sep | CR | LF )? ÷ */
temp_pos = lpos;
backup = backup_one_SB(strbeg, &temp_pos, utf8_target);
- if ( backup == PL_SB_Sep
- || backup == PL_SB_CR
- || backup == PL_SB_LF)
+ if ( backup == SB_Sep
+ || backup == SB_CR
+ || backup == SB_LF)
{
lpos = temp_pos;
}
else {
backup = before;
}
- while (backup == PL_SB_Sp) {
+ while (backup == SB_Sp) {
backup = backup_one_SB(strbeg, &lpos, utf8_target);
}
- while (backup == PL_SB_Close) {
+ while (backup == SB_Close) {
backup = backup_one_SB(strbeg, &lpos, utf8_target);
}
- if (backup == PL_SB_STerm || backup == PL_SB_ATerm) {
+ if (backup == SB_STerm || backup == SB_ATerm) {
return TRUE;
}
return FALSE;
}
/* Moves *curpos forward to the next character whose SB value is neither
 * Extend nor Format, and returns that value.
 *
 * curpos       in/out: current position; updated as the scan advances
 * strend       end of the string; nothing at or beyond it is examined
 * utf8_target  true: step by UTF8SKIP() and look values up with
 *              getSB_VAL_UTF8(); false: step one byte at a time and look
 *              values up with getSB_VAL_CP()
 *
 * Returns the EDGE value (PL_SB_EDGE before this change, SB_EDGE after) when
 * *curpos is already at or past 'strend' on entry (in which case *curpos is
 * left untouched), or as soon as stepping reaches 'strend'. */
-STATIC PL_SB_enum
+STATIC SB_enum
S_advance_one_SB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
{
- PL_SB_enum sb;
+ SB_enum sb;
PERL_ARGS_ASSERT_ADVANCE_ONE_SB;
/* Nothing lies to the right of the current position */
if (*curpos >= strend) {
- return PL_SB_EDGE;
+ return SB_EDGE;
}
if (utf8_target) {
/* Step over whole UTF-8 characters until one is found that is not Extend
 * or Format */
do {
*curpos += UTF8SKIP(*curpos);
if (*curpos >= strend) {
- return PL_SB_EDGE;
+ return SB_EDGE;
}
sb = getSB_VAL_UTF8(*curpos, strend);
- } while (sb == PL_SB_Extend || sb == PL_SB_Format);
+ } while (sb == SB_Extend || sb == SB_Format);
}
else {
/* Same scan, but every character occupies exactly one byte */
do {
(*curpos)++;
if (*curpos >= strend) {
- return PL_SB_EDGE;
+ return SB_EDGE;
}
sb = getSB_VAL_CP(**curpos);
- } while (sb == PL_SB_Extend || sb == PL_SB_Format);
+ } while (sb == SB_Extend || sb == SB_Format);
}
return sb;
}
-STATIC PL_SB_enum
+STATIC SB_enum
S_backup_one_SB(pTHX_ const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- PL_SB_enum sb;
+ SB_enum sb;
PERL_ARGS_ASSERT_BACKUP_ONE_SB;
if (*curpos < strbeg) {
- return PL_SB_EDGE;
+ return SB_EDGE;
}
if (utf8_target) {
U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
if (! prev_char_pos) {
- return PL_SB_EDGE;
+ return SB_EDGE;
}
/* Back up over Extend and Format. curpos is always just to the right
}
else {
*curpos = (U8 *) strbeg;
- return PL_SB_EDGE;
+ return SB_EDGE;
}
- } while (sb == PL_SB_Extend || sb == PL_SB_Format);
+ } while (sb == SB_Extend || sb == SB_Format);
}
else {
do {
if (*curpos - 2 < strbeg) {
*curpos = (U8 *) strbeg;
- return PL_SB_EDGE;
+ return SB_EDGE;
}
(*curpos)--;
sb = getSB_VAL_CP(*(*curpos - 1));
- } while (sb == PL_SB_Extend || sb == PL_SB_Format);
+ } while (sb == SB_Extend || sb == SB_Format);
}
return sb;
}
/* Folds two WB enum values into one integer so that a (before, after) pair can
 * serve as a single 'case' label: 'before' multiplied by the number of enum
 * elements, plus 'after'.  The arguments are not parenthesized in the
 * expansion, so pass only simple expressions such as enum constants. */
-#define WBcase(before, after) ((PL_WB_ENUM_COUNT * before) + after)
+#define WBcase(before, after) ((WB_ENUM_COUNT * before) + after)
STATIC bool
-S_isWB(pTHX_ PL_WB_enum previous,
- PL_WB_enum before,
- PL_WB_enum after,
+S_isWB(pTHX_ WB_enum previous,
+ WB_enum before,
+ WB_enum after,
const U8 * const strbeg,
const U8 * const curpos,
const U8 * const strend,
* a Unicode word break, using their published algorithm. Context may be
* needed to make this determination. If the value for the character
* before 'before' is known, it is passed as 'previous'; otherwise that
- * should be set to PL_WB_UNKNOWN. The other input parameters give the
+ * should be set to WB_UNKNOWN. The other input parameters give the
* boundaries and current position in the matching of the string. That
* is, 'curpos' marks the position where the character whose wb value is
* 'after' begins. See http://www.unicode.org/reports/tr29/ */
PERL_ARGS_ASSERT_ISWB;
/* WB1 and WB2: Break at the start and end of text. */
- if (before == PL_WB_EDGE || after == PL_WB_EDGE) {
+ if (before == WB_EDGE || after == WB_EDGE) {
return TRUE;
}
/* WB 3: Do not break within CRLF. */
- if (before == PL_WB_CR && after == PL_WB_LF) {
+ if (before == WB_CR && after == WB_LF) {
return FALSE;
}
/* WB 3a and WB 3b: Otherwise break before and after Newlines (including CR
* and LF) */
- if ( before == PL_WB_CR || before == PL_WB_LF || before == PL_WB_Newline
- || after == PL_WB_CR || after == PL_WB_LF || after == PL_WB_Newline)
+ if ( before == WB_CR || before == WB_LF || before == WB_Newline
+ || after == WB_CR || after == WB_LF || after == WB_Newline)
{
return TRUE;
}
* beginning of a region of text.
* WB4. X (Extend | Format)* → X. */
- if (after == PL_WB_Extend || after == PL_WB_Format) {
+ if (after == WB_Extend || after == WB_Format) {
return FALSE;
}
- if (before == PL_WB_Extend || before == PL_WB_Format) {
+ if (before == WB_Extend || before == WB_Format) {
before = backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
}
/* Do not break between most letters.
WB5. (ALetter | Hebrew_Letter) × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_ALetter, PL_WB_ALetter):
- case WBcase(PL_WB_ALetter, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_ALetter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Hebrew_Letter):
+ case WBcase(WB_ALetter, WB_ALetter):
+ case WBcase(WB_ALetter, WB_Hebrew_Letter):
+ case WBcase(WB_Hebrew_Letter, WB_ALetter):
+ case WBcase(WB_Hebrew_Letter, WB_Hebrew_Letter):
return FALSE;
/* Do not break letters across certain punctuation.
WB6. (ALetter | Hebrew_Letter)
× (MidLetter | MidNumLet | Single_Quote) (ALetter
| Hebrew_Letter) */
- case WBcase(PL_WB_ALetter, PL_WB_MidLetter):
- case WBcase(PL_WB_ALetter, PL_WB_MidNumLet):
- case WBcase(PL_WB_ALetter, PL_WB_Single_Quote):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidLetter):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_MidNumLet):
- /*case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):*/
+ case WBcase(WB_ALetter, WB_MidLetter):
+ case WBcase(WB_ALetter, WB_MidNumLet):
+ case WBcase(WB_ALetter, WB_Single_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_MidLetter):
+ case WBcase(WB_Hebrew_Letter, WB_MidNumLet):
+ /*case WBcase(WB_Hebrew_Letter, WB_Single_Quote):*/
after = advance_one_WB(&after_pos, strend, utf8_target);
- return after != PL_WB_ALetter && after != PL_WB_Hebrew_Letter;
+ return after != WB_ALetter && after != WB_Hebrew_Letter;
/* WB7. (ALetter | Hebrew_Letter) (MidLetter | MidNumLet |
* Single_Quote) × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_MidLetter, PL_WB_ALetter):
- case WBcase(PL_WB_MidLetter, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_MidNumLet, PL_WB_ALetter):
- case WBcase(PL_WB_MidNumLet, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_Single_Quote, PL_WB_ALetter):
- case WBcase(PL_WB_Single_Quote, PL_WB_Hebrew_Letter):
+ case WBcase(WB_MidLetter, WB_ALetter):
+ case WBcase(WB_MidLetter, WB_Hebrew_Letter):
+ case WBcase(WB_MidNumLet, WB_ALetter):
+ case WBcase(WB_MidNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_Single_Quote, WB_ALetter):
+ case WBcase(WB_Single_Quote, WB_Hebrew_Letter):
before
= backup_one_WB(&previous, strbeg, &before_pos, utf8_target);
- return before != PL_WB_ALetter && before != PL_WB_Hebrew_Letter;
+ return before != WB_ALetter && before != WB_Hebrew_Letter;
/* WB7a. Hebrew_Letter × Single_Quote */
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Single_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_Single_Quote):
return FALSE;
/* WB7b. Hebrew_Letter × Double_Quote Hebrew_Letter */
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Double_Quote):
+ case WBcase(WB_Hebrew_Letter, WB_Double_Quote):
return advance_one_WB(&after_pos, strend, utf8_target)
- != PL_WB_Hebrew_Letter;
+ != WB_Hebrew_Letter;
/* WB7c. Hebrew_Letter Double_Quote × Hebrew_Letter */
- case WBcase(PL_WB_Double_Quote, PL_WB_Hebrew_Letter):
+ case WBcase(WB_Double_Quote, WB_Hebrew_Letter):
return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != PL_WB_Hebrew_Letter;
+ != WB_Hebrew_Letter;
/* Do not break within sequences of digits, or digits adjacent to
* letters (“3a”, or “A3”).
WB8. Numeric × Numeric */
- case WBcase(PL_WB_Numeric, PL_WB_Numeric):
+ case WBcase(WB_Numeric, WB_Numeric):
return FALSE;
/* WB9. (ALetter | Hebrew_Letter) × Numeric */
- case WBcase(PL_WB_ALetter, PL_WB_Numeric):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_Numeric):
+ case WBcase(WB_ALetter, WB_Numeric):
+ case WBcase(WB_Hebrew_Letter, WB_Numeric):
return FALSE;
/* WB10. Numeric × (ALetter | Hebrew_Letter) */
- case WBcase(PL_WB_Numeric, PL_WB_ALetter):
- case WBcase(PL_WB_Numeric, PL_WB_Hebrew_Letter):
+ case WBcase(WB_Numeric, WB_ALetter):
+ case WBcase(WB_Numeric, WB_Hebrew_Letter):
return FALSE;
/* Do not break within sequences, such as “3.2” or “3,456.789”.
WB11. Numeric (MidNum | MidNumLet | Single_Quote) × Numeric
*/
- case WBcase(PL_WB_MidNum, PL_WB_Numeric):
- case WBcase(PL_WB_MidNumLet, PL_WB_Numeric):
- case WBcase(PL_WB_Single_Quote, PL_WB_Numeric):
+ case WBcase(WB_MidNum, WB_Numeric):
+ case WBcase(WB_MidNumLet, WB_Numeric):
+ case WBcase(WB_Single_Quote, WB_Numeric):
return backup_one_WB(&previous, strbeg, &before_pos, utf8_target)
- != PL_WB_Numeric;
+ != WB_Numeric;
/* WB12. Numeric × (MidNum | MidNumLet | Single_Quote) Numeric
* */
- case WBcase(PL_WB_Numeric, PL_WB_MidNum):
- case WBcase(PL_WB_Numeric, PL_WB_MidNumLet):
- case WBcase(PL_WB_Numeric, PL_WB_Single_Quote):
+ case WBcase(WB_Numeric, WB_MidNum):
+ case WBcase(WB_Numeric, WB_MidNumLet):
+ case WBcase(WB_Numeric, WB_Single_Quote):
return advance_one_WB(&after_pos, strend, utf8_target)
- != PL_WB_Numeric;
+ != WB_Numeric;
/* Do not break between Katakana.
WB13. Katakana × Katakana */
- case WBcase(PL_WB_Katakana, PL_WB_Katakana):
+ case WBcase(WB_Katakana, WB_Katakana):
return FALSE;
/* Do not break from extenders.
WB13a. (ALetter | Hebrew_Letter | Numeric | Katakana |
ExtendNumLet) × ExtendNumLet */
- case WBcase(PL_WB_ALetter, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Hebrew_Letter, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Numeric, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_Katakana, PL_WB_ExtendNumLet):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_ExtendNumLet):
+ case WBcase(WB_ALetter, WB_ExtendNumLet):
+ case WBcase(WB_Hebrew_Letter, WB_ExtendNumLet):
+ case WBcase(WB_Numeric, WB_ExtendNumLet):
+ case WBcase(WB_Katakana, WB_ExtendNumLet):
+ case WBcase(WB_ExtendNumLet, WB_ExtendNumLet):
return FALSE;
/* WB13b. ExtendNumLet × (ALetter | Hebrew_Letter | Numeric
* | Katakana) */
- case WBcase(PL_WB_ExtendNumLet, PL_WB_ALetter):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Hebrew_Letter):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Numeric):
- case WBcase(PL_WB_ExtendNumLet, PL_WB_Katakana):
+ case WBcase(WB_ExtendNumLet, WB_ALetter):
+ case WBcase(WB_ExtendNumLet, WB_Hebrew_Letter):
+ case WBcase(WB_ExtendNumLet, WB_Numeric):
+ case WBcase(WB_ExtendNumLet, WB_Katakana):
return FALSE;
/* Do not break between regional indicator symbols.
WB13c. Regional_Indicator × Regional_Indicator */
- case WBcase(PL_WB_Regional_Indicator, PL_WB_Regional_Indicator):
+ case WBcase(WB_Regional_Indicator, WB_Regional_Indicator):
return FALSE;
}
NOT_REACHED; /* NOTREACHED */
}
/* Moves *curpos forward to the next character whose WB value is neither
 * Extend nor Format, and returns that value.
 *
 * curpos       in/out: current position; updated as the scan advances
 * strend       end of the string; nothing at or beyond it is examined
 * utf8_target  true: step by UTF8SKIP() and look values up with
 *              getWB_VAL_UTF8(); false: step one byte at a time and look
 *              values up with getWB_VAL_CP()
 *
 * Returns the EDGE value (PL_WB_EDGE before this change, WB_EDGE after) when
 * *curpos is already at or past 'strend' on entry (in which case *curpos is
 * left untouched), or as soon as stepping reaches 'strend'. */
-STATIC PL_WB_enum
+STATIC WB_enum
S_advance_one_WB(pTHX_ U8 ** curpos, const U8 * const strend, const bool utf8_target)
{
- PL_WB_enum wb;
+ WB_enum wb;
PERL_ARGS_ASSERT_ADVANCE_ONE_WB;
/* Nothing lies to the right of the current position */
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
if (utf8_target) {
/* Step over whole UTF-8 characters until one is found that is not Extend
 * or Format */
do {
*curpos += UTF8SKIP(*curpos);
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
wb = getWB_VAL_UTF8(*curpos, strend);
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
else {
/* Same scan, but every character occupies exactly one byte */
do {
(*curpos)++;
if (*curpos >= strend) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
wb = getWB_VAL_CP(**curpos);
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
return wb;
}
-STATIC PL_WB_enum
-S_backup_one_WB(pTHX_ PL_WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
+STATIC WB_enum
+S_backup_one_WB(pTHX_ WB_enum * previous, const U8 * const strbeg, U8 ** curpos, const bool utf8_target)
{
- PL_WB_enum wb;
+ WB_enum wb;
PERL_ARGS_ASSERT_BACKUP_ONE_WB;
/* If we know what the previous character's break value is, don't have
* to look it up */
- if (*previous != PL_WB_UNKNOWN) {
+ if (*previous != WB_UNKNOWN) {
wb = *previous;
- *previous = PL_WB_UNKNOWN;
+ *previous = WB_UNKNOWN;
/* XXX Note that this doesn't change curpos, and maybe should */
/* But we always back up over these two types */
- if (wb != PL_WB_Extend && wb != PL_WB_Format) {
+ if (wb != WB_Extend && wb != WB_Format) {
return wb;
}
}
if (*curpos < strbeg) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
if (utf8_target) {
U8 * prev_char_pos = reghopmaybe3(*curpos, -1, strbeg);
if (! prev_char_pos) {
- return PL_WB_EDGE;
+ return WB_EDGE;
}
/* Back up over Extend and Format. curpos is always just to the right
}
else {
*curpos = (U8 *) strbeg;
- return PL_WB_EDGE;
+ return WB_EDGE;
}
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
else {
do {
if (*curpos - 2 < strbeg) {
*curpos = (U8 *) strbeg;
- return PL_WB_EDGE;
+ return WB_EDGE;
}
(*curpos)--;
wb = getWB_VAL_CP(*(*curpos - 1));
- } while (wb == PL_WB_Extend || wb == PL_WB_Format);
+ } while (wb == WB_Extend || wb == WB_Format);
}
return wb;
sayNO;
goto increment_locinput;
- case CANY: /* \C */
- if (NEXTCHR_IS_EOS)
- sayNO;
- locinput++;
- break;
-
case REG_ANY: /* /./ */
if ((NEXTCHR_IS_EOS) || nextchr == '\n')
sayNO;
l++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*l, *(l+1)) != * (U8*) s)
{
sayNO;
}
s++;
}
else {
- if (TWO_BYTE_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
+ if (EIGHT_BIT_UTF8_TO_NATIVE(*s, *(s+1)) != * (U8*) l)
{
sayNO;
}
/* FALLTHROUGH */
case BOUNDL: /* /\b/l */
+ {
+ bool b1, b2;
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (FLAGS(scan) != TRADITIONAL_BOUND) {
if (utf8_target) {
if (locinput == reginfo->strbeg)
- ln = isWORDCHAR_LC('\n');
+ b1 = isWORDCHAR_LC('\n');
else {
- ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
+ b1 = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
}
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC_utf8((U8*)locinput);
}
else { /* Here the string isn't utf8 */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_LC('\n')
: isWORDCHAR_LC(nextchr);
}
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUND: /* /\B/ */
to_complement = 1;
/* FALLTHROUGH */
case BOUNDA: /* /\b/a */
+ {
+ bool b1, b2;
bound_ascii_match_only:
/* Here the string isn't utf8, or is utf8 and only ascii characters
* 2) it is a multi-byte character, in which case the final byte is
* never mistakable for ASCII, and so the test will say it is
* not a word character, which is the correct answer. */
- ln = (locinput == reginfo->strbeg)
+ b1 = (locinput == reginfo->strbeg)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
+ b2 = (NEXTCHR_IS_EOS)
? isWORDCHAR_A('\n')
: isWORDCHAR_A(nextchr);
- if (to_complement ^ (ln == n)) {
+ if (to_complement ^ (b1 == b2)) {
sayNO;
}
break;
+ }
case NBOUNDU: /* /\B/u */
to_complement = 1;
bound_utf8:
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
- ? isWORDCHAR_L1('\n')
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
(U8*)(reginfo->strbeg)));
- n = (NEXTCHR_IS_EOS)
- ? isWORDCHAR_L1('\n')
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_utf8((U8*)locinput);
- match = cBOOL(ln != n);
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE; /* GCB always matches at begin and
match = TRUE;
}
else {
- match = isWB(PL_WB_UNKNOWN,
+ match = isWB(WB_UNKNOWN,
getWB_VAL_UTF8(
reghop3((U8*)locinput,
-1,
else { /* Not utf8 target */
switch((bound_type) FLAGS(scan)) {
case TRADITIONAL_BOUND:
- ln = (locinput == reginfo->strbeg)
- ? isWORDCHAR_L1('\n')
+ {
+ bool b1, b2;
+ b1 = (locinput == reginfo->strbeg)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(UCHARAT(locinput - 1));
- n = (NEXTCHR_IS_EOS)
- ? isWORDCHAR_L1('\n')
+ b2 = (NEXTCHR_IS_EOS)
+ ? 0 /* isWORDCHAR_L1('\n') */
: isWORDCHAR_L1(nextchr);
- match = cBOOL(ln != n);
+ match = cBOOL(b1 != b2);
break;
+ }
case GCB_BOUND:
if (locinput == reginfo->strbeg || NEXTCHR_IS_EOS) {
match = TRUE;
}
else {
- match = isWB(PL_WB_UNKNOWN,
+ match = isWB(WB_UNKNOWN,
getWB_VAL_CP(UCHARAT(locinput -1)),
getWB_VAL_CP(UCHARAT(locinput)),
(U8*) reginfo->strbeg,
case ANYOFL: /* /[abc]/l */
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(scan) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE)
+ {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD: /* /[abc]/d */
case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement ^ cBOOL(isFOO_lc(FLAGS(scan),
- (U8) TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ (U8) EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1))))))
{
sayNO;
}
else if (UTF8_IS_DOWNGRADEABLE_START(nextchr)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(nextchr,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(nextchr,
*(locinput + 1)),
FLAGS(scan)))))
{
else {
/* Get the gcb type for the current character */
- PL_GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ GCB_enum prev_gcb = getGCB_VAL_UTF8((U8*) locinput,
(U8*) reginfo->strend);
/* Then scan through the input until we get to the first
* end-of-input) */
locinput += UTF8SKIP(locinput);
while (locinput < reginfo->strend) {
- PL_GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
+ GCB_enum cur_gcb = getGCB_VAL_UTF8((U8*) locinput,
(U8*) reginfo->strend);
if (isGCB(prev_gcb, cur_gcb)) {
break;
break;
case ACCEPT: /* (*ACCEPT) */
- if (ARG(scan)){
+ if (scan->flags)
+ sv_yes_mark = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (ARG2L(scan)){
regnode *cursor;
for (cursor=scan;
cursor && OP(cursor)!=END;
NOT_REACHED; /* NOTREACHED */
case CUTGROUP: /* /(*THEN)/ */
- sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
- MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ sv_yes_mark = st->u.mark.mark_name = scan->flags
+ ? MUTABLE_SV(rexi->data->data[ ARG( scan ) ])
+ : NULL;
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
/* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
/* FALLTHROUGH */
case PRUNE: /* (*PRUNE) */
- if (!scan->flags)
+ if (scan->flags)
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(COMMIT_next, next, locinput);
/* NOTREACHED */
case COMMIT_next_fail:
no_final = 1;
/* FALLTHROUGH */
+ sayNO;
+ NOT_REACHED; /* NOTREACHED */
case OPFAIL: /* (*FAIL) */
- sayNO;
+ if (scan->flags)
+ sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
+ if (logical) {
+ /* Deal with (?(?!)X|Y) properly:
+ * make sure we trigger the "no" branch
+ * of the trailing IFTHEN structure. */
+ sw= 0;
+ break;
+ } else {
+ sayNO;
+ }
/* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
NOT_REACHED; /* NOTREACHED */
case SKIP: /* (*SKIP) */
- if (scan->flags) {
+ if (!scan->flags) {
/* (*SKIP) : if we fail we cut here*/
ST.mark_name = NULL;
ST.mark_loc = locinput;
else
scan = loceol;
break;
- case CANY: /* Move <scan> forward <max> bytes, unless goes off end */
- if (utf8_target && loceol - scan > max) {
-
- /* <loceol> hadn't been adjusted in the UTF-8 case */
- scan += max;
- }
- else {
- scan = loceol;
- }
- break;
case EXACTL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
/* Target isn't utf8; convert the character in the UTF-8
* pattern to non-UTF8, and do a simple loop */
- c = TWO_BYTE_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
+ c = EIGHT_BIT_UTF8_TO_NATIVE(c, *(STRING(p) + 1));
while (scan < loceol && UCHARAT(scan) == c) {
scan++;
}
}
case ANYOFL:
_CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ if ((FLAGS(p) & ANYOF_LOC_REQ_UTF8) && ! IN_UTF8_CTYPE_LOCALE) {
+ Perl_ck_warner(aTHX_ packWARN(WARN_LOCALE), utf8_locale_required);
+ }
/* FALLTHROUGH */
+ case ANYOFD:
case ANYOF:
if (utf8_target) {
while (hardcount < max
}
else if (UTF8_IS_DOWNGRADEABLE_START(*scan)) {
if (! (to_complement
- ^ cBOOL(_generic_isCC(TWO_BYTE_UTF8_TO_NATIVE(*scan,
+ ^ cBOOL(_generic_isCC(EIGHT_BIT_UTF8_TO_NATIVE(*scan,
*(scan + 1)),
classnum))))
{
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
- if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ if (c > 255 && OP(n) == ANYOFL && ! (flags & ANYOF_LOC_REQ_UTF8)) {
_CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
}
}
if (c < NUM_ANYOF_CODE_POINTS) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
+ else if ((flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) == ANYOFD
&& ! utf8_target
&& ! isASCII(c))
{
}
if (UNICODE_IS_SUPER(c)
- && (flags & ANYOF_WARN_SUPER)
+ && (flags
+ & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER)
+ && OP(n) != ANYOFD
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > llim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
if (UTF8_IS_CONTINUED(*s)) {
while (s > lim && UTF8_IS_CONTINUATION(*s))
s--;
+ if (! UTF8_IS_START(*s)) {
+ Perl_croak_nocontext("Malformed UTF-8 character (fatal)");
+ }
}
/* XXX could check well-formedness here */
}
}
/*
- * Local variables:
- * c-indentation-style: bsd
- * c-basic-offset: 4
- * indent-tabs-mode: nil
- * End:
- *
* ex: set ts=8 sts=4 sw=4 et:
*/