#if 0
/* Currently these are only used when PL_regkind[OP(rn)] == EXACT so
- we don't need this definition. */
+ we don't need this definition. XXX These are now out-of-sync */
#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==REF || OP(rn)==NREF )
#define IS_TEXTF(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn)==EXACTFA || OP(rn)==EXACTFA_NO_TRIE || OP(rn)==EXACTF || OP(rn)==REFF || OP(rn)==NREFF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL || OP(rn)==REFFL || OP(rn)==NREFFL )
#else
/* ... so we use this as it's faster. */
-#define IS_TEXT(rn) ( OP(rn)==EXACT )
-#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
+#define IS_TEXT(rn) ( OP(rn)==EXACT || OP(rn)==EXACTL )
+#define IS_TEXTFU(rn) ( OP(rn)==EXACTFU || OP(rn)==EXACTFLU8 || OP(rn)==EXACTFU_SS || OP(rn) == EXACTFA || OP(rn) == EXACTFA_NO_TRIE)
#define IS_TEXTF(rn) ( OP(rn)==EXACTF )
#define IS_TEXTFL(rn) ( OP(rn)==EXACTFL )
Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
}
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
return FALSE;
}
* '_char_class_number'.
*
* This just calls isFOO_lc on the code point for the character if it is in
- * the range 0-255. Outside that range, all characters avoid Unicode
+ * the range 0-255. Outside that range, all characters use Unicode
* rules, ignoring any locale. So use the Unicode function if this class
* requires a swash, and use the Unicode macro otherwise. */
TWO_BYTE_UTF8_TO_NATIVE(*character, *(character + 1)));
}
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(character, character + UTF8SKIP(character));
+
if (classnum < _FIRST_NON_SWASH_CC) {
/* Initialize the swash unless done already */
goto fail;
}
+ RX_MATCH_UTF8_set(rx,utf8_target);
reginfo->is_utf8_target = cBOOL(utf8_target);
reginfo->info_aux = NULL;
reginfo->strbeg = strbeg;
* be too fiddly (e.g. REXEC_IGNOREPOS).
*/
if ( strpos != strbeg
- && (prog->intflags & (PREGf_ANCH_BOL|PREGf_ANCH_SBOL)))
+ && (prog->intflags & PREGf_ANCH_SBOL))
{
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log,
" Not at start...\n"));
/* If the regex is absolutely anchored to either the start of the
- * string (BOL,SBOL) or to pos() (ANCH_GPOS), then
+ * string (SBOL) or to pos() (ANCH_GPOS), then
* check_offset_max represents an upper bound on the string where
* the substr could start. For the ANCH_GPOS case, we assume that
* the caller of intuit will have already set strpos to
#define DECL_TRIE_TYPE(scan) \
- const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
- trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold } \
- trie_type = ((scan->flags == EXACT) \
- ? (utf8_target ? trie_utf8 : trie_plain) \
- : (scan->flags == EXACTFA) \
- ? (utf8_target ? trie_utf8_exactfa_fold : trie_latin_utf8_exactfa_fold) \
- : (utf8_target ? trie_utf8_fold : trie_latin_utf8_fold))
+ const enum { trie_plain, trie_utf8, trie_utf8_fold, trie_latin_utf8_fold, \
+ trie_utf8_exactfa_fold, trie_latin_utf8_exactfa_fold, \
+ trie_utf8l, trie_flu8 } \
+ trie_type = ((scan->flags == EXACT) \
+ ? (utf8_target ? trie_utf8 : trie_plain) \
+ : (scan->flags == EXACTL) \
+ ? (utf8_target ? trie_utf8l : trie_plain) \
+ : (scan->flags == EXACTFA) \
+ ? (utf8_target \
+ ? trie_utf8_exactfa_fold \
+ : trie_latin_utf8_exactfa_fold) \
+ : (scan->flags == EXACTFLU8 \
+ ? trie_flu8 \
+ : (utf8_target \
+ ? trie_utf8_fold \
+ : trie_latin_utf8_fold)))
#define REXEC_TRIE_READ_CHAR(trie_type, trie, widecharmap, uc, uscan, len, uvc, charid, foldlen, foldbuf, uniflags) \
STMT_START { \
STRLEN skiplen; \
U8 flags = FOLD_FLAGS_FULL; \
switch (trie_type) { \
+ case trie_flu8: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
+ goto do_trie_utf8_fold; \
case trie_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
- /* FALLTHROUGH */ \
+ /* FALLTHROUGH */ \
case trie_utf8_fold: \
+ do_trie_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
foldlen -= len; \
break; \
case trie_latin_utf8_exactfa_fold: \
flags |= FOLD_FLAGS_NOMIX_ASCII; \
- /* FALLTHROUGH */ \
+ /* FALLTHROUGH */ \
case trie_latin_utf8_fold: \
if ( foldlen>0 ) { \
uvc = utf8n_to_uvchr( (const U8*) uscan, UTF8_MAXLEN, &len, uniflags ); \
uscan = foldbuf + skiplen; \
} \
break; \
+ case trie_utf8l: \
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE; \
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*uc)) { \
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(uc, uc + UTF8SKIP(uc)); \
+ } \
+ /* FALLTHROUGH */ \
case trie_utf8: \
uvc = utf8n_to_uvchr( (const U8*) uc, UTF8_MAXLEN, &len, uniflags ); \
break; \
/* We know what class it must start with. */
switch (OP(c)) {
+ case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case ANYOF:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
goto do_exactf_non_utf8;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (is_utf8_pat || utf8_target || IN_UTF8_CTYPE_LOCALE) {
utf8_fold_flags = FOLDEQ_LOCALE;
goto do_exactf_utf8;
}
goto do_exactf_utf8;
+ case EXACTFLU8:
+ if (! utf8_target) { /* All code points in this node require
+ UTF-8 to express. */
+ break;
+ }
+ utf8_fold_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf_utf8;
+
case EXACTFU:
if (is_utf8_pat || utf8_target) {
utf8_fold_flags = is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
}
case BOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
FBC_BOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
case NBOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
FBC_NBOUND(isWORDCHAR_LC, isWORDCHAR_LC_uvchr, isWORDCHAR_LC_utf8);
break;
case BOUND:
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
default:
Perl_croak(aTHX_ "panic: find_byclass() node %d='%s' has an unexpected character class '%d'", OP(c), PL_reg_name[OP(c)], classnum);
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
break;
}
RX_MATCH_TAINTED_off(rx);
+ RX_MATCH_UTF8_set(rx, utf8_target);
reginfo->prog = rx; /* Yes, sorry that this is confusing. */
reginfo->intuit = 0;
magic belonging to this SV.
Not newSVsv, either, as it does not COW.
*/
- assert(!IS_PADGV(sv));
reginfo->sv = newSV(0);
SvSetSV_nosteal(reginfo->sv, sv);
SAVEFREESV(reginfo->sv);
}
/* Simplest case: anchored match need be tried only once. */
- /* [unless only anchor is BOL and multiline is set] */
+ /* [unless only anchor is MBOL - implying multiline is set] */
if (prog->intflags & (PREGf_ANCH & ~PREGf_ANCH_GPOS)) {
if (s == startpos && regtry(reginfo, &s))
goto got_it;
}
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- regprop(prog, prop, c, reginfo);
+ regprop(prog, prop, c, reginfo, NULL);
{
RE_PV_QUOTED_DECL(quoted,utf8_target,PERL_DEBUG_PAD_ZERO(1),
s,strend-s,60);
* and replaced it with this one. Yves */
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
- "String does not contain required substring, cannot match.\n"
+ "%sString does not contain required substring, cannot match.%s\n",
+ PL_colors[4], PL_colors[5]
));
goto phooey;
}
/* Failure. */
goto phooey;
-got_it:
+ got_it:
/* s/// doesn't like it if $& is earlier than where we asked it to
* start searching (which can happen on something like /.\G/) */
if ( (flags & REXEC_FAIL_ON_UNDERFLOW)
if (RXp_PAREN_NAMES(prog))
(void)hv_iterinit(RXp_PAREN_NAMES(prog));
- RX_MATCH_UTF8_set(rx, utf8_target);
-
/* make sure $`, $&, $', and $digit will work later */
if ( !(flags & REXEC_NOT_FIRST) )
S_reg_set_capture_string(aTHX_ rx,
return 1;
-phooey:
+ phooey:
DEBUG_EXECUTE_r(PerlIO_printf(Perl_debug_log, "%sMatch failed%s\n",
PL_colors[4], PL_colors[5]));
U8 *pat = (U8*)STRING(text_node);
U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
- if (OP(text_node) == EXACT) {
+ if (OP(text_node) == EXACT || OP(text_node) == EXACTL) {
/* In an exact node, only one thing can be matched, that first
* character. If both the pat and the target are UTF-8, we can just
default:
Perl_croak(aTHX_ "panic: Unexpected op %u", OP(text_node));
- assert(0); /* NOTREACHED */
+ NOT_REACHED; /* NOTREACHED */
}
}
}
SV * const prop = sv_newmortal();
regnode *rnext=regnext(scan);
DUMP_EXEC_POS( locinput, scan, utf8_target );
- regprop(rex, prop, scan, reginfo);
+ regprop(rex, prop, scan, reginfo, NULL);
PerlIO_printf(Perl_debug_log,
"%3"IVdf":%*s%s(%"IVdf")\n",
assert(nextchr < 256 && (nextchr >= 0 || nextchr == NEXTCHR_EOS));
switch (state_num) {
- case BOL: /* /^../ */
- case SBOL: /* /^../s */
+ case SBOL: /* /^../ and /\A../ */
if (locinput == reginfo->strbeg)
break;
sayNO;
rex->offs[0].start = locinput - reginfo->strbeg;
PUSH_STATE_GOTO(KEEPS_next, next, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case KEEPS_next_fail:
/* rollback the start point change */
rex->offs[0].start = st->u.keeper.val;
sayNO_SILENT;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case MEOL: /* /..$/m */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
break;
- case EOL: /* /..$/ */
- /* FALLTHROUGH */
- case SEOL: /* /..$/s */
+ case SEOL: /* /..$/ */
if (!NEXTCHR_IS_EOS && nextchr != '\n')
sayNO;
if (reginfo->strend - locinput > 1)
);
sayNO_SILENT;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
/* FALLTHROUGH */
case TRIE: /* (ab|cd) */
HV * widecharmap = MUTABLE_HV(rexi->data->data[ ARG( scan ) + 1 ]);
U32 state = trie->startstate;
+ if (scan->flags == EXACTL || scan->flags == EXACTFLU8) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target
+ && UTF8_IS_ABOVE_LATIN1(nextchr)
+ && scan->flags == EXACTL)
+ {
+ /* We only output for EXACTL, as we let the folder
+ * output this message for EXACTFLU8 to avoid
+ * duplication */
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput,
+ reginfo->strend);
+ }
+ }
if ( trie->bitmap
&& (NEXTCHR_IS_EOS || !TRIE_BITMAP_TEST(trie, nextchr)))
{
goto trie_first_try; /* jump into the fail handler */
}}
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case TRIE_next_fail: /* we failed - try next alternative */
{
if (ST.accepted > 1 || has_cutgroup) {
PUSH_STATE_GOTO(TRIE_next, scan, (char*)uc);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
/* only one choice left - just continue */
DEBUG_EXECUTE_r({
locinput = (char*)uc;
continue; /* execute rest of RE */
/* NOTREACHED */
- assert(0);
}
#undef ST
+ case EXACTL: /* /abc/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ /* Complete checking would involve going through every character
+ * matched by the string to see if any is above latin1. But the
+ * comparison otherwise might very well be a fast assembly
+ * language routine, and I (khw) don't think slowing things down
+ * just to check for this warning is worth it. So this just checks
+ * the first character */
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*locinput)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ }
+ /* FALLTHROUGH */
case EXACT: { /* /abc/ */
char *s = STRING(scan);
ln = STR_LEN(scan);
const char * s;
U32 fold_utf8_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
fold_utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
+ case EXACTFLU8: /* /abc/il; but all 'abc' are above 255, so
+ is effectively /u; hence to match, target
+ must be UTF-8. */
+ if (! utf8_target) {
+ sayNO;
+ }
+ fold_utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S1_ALREADY_FOLDED
+ | FOLDEQ_S1_FOLDS_SANE;
+ folder = foldEQ_latin1;
+ fold_array = PL_fold_latin1;
+ goto do_exactf;
+
case EXACTFU_SS: /* /\x{df}/iu */
case EXACTFU: /* /abc/iu */
folder = foldEQ_latin1;
break;
}
- /* XXX Could improve efficiency by separating these all out using a
- * macro or in-line function. At that point regcomp.c would no longer
- * have to set the FLAGS fields of these */
- case BOUNDL: /* /\b/l */
+ /* XXX Now that these are handled as separate cases, regcomp.c may no longer have to set the FLAGS fields of these */
case NBOUNDL: /* /\B/l */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
+ case BOUNDL: /* /\b/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target) {
+ if (locinput == reginfo->strbeg)
+ ln = isWORDCHAR_LC('\n');
+ else {
+ ln = isWORDCHAR_LC_utf8(reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg)));
+ }
+ n = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC_utf8((U8*)locinput);
+ }
+ else { /* Here the string isn't utf8 */
+ ln = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC(UCHARAT(locinput - 1));
+ n = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_LC('\n')
+ : isWORDCHAR_LC(nextchr);
+ }
+ if (to_complement ^ (ln == n)) {
+ sayNO;
+ }
+ break;
+
+ case NBOUND: /* /\B/ */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
case BOUND: /* /\b/ */
- case BOUNDU: /* /\b/u */
+ if (utf8_target) {
+ goto bound_utf8;
+ }
+ goto bound_ascii_match_only;
+
+ case NBOUNDA: /* /\B/a */
+ to_complement = 1;
+ /* FALLTHROUGH */
+
case BOUNDA: /* /\b/a */
- case NBOUND: /* /\B/ */
+
+ bound_ascii_match_only:
+ /* Here the string isn't utf8, or is utf8 and only ascii characters
+ * are to match \w. In the latter case looking at the byte just
+ * prior to the current one may be just the final byte of a
+ * multi-byte character. This is ok. There are two cases:
+ * 1) it is a single byte character, and then the test is doing
+ * just what it's supposed to.
+ * 2) it is a multi-byte character, in which case the final byte is
+ * never mistakable for ASCII, and so the test will say it is
+ * not a word character, which is the correct answer. */
+ ln = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_A('\n')
+ : isWORDCHAR_A(UCHARAT(locinput - 1));
+ n = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_A('\n')
+ : isWORDCHAR_A(nextchr);
+ if (to_complement ^ (ln == n)) {
+ sayNO;
+ }
+ break;
+
case NBOUNDU: /* /\B/u */
- case NBOUNDA: /* /\B/a */
- /* was last char in word? */
- if (utf8_target
- && FLAGS(scan) != REGEX_ASCII_RESTRICTED_CHARSET
- && FLAGS(scan) != REGEX_ASCII_MORE_RESTRICTED_CHARSET)
- {
- if (locinput == reginfo->strbeg)
- ln = '\n';
- else {
- const U8 * const r =
- reghop3((U8*)locinput, -1, (U8*)(reginfo->strbeg));
+ to_complement = 1;
+ /* FALLTHROUGH */
- ln = utf8n_to_uvchr(r, (U8*) reginfo->strend - r,
- 0, uniflags);
- }
- if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
- ln = isWORDCHAR_uni(ln);
- if (NEXTCHR_IS_EOS)
- n = 0;
- else {
- LOAD_UTF8_CHARCLASS_ALNUM();
- n = swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)locinput,
- utf8_target);
- }
- }
- else {
- ln = isWORDCHAR_LC_uvchr(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
- }
+ case BOUNDU: /* /\b/u */
+ if (utf8_target) {
+
+ bound_utf8:
+ ln = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_L1('\n')
+ : isWORDCHAR_utf8(reghop3((U8*)locinput, -1,
+ (U8*)(reginfo->strbeg)));
+ n = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_L1('\n')
+ : isWORDCHAR_utf8((U8*)locinput);
}
else {
+ ln = (locinput == reginfo->strbeg)
+ ? isWORDCHAR_L1('\n')
+ : isWORDCHAR_L1(UCHARAT(locinput - 1));
+ n = (NEXTCHR_IS_EOS)
+ ? isWORDCHAR_L1('\n')
+ : isWORDCHAR_L1(nextchr);
- /* Here the string isn't utf8, or is utf8 and only ascii
- * characters are to match \w. In the latter case looking at
- * the byte just prior to the current one may be just the final
- * byte of a multi-byte character. This is ok. There are two
- * cases:
- * 1) it is a single byte character, and then the test is doing
- * just what it's supposed to.
- * 2) it is a multi-byte character, in which case the final
- * byte is never mistakable for ASCII, and so the test
- * will say it is not a word character, which is the
- * correct answer. */
- ln = (locinput != reginfo->strbeg) ?
- UCHARAT(locinput - 1) : '\n';
- switch (FLAGS(scan)) {
- case REGEX_UNICODE_CHARSET:
- ln = isWORDCHAR_L1(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
- break;
- case REGEX_LOCALE_CHARSET:
- ln = isWORDCHAR_LC(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
- break;
- case REGEX_DEPENDS_CHARSET:
- ln = isWORDCHAR(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- ln = isWORDCHAR_A(ln);
- n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_A(nextchr);
- break;
- default:
- Perl_croak(aTHX_ "panic: Unexpected FLAGS %u in op %u", FLAGS(scan), OP(scan));
- }
}
- /* Note requires that all BOUNDs be lower than all NBOUNDs in
- * regcomp.sym */
- if (((!ln) == (!n)) == (OP(scan) < NBOUND))
- sayNO;
+
+ if (to_complement ^ (ln == n)) {
+ sayNO;
+ }
break;
- case ANYOF: /* /[abc]/ */
+ case ANYOFL: /* /[abc]/l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
+ case ANYOF: /* /[abc]/ */
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
/* FALLTHROUGH */
case POSIXL: /* \w or [:punct:] etc. under /l */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (NEXTCHR_IS_EOS)
sayNO;
}
}
else { /* Here, must be an above Latin-1 code point */
- goto utf8_posix_not_eos;
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(locinput, reginfo->strend);
+ goto utf8_posix_above_latin1;
}
/* Here, must be utf8 */
if (NEXTCHR_IS_EOS) {
sayNO;
}
- utf8_posix_not_eos:
/* Use _generic_isCC() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
locinput += 2;
}
else { /* Handle above Latin-1 code points */
+ utf8_posix_above_latin1:
classnum = (_char_class_number) FLAGS(scan);
if (classnum < _FIRST_NON_SWASH_CC) {
const U8 *fold_array;
UV utf8_fold_flags;
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
goto do_nref_ref_common;
case REFFL: /* /\1/il */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
folder = foldEQ_locale;
fold_array = PL_fold_locale;
utf8_fold_flags = FOLDEQ_LOCALE;
case TAIL: /* placeholder while compiling (A|B|C) */
break;
- case BACK: /* ??? doesn't appear to be used ??? */
- break;
-
#undef ST
#define ST st->u.eval
{
/* and then jump to the code we share with EVAL */
goto eval_recurse_doit;
-
/* NOTREACHED */
- assert(0);
case EVAL: /* /(?{A})B/ /(??{A})B/ and /(?(?{A})X|Y)B/ */
if (cur_eval && cur_eval->locinput==locinput) {
assert(o->op_targ == OP_LEAVE);
o = cUNOPo->op_first;
assert(o->op_type == OP_ENTER);
- o = OP_SIBLING(o);
+ o = OpSIBLING(o);
}
if (o->op_type != OP_STUB) {
assert(!(scan->flags & ~RXf_PMf_COMPILETIME));
re_sv = rex->engine->op_comp(aTHX_ &ret, 1, NULL,
rex->engine, NULL, NULL,
- /* copy /msix etc to inner pattern */
- scan->flags,
+ /* copy /msixn etc to inner pattern */
+ ARG2L(scan),
pm_flags);
if (!(SvFLAGS(ret)
/* now continue from first node in postponed RE */
PUSH_YES_STATE_GOTO(EVAL_AB, startpoint, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
case EVAL_AB: /* cleanup after a successful (??{A})B */
PUSH_YES_STATE_GOTO(CURLYX_end, PREVOPER(next), locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
case CURLYX_end: /* just finished matching all of A*B */
cur_curlyx = ST.prev_curlyx;
sayYES;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CURLYX_end_fail: /* just failed to match all of A*B */
regcpblow(ST.cp);
cur_curlyx = ST.prev_curlyx;
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
#undef ST
PUSH_STATE_GOTO(WHILEM_A_pre, A, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
/* If degenerate A matches "", assume A done. */
PUSH_YES_STATE_GOTO(WHILEM_B_min, ST.save_curlyx->u.curlyx.B,
locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
/* Prefer A over B for maximal matching. */
REGCP_SET(ST.lastcp);
PUSH_STATE_GOTO(WHILEM_A_max, A, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
goto do_whilem_B_max;
}
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case WHILEM_B_min: /* just matched B in a minimal match */
case WHILEM_B_max: /* just matched B in a maximal match */
cur_curlyx = ST.save_curlyx;
sayYES;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case WHILEM_B_max_fail: /* just failed to match B in a maximal match */
cur_curlyx = ST.save_curlyx;
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case WHILEM_A_min_fail: /* just failed to match A in a minimal match */
/* FALLTHROUGH */
cur_curlyx->u.curlyx.count--;
CACHEsayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case WHILEM_A_max_fail: /* just failed to match A in a maximal match */
REGCP_UNWIND(ST.lastcp);
PUSH_YES_STATE_GOTO(WHILEM_B_max, ST.save_curlyx->u.curlyx.B,
locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case WHILEM_B_min_fail: /* just failed to match B in a minimal match */
cur_curlyx = ST.save_curlyx;
/*A*/ NEXTOPER(ST.save_curlyx->u.curlyx.me) + EXTRA_STEP_2ARGS,
locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
#undef ST
#define ST st->u.branch
PUSH_STATE_GOTO(BRANCH_next, scan, locinput);
}
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CUTGROUP: /* /(*THEN)/ */
sv_yes_mark = st->u.mark.mark_name = scan->flags ? NULL :
MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(CUTGROUP_next, next, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CUTGROUP_next_fail:
do_cutgroup = 1;
sv_commit = st->u.mark.mark_name;
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case BRANCH_next:
sayYES;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case BRANCH_next_fail: /* that branch failed; try the next, if any */
if (do_cutgroup) {
}
continue; /* execute next BRANCH[J] op */
/* NOTREACHED */
- assert(0);
case MINMOD: /* next op will be non-greedy, e.g. A*? */
minmod = 1;
curlym_do_A: /* execute the A in /A{m,n}B/ */
PUSH_YES_STATE_GOTO(CURLYM_A, ST.A, locinput); /* match A */
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CURLYM_A: /* we've just matched an A */
ST.count++;
PUSH_STATE_GOTO(CURLYM_B, ST.B, locinput); /* match B */
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CURLYM_B_fail: /* just failed to match a B */
REGCP_UNWIND(ST.cp);
goto curly_try_B_max;
}
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CURLY_B_min_known_fail:
/* failed to find B in a non-greedy match where c1,c2 valid */
PUSH_STATE_GOTO(CURLY_B_min_known, ST.B, locinput);
}
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case CURLY_B_min_fail:
/* failed to find B in a non-greedy match where c1,c2 invalid */
}
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
curly_try_B_max:
/* a successful greedy match: now try to match B */
CURLY_SETPAREN(ST.paren, ST.count);
PUSH_STATE_GOTO(CURLY_B_max, ST.B, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
}
/* FALLTHROUGH */
/* execute body of (?...A) */
PUSH_YES_STATE_GOTO(IFMATCH_A, NEXTOPER(NEXTOPER(scan)), newstart);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
case IFMATCH_A_fail: /* body of (?...A) failed */
sv_yes_mark = sv_commit = MUTABLE_SV(rexi->data->data[ ARG( scan ) ]);
PUSH_STATE_GOTO(COMMIT_next, next, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case COMMIT_next_fail:
no_final = 1;
case OPFAIL: /* (*FAIL) */
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
#define ST st->u.mark
case MARKPOINT: /* (*MARK:foo) */
ST.mark_loc = locinput;
PUSH_YES_STATE_GOTO(MARKPOINT_next, next, locinput);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case MARKPOINT_next:
mark_state = ST.prev_mark;
sayYES;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case MARKPOINT_next_fail:
if (popmark && sv_eq(ST.mark_name,popmark))
mark_state->u.mark.mark_name : NULL;
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
case SKIP: /* (*SKIP) */
if (scan->flags) {
no_final = 1;
sayNO;
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
#undef ST
case LNBREAK: /* \R */
scan = next; /* prepare to execute the next op and ... */
continue; /* ... jump back to the top, reusing st */
/* NOTREACHED */
- assert(0);
push_yes_state:
/* push a state that backtracks on success */
st = newst;
continue;
/* NOTREACHED */
- assert(0);
}
}
Perl_croak(aTHX_ "corrupted regexp pointers");
/* NOTREACHED */
sayNO;
+ NOT_REACHED;
-yes:
+ yes:
if (yes_state) {
/* we have successfully completed a subexpression, but we must now
* pop to the state marked by yes_state and continue from there */
result = 1;
goto final_exit;
-no:
+ no:
DEBUG_EXECUTE_r(
PerlIO_printf(Perl_debug_log,
"%*s %sfailed...%s\n",
PL_colors[4], PL_colors[5])
);
-no_silent:
+ no_silent:
if (no_final) {
if (yes_state) {
goto yes;
scan = loceol;
}
break;
+ case EXACTL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ if (utf8_target && UTF8_IS_ABOVE_LATIN1(*scan)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_UTF8_MSG(scan, loceol);
+ }
+ /* FALLTHROUGH */
case EXACT:
assert(STR_LEN(p) == reginfo->is_utf8_pat ? UTF8SKIP(STRING(p)) : 1);
goto do_exactf;
case EXACTFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
utf8_flags = FOLDEQ_LOCALE;
goto do_exactf;
utf8_flags = 0;
goto do_exactf;
+ case EXACTFLU8:
+ if (! utf8_target) {
+ break;
+ }
+ utf8_flags = FOLDEQ_LOCALE | FOLDEQ_S2_ALREADY_FOLDED
+ | FOLDEQ_S2_FOLDS_SANE;
+ goto do_exactf;
+
case EXACTFU_SS:
case EXACTFU:
utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
}
break;
}
+ case ANYOFL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case ANYOF:
if (utf8_target) {
while (hardcount < max
/* FALLTHROUGH */
case POSIXL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
if (! utf8_target) {
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
}
break;
+ case BOUNDL:
+ case NBOUNDL:
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ /* FALLTHROUGH */
case BOUND:
case BOUNDA:
- case BOUNDL:
case BOUNDU:
case EOS:
case GPOS:
case KEEPS:
case NBOUND:
case NBOUNDA:
- case NBOUNDL:
case NBOUNDU:
case OPFAIL:
case SBOL:
default:
Perl_croak(aTHX_ "panic: regrepeat() called with unrecognized node type %d='%s'", OP(p), PL_reg_name[OP(p)]);
/* NOTREACHED */
- assert(0);
+ NOT_REACHED;
}
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_EXECUTE_r({
SV * const prop = sv_newmortal();
- regprop(prog, prop, p, reginfo);
+ regprop(prog, prop, p, reginfo, NULL);
PerlIO_printf(Perl_debug_log,
"%*s %s can match %"IVdf" times out of %"IVdf"...\n",
REPORT_CODE_OFF + depth*2, "", SvPVX_const(prop),(IV)c,(IV)max);
*altsvp = NULL;
}
- return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL));
+ return newSVsv(_get_regclass_nonbitmap_data(prog, node, doinit, listsvp, NULL, NULL));
}
-SV *
-Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
- const regnode* node,
- bool doinit,
- SV** listsvp,
- SV** only_utf8_locale_ptr)
-{
- /* For internal core use only.
- * Returns the swash for the input 'node' in the regex 'prog'.
- * If <doinit> is 'true', will attempt to create the swash if not already
- * done.
- * If <listsvp> is non-null, will return the printable contents of the
- * swash. This can be used to get debugging information even before the
- * swash exists, by calling this function with 'doinit' set to false, in
- * which case the components that will be used to eventually create the
- * swash are returned (in a printable form).
- * Tied intimately to how regcomp.c sets up the data structure */
-
- SV *sw = NULL;
- SV *si = NULL; /* Input swash initialization string */
- SV* invlist = NULL;
-
- RXi_GET_DECL(prog,progi);
- const struct reg_data * const data = prog ? progi->data : NULL;
-
- PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
-
- assert(ANYOF_FLAGS(node)
- & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
-
- if (data && data->count) {
- const U32 n = ARG(node);
-
- if (data->what[n] == 's') {
- SV * const rv = MUTABLE_SV(data->data[n]);
- AV * const av = MUTABLE_AV(SvRV(rv));
- SV **const ary = AvARRAY(av);
- U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
-
- si = *ary; /* ary[0] = the string to initialize the swash with */
-
- /* Elements 3 and 4 are either both present or both absent. [3] is
- * any inversion list generated at compile time; [4] indicates if
- * that inversion list has any user-defined properties in it. */
- if (av_tindex(av) >= 2) {
- if (only_utf8_locale_ptr
- && ary[2]
- && ary[2] != &PL_sv_undef)
- {
- *only_utf8_locale_ptr = ary[2];
- }
- else {
- assert(only_utf8_locale_ptr);
- *only_utf8_locale_ptr = NULL;
- }
-
- if (av_tindex(av) >= 3) {
- invlist = ary[3];
- if (SvUV(ary[4])) {
- swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
- }
- }
- else {
- invlist = NULL;
- }
- }
-
- /* Element [1] is reserved for the set-up swash. If already there,
- * return it; if not, create it and store it there */
- if (ary[1] && SvROK(ary[1])) {
- sw = ary[1];
- }
- else if (doinit && ((si && si != &PL_sv_undef)
- || (invlist && invlist != &PL_sv_undef))) {
- assert(si);
- sw = _core_swash_init("utf8", /* the utf8 package */
- "", /* nameless */
- si,
- 1, /* binary */
- 0, /* not from tr/// */
- invlist,
- &swash_init_flags);
- (void)av_store(av, 1, sw);
- }
- }
- }
-
- /* If requested, return a printable version of what this swash matches */
- if (listsvp) {
- SV* matches_string = newSVpvs("");
-
- /* The swash should be used, if possible, to get the data, as it
- * contains the resolved data. But this function can be called at
- * compile-time, before everything gets resolved, in which case we
- * return the currently best available information, which is the string
- * that will eventually be used to do that resolving, 'si' */
- if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
- && (si && si != &PL_sv_undef))
- {
- sv_catsv(matches_string, si);
- }
-
- /* Add the inversion list to whatever we have. This may have come from
- * the swash, or from an input parameter */
- if (invlist) {
- sv_catsv(matches_string, _invlist_contents(invlist));
- }
- *listsvp = matches_string;
- }
-
- return sw;
-}
#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/*
- reginclass - determine if a character falls into a character class
- n is the ANYOF regnode
+ n is the ANYOF-type regnode
p is the target string
p_end points to one byte beyond the end of the target string
utf8_target tells whether p is in UTF-8.
* UTF8_ALLOW_FFFF */
if (c_len == (STRLEN)-1)
Perl_croak(aTHX_ "Malformed UTF-8 character (fatal)");
+ if (c > 255 && OP(n) == ANYOFL && ! is_ANYOF_SYNTHETIC(n)) {
+ _CHECK_AND_OUTPUT_WIDE_LOCALE_CP_MSG(c);
+ }
}
/* If this character is potentially in the bitmap, check it */
- if (c < 256) {
+ if (c < NUM_ANYOF_CODE_POINTS) {
if (ANYOF_BITMAP_TEST(n, c))
match = TRUE;
- else if (flags & ANYOF_NON_UTF8_NON_ASCII_ALL
- && ! utf8_target
- && ! isASCII(c))
+ else if ((flags & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII)
+ && ! utf8_target
+ && ! isASCII(c))
{
match = TRUE;
}
else if (flags & ANYOF_LOCALE_FLAGS) {
- if (flags & ANYOF_LOC_FOLD) {
- if (ANYOF_BITMAP_TEST(n, PL_fold_locale[c])) {
- match = TRUE;
- }
+ if ((flags & ANYOF_LOC_FOLD)
+ && c < 256
+ && ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
+ {
+ match = TRUE;
}
- if (! match && ANYOF_POSIXL_TEST_ANY_SET(n)) {
+ else if (ANYOF_POSIXL_TEST_ANY_SET(n)
+ && c < 256
+ ) {
/* The data structure is arranged so bits 0, 2, 4, ... are set
* if the class includes the Posix character class given by
/* If the bitmap didn't (or couldn't) match, and something outside the
* bitmap could match, try that. */
if (!match) {
- if (c >= 256 && (flags & ANYOF_ABOVE_LATIN1_ALL)) {
- match = TRUE; /* Everything above 255 matches */
+ if (c >= NUM_ANYOF_CODE_POINTS
+ && (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP))
+ {
+ match = TRUE; /* Everything above the bitmap matches */
}
- else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
- || (utf8_target && (flags & ANYOF_UTF8))
+ else if ((flags & ANYOF_HAS_NONBITMAP_NON_UTF8_MATCHES)
+ || (utf8_target && (flags & ANYOF_HAS_UTF8_NONBITMAP_MATCHES))
|| ((flags & ANYOF_LOC_FOLD)
&& IN_UTF8_CTYPE_LOCALE
- && ARG(n) != ANYOF_NONBITMAP_EMPTY))
+ && ARG(n) != ANYOF_ONLY_HAS_BITMAP))
{
SV* only_utf8_locale = NULL;
SV * const sw = _get_regclass_nonbitmap_data(prog, n, TRUE, 0,
- &only_utf8_locale);
+ &only_utf8_locale, NULL);
if (sw) {
U8 utf8_buffer[2];
U8 * utf8_p;