#include "inline_invlist.c"
#include "unicode_constants.h"
-#define RF_tainted 1 /* tainted information used? e.g. locale */
-
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#ifndef STATIC
switch ((_char_class_number) classnum) {
case _CC_ENUM_ALPHANUMERIC: return isALPHANUMERIC_LC(character);
case _CC_ENUM_ALPHA: return isALPHA_LC(character);
+ case _CC_ENUM_ASCII: return isASCII_LC(character);
+ case _CC_ENUM_BLANK: return isBLANK_LC(character);
+ case _CC_ENUM_CASED: return isLOWER_LC(character)
+ || isUPPER_LC(character);
+ case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
case _CC_ENUM_DIGIT: return isDIGIT_LC(character);
case _CC_ENUM_GRAPH: return isGRAPH_LC(character);
case _CC_ENUM_LOWER: return isLOWER_LC(character);
case _CC_ENUM_PRINT: return isPRINT_LC(character);
+ case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
case _CC_ENUM_PUNCT: return isPUNCT_LC(character);
+ case _CC_ENUM_SPACE: return isSPACE_LC(character);
case _CC_ENUM_UPPER: return isUPPER_LC(character);
case _CC_ENUM_WORDCHAR: return isWORDCHAR_LC(character);
- case _CC_ENUM_SPACE: return isSPACE_LC(character);
- case _CC_ENUM_BLANK: return isBLANK_LC(character);
case _CC_ENUM_XDIGIT: return isXDIGIT_LC(character);
- case _CC_ENUM_CNTRL: return isCNTRL_LC(character);
- case _CC_ENUM_PSXSPC: return isPSXSPC_LC(character);
- case _CC_ENUM_ASCII: return isASCII_LC(character);
default: /* VERTSPACE should never occur in locales */
Perl_croak(aTHX_ "panic: isFOO_lc() has an unexpected character class '%d'", classnum);
}
/* We know what class it must start with. */
switch (OP(c)) {
case ANYOF:
+ case ANYOF_SYNTHETIC:
+ case ANYOF_WARN_SUPER:
if (utf8_target) {
REXEC_FBC_UTF8_CLASS_SCAN(
reginclass(prog, c, (U8*)s, utf8_target));
break;
}
case BOUNDL:
- PL_reg_flags |= RF_tainted;
- FBC_BOUND(isALNUM_LC,
- isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
- isALNUM_LC_utf8((U8*)s));
+ RXp_MATCH_TAINTED_on(prog);
+ FBC_BOUND(isWORDCHAR_LC,
+ isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+ isWORDCHAR_LC_utf8((U8*)s));
break;
case NBOUNDL:
- PL_reg_flags |= RF_tainted;
- FBC_NBOUND(isALNUM_LC,
- isALNUM_LC_uvchr(UNI_TO_NATIVE(tmp)),
- isALNUM_LC_utf8((U8*)s));
+ RXp_MATCH_TAINTED_on(prog);
+ FBC_NBOUND(isWORDCHAR_LC,
+ isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(tmp)),
+ isWORDCHAR_LC_utf8((U8*)s));
break;
case BOUND:
FBC_BOUND(isWORDCHAR,
- isALNUM_uni(tmp),
+ isWORDCHAR_uni(tmp),
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
break;
case BOUNDA:
break;
case NBOUND:
FBC_NBOUND(isWORDCHAR,
- isALNUM_uni(tmp),
+ isWORDCHAR_uni(tmp),
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
break;
case NBOUNDA:
break;
case BOUNDU:
FBC_BOUND(isWORDCHAR_L1,
- isALNUM_uni(tmp),
+ isWORDCHAR_uni(tmp),
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
break;
case NBOUNDU:
FBC_NBOUND(isWORDCHAR_L1,
- isALNUM_uni(tmp),
+ isWORDCHAR_uni(tmp),
cBOOL(swash_fetch(PL_utf8_swash_ptrs[_CC_WORDCHAR], (U8*)s, utf8_target)));
break;
case LNBREAK:
/* FALLTHROUGH */
case POSIXL:
- PL_reg_flags |= RF_tainted;
+ RXp_MATCH_TAINTED_on(prog);
REXEC_FBC_CSCAN(to_complement ^ cBOOL(isFOO_utf8_lc(FLAGS(c), (U8 *) s)),
to_complement ^ cBOOL(isFOO_lc(FLAGS(c), *s)));
break;
Perl_croak(aTHX_ "corrupted regexp program");
}
- PL_reg_flags = 0;
+ RX_MATCH_TAINTED_off(rx);
PL_reg_state.re_state_eval_setup_done = FALSE;
PL_reg_maxiter = 0;
);
);
Safefree(swap);
- RX_MATCH_TAINTED_set(rx, PL_reg_flags & RF_tainted);
if (PL_reg_state.re_state_eval_setup_done)
restore_pos(aTHX_ prog);
const char * s;
U32 fold_utf8_flags;
- PL_reg_flags |= RF_tainted;
+ RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
fold_utf8_flags = FOLDEQ_UTF8_LOCALE;
* have to set the FLAGS fields of these */
case BOUNDL: /* /\b/l */
case NBOUNDL: /* /\B/l */
- PL_reg_flags |= RF_tainted;
+ RX_MATCH_TAINTED_on(reginfo->prog);
/* FALL THROUGH */
case BOUND: /* /\b/ */
case BOUNDU: /* /\b/u */
ln = utf8n_to_uvchr(r, UTF8SKIP(r), 0, uniflags);
}
if (FLAGS(scan) != REGEX_LOCALE_CHARSET) {
- ln = isALNUM_uni(ln);
+ ln = isWORDCHAR_uni(ln);
if (NEXTCHR_IS_EOS)
n = 0;
else {
}
}
else {
- ln = isALNUM_LC_uvchr(UNI_TO_NATIVE(ln));
- n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC_utf8((U8*)locinput);
+ ln = isWORDCHAR_LC_uvchr(UNI_TO_NATIVE(ln));
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC_utf8((U8*)locinput);
}
}
else {
n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_L1(nextchr);
break;
case REGEX_LOCALE_CHARSET:
- ln = isALNUM_LC(ln);
- n = NEXTCHR_IS_EOS ? 0 : isALNUM_LC(nextchr);
+ ln = isWORDCHAR_LC(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR_LC(nextchr);
break;
case REGEX_DEPENDS_CHARSET:
- ln = isALNUM(ln);
- n = NEXTCHR_IS_EOS ? 0 : isALNUM(nextchr);
+ ln = isWORDCHAR(ln);
+ n = NEXTCHR_IS_EOS ? 0 : isWORDCHAR(nextchr);
break;
case REGEX_ASCII_RESTRICTED_CHARSET:
case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
break;
case ANYOF: /* /[abc]/ */
+ case ANYOF_WARN_SUPER:
if (NEXTCHR_IS_EOS)
sayNO;
if (utf8_target) {
/* The locale hasn't influenced the outcome before this, so defer
* tainting until now */
- PL_reg_flags |= RF_tainted;
+ RX_MATCH_TAINTED_on(reginfo->prog);
/* Use isFOO_lc() for characters within Latin1. (Note that
* UTF8_IS_INVARIANT works even on non-UTF-8 strings, or else
const U8 *fold_array;
UV utf8_fold_flags;
- PL_reg_flags |= RF_tainted;
+ RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
type = REFFL;
goto do_nref_ref_common;
case REFFL: /* /\1/il */
- PL_reg_flags |= RF_tainted;
+ RX_MATCH_TAINTED_on(reginfo->prog);
folder = foldEQ_locale;
fold_array = PL_fold_locale;
utf8_fold_flags = FOLDEQ_UTF8_LOCALE;
* depth - (for debugging) backtracking depth.
*/
STATIC I32
-S_regrepeat(pTHX_ const regexp *prog, char **startposp, const regnode *p,
+S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
I32 max, int depth, bool is_utf8_pat)
{
dVAR;
goto do_exactf;
case EXACTFL:
- PL_reg_flags |= RF_tainted;
+ RXp_MATCH_TAINTED_on(prog);
utf8_flags = FOLDEQ_UTF8_LOCALE;
goto do_exactf;
break;
}
case ANYOF:
+ case ANYOF_WARN_SUPER:
if (utf8_target) {
while (hardcount < max
&& scan < loceol
/* FALLTHROUGH */
case POSIXL:
- PL_reg_flags |= RF_tainted;
+ RXp_MATCH_TAINTED_on(prog);
if (! utf8_target) {
while (scan < loceol && to_complement ^ cBOOL(isFOO_lc(FLAGS(p),
*scan)))
*/
STATIC bool
-S_reginclass(pTHX_ const regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
+S_reginclass(pTHX_ regexp * const prog, const regnode * const n, const U8* const p, const bool utf8_target)
{
dVAR;
const char flags = ANYOF_FLAGS(n);
match = TRUE;
}
else if (flags & ANYOF_LOCALE) {
- PL_reg_flags |= RF_tainted;
+ RXp_MATCH_TAINTED_on(prog);
if ((flags & ANYOF_LOC_FOLD)
&& ANYOF_BITMAP_TEST(n, PL_fold_locale[c]))
* will be 1, so the exclusive or will reverse things, so we
* are testing for \W. On the third iteration, 'to_complement'
* will be 0, and we would be testing for \s; the fourth
- * iteration would test for \S, etc. */
+ * iteration would test for \S, etc.
+ *
+ * Note that this code assumes that all the classes are closed
+ * under folding. For example, if a character matches \w, then
+ * its fold does too; and vice versa. This should be true for
+ * any well-behaved locale for all the currently defined Posix
+ * classes, except for :lower: and :upper:, which are handled
+ * by the pseudo-class :cased: which matches if either of the
+ * other two does. To get rid of this assumption, an outer
+ * loop could be used below to iterate over both the source
+ * character, and its fold (if different) */
int count = 0;
int to_complement = 0;
|| (utf8_target
&& (c >=256
|| (! (flags & ANYOF_LOCALE))
- || (flags & ANYOF_IS_SYNTHETIC)))))
+ || OP(n) == ANYOF_SYNTHETIC))))
{
SV * const sw = core_regclass_swash(prog, n, TRUE, 0);
if (sw) {
}
if (UNICODE_IS_SUPER(c)
- && (flags & ANYOF_WARN_SUPER)
+ && OP(n) == ANYOF_WARN_SUPER
&& ckWARN_d(WARN_NON_UNICODE))
{
Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),