I32 orig_utf8; /* whether the pattern was originally in utf8 */
/* XXX use this for future optimisation of case
* where pattern must be upgraded to utf8. */
+ I32 uni_semantics; /* If a d charset modifier should use unicode
+ rules, even if the pattern is not in
+ utf8 */
HV *paren_names; /* Paren names */
regnode **recurse; /* Recurse regops */
I32 recurse_count; /* Number of recurse regops */
+ I32 in_lookbehind; /* Non-zero while the parse is inside a
+ lookbehind assertion; multi-character
+ folds are suppressed while it is set */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
+#define RExC_uni_semantics (pRExC_state->uni_semantics)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define RExC_open_parens (pRExC_state->open_parens)
#define RExC_close_parens (pRExC_state->close_parens)
#define RExC_paren_names (pRExC_state->paren_names)
#define RExC_recurse (pRExC_state->recurse)
#define RExC_recurse_count (pRExC_state->recurse_count)
+#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define HASWIDTH 0x01 /* Known to match non-null strings. */
/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. */
+ * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define UTF cBOOL(RExC_utf8)
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
+#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET) /* charset is exactly the "depends" one */
+#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET) /* the Unicode
+ charset, or any charset whose value compares greater than it.
+ NOTE(review): this relies on the numeric order of the charset
+ values, which is declared outside this section -- confirm which
+ charsets are meant to be included. */
+#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET) /* charset is exactly the ASCII-restricted one */
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
- if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (!isWORDCHAR_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ else if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
- if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ if (OP(scan) == NALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
else {
- for (value = 0; value < 256; value++)
- if (!isALNUM(value))
- ANYOF_BITMAP_SET(data->start_class, value);
+ if (OP(scan) == NALNUMU) {
+ for (value = 0; value < 256; value++) {
+ if (! isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (! isALNUM(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ }
}
}
break;
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
- if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ if (OP(scan) == SPACEU) {
for (value = 0; value < 256; value++) {
if (!isSPACE_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
if (data->start_class->flags & ANYOF_LOCALE) {
ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
}
- else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ else if (OP(scan) == SPACEU) {
for (value = 0; value < 256; value++) {
if (isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
- if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ if (OP(scan) == NSPACEU) {
for (value = 0; value < 256; value++) {
if (isSPACE_L1(value)) {
ANYOF_BITMAP_CLEAR(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- else if (FLAGS(scan) == REGEX_UNICODE_CHARSET) {
+ else if (OP(scan) == NSPACEU) {
for (value = 0; value < 256; value++) {
if (!isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
DEBUG_r(if (!PL_colorset) reginitcolors());
RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
+ RExC_uni_semantics = 0;
/****************** LONG JUMP TARGET HERE***********************/
/* Longjmp back to here if have to switch in midstream to utf8 */
RExC_sawback = 0;
RExC_seen = 0;
+ RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_seen_evals = 0;
RExC_extralen = 0;
if (used_setjump) {
JMPENV_POP;
}
+
DEBUG_PARSE_r({
PerlIO_printf(Perl_debug_log,
"Required size %"IVdf" nodes\n"
RExC_lastnum=0;
RExC_lastparse=NULL;
});
+
+ /* The first pass could have found things that force Unicode semantics */
+ if ((RExC_utf8 || RExC_uni_semantics)
+ && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
+ {
+ set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+ }
+
/* Small enough for pointer-storage convention?
If extralen==0, this means that we will not need long jumps. */
if (RExC_size >= 0x10000L && RExC_extralen)
else {
regnode *first = ri->program + 1;
U8 fop = OP(first);
- U8 nop = OP(NEXTOPER(first));
-
- if (PL_regkind[fop] == NOTHING && nop == END)
+
+ if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
r->extflags |= RXf_NULL;
- else if (PL_regkind[fop] == BOL && nop == END)
+ else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
r->extflags |= RXf_START_ONLY;
- else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END)
+ else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
+ && OP(regnext(first)) == END)
r->extflags |= RXf_WHITE;
}
#endif
goto capturing_parens;
}
RExC_seen |= REG_SEEN_LOOKBEHIND;
+ RExC_in_lookbehind++;
RExC_parse++;
case '=': /* (?=...) */
RExC_seen_zerolen++;
that follow */
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- if (RExC_utf8) { /* But the default for a utf8 pattern is
- unicode semantics */
- set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET);
- }
+ set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET);
goto parse_flags;
default:
--RExC_parse;
cs = REGEX_UNICODE_CHARSET;
has_charset_modifier = 1;
break;
+ case ASCII_RESTRICT_PAT_MOD:
+ if (has_charset_modifier || flagsp == &negflags) {
+ goto fail_modifiers;
+ }
+ cs = REGEX_ASCII_RESTRICTED_CHARSET;
+ has_charset_modifier = 1;
+ break;
case DEPENDS_PAT_MOD:
if (has_use_defaults
|| has_charset_modifier
/* The dual charset means unicode semantics if the
* pattern (or target, not known until runtime) are
- * utf8 */
- cs = (RExC_utf8)
+ * utf8, or something in the pattern indicates unicode
+ * semantics */
+ cs = (RExC_utf8 || RExC_uni_semantics)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET;
has_charset_modifier = 1;
FAIL("Junk on end of regexp"); /* "Can't happen". */
/* NOTREACHED */
}
+
+ if (RExC_in_lookbehind) { /* Leave one level of lookbehind nesting.
+ NOTE(review): in the code visible here the
+ counter is raised only where a lookbehind
+ opens, yet this lowers it whenever it is
+ non-zero -- verify that a group nested
+ inside a lookbehind cannot lower it before
+ the lookbehind itself has finished
+ parsing. */
+ RExC_in_lookbehind--;
+ }
if (after_freeze)
RExC_npar = after_freeze;
return(ret);
register regnode *ret = NULL;
I32 flags;
char *parse_start = RExC_parse;
+ U8 op;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
*flagp |= HASWIDTH;
goto finish_meta_pat;
case 'w':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(ALNUML));
- } else {
- ret = reg_node(pRExC_state, (U8)(ALNUM));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = ALNUML;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = ALNUMU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = ALNUMA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = ALNUM;
+ break;
+ default:
+ goto bad_charset;
}
- FLAGS(ret) = get_regex_charset(RExC_flags);
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'W':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NALNUML));
- } else {
- ret = reg_node(pRExC_state, (U8)(NALNUM));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NALNUML;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NALNUMU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NALNUMA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NALNUM;
+ break;
+ default:
+ goto bad_charset;
}
- FLAGS(ret) = get_regex_charset(RExC_flags);
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(BOUNDL));
- } else {
- ret = reg_node(pRExC_state, (U8)(BOUND));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = BOUNDL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = BOUNDU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = BOUNDA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = BOUND;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NBOUNDL));
- } else {
- ret = reg_node(pRExC_state, (U8)(NBOUND));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NBOUNDL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NBOUNDU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NBOUNDA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NBOUND;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
goto finish_meta_pat;
case 's':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(SPACEL));
- } else {
- ret = reg_node(pRExC_state, (U8)(SPACE));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = SPACEL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = SPACEU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = SPACEA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = SPACE;
+ break;
+ default:
+ goto bad_charset;
}
- FLAGS(ret) = get_regex_charset(RExC_flags);
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'S':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NSPACEL));
- } else {
- ret = reg_node(pRExC_state, (U8)(NSPACE));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NSPACEL;
+ break;
+ case REGEX_UNICODE_CHARSET:
+ op = NSPACEU;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NSPACEA;
+ break;
+ case REGEX_DEPENDS_CHARSET:
+ op = NSPACE;
+ break;
+ default:
+ goto bad_charset;
}
- FLAGS(ret) = get_regex_charset(RExC_flags);
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'd':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(DIGITL));
- } else {
- ret = reg_node(pRExC_state, (U8)(DIGIT));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = DIGITL;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = DIGITA;
+ break;
+ case REGEX_DEPENDS_CHARSET: /* No difference between these */
+ case REGEX_UNICODE_CHARSET:
+ op = DIGIT;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'D':
- if (LOC) {
- ret = reg_node(pRExC_state, (U8)(NDIGITL));
- } else {
- ret = reg_node(pRExC_state, (U8)(NDIGIT));
+ switch (get_regex_charset(RExC_flags)) {
+ case REGEX_LOCALE_CHARSET:
+ op = NDIGITL;
+ break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ op = NDIGITA;
+ break;
+ case REGEX_DEPENDS_CHARSET: /* No difference between these */
+ case REGEX_UNICODE_CHARSET:
+ op = NDIGIT;
+ break;
+ default:
+ goto bad_charset;
}
+ ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'R':
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? NREFFU
: (LOC)
? NREFFL
ret = reganode(pRExC_state,
((! FOLD)
? REF
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? REFFU
: (LOC)
? REFFL
(U8) ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
- : (UNI_SEMANTICS)
+ : (AT_LEAST_UNI_SEMANTICS)
? EXACTFU
: EXACTF)
);
}
return(ret);
+
+/* Jumped to when an unrecognized character set is encountered */
+bad_charset:
+ Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
+ return(NULL);
}
STATIC char *
* there are two tests passed in, to use depending on that. There aren't any
* cases where the label is different from the name, so no need for that
* parameter */
-#define _C_C_T_(NAME,TEST_8,TEST_7,WORD) \
+#define _C_C_T_(NAME, TEST_8, TEST_7, WORD) \
ANYOF_##NAME: \
if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
- if (TEST_8) stored += \
+ if (TEST_8(value)) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
- if (TEST_7) stored += \
+ if (TEST_7(UNI_TO_NATIVE(value))) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, \
(U8) UNI_TO_NATIVE(value)); \
} \
if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
- if (! TEST_8) stored += \
+ if (! TEST_8(value)) stored += \
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
- if (! TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
+ if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit( \
+ aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
} \
- /* For a non-ut8 target string with DEPENDS semantics, all above ASCII \
- * Latin1 code points match the complement of any of the classes. But \
- * in utf8, they have their Unicode semantics, so can't just set them \
- * in the bitmap, or else regexec.c will think they matched when they \
- * shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ if (ASCII_RESTRICTED) { \
+ for (value = 128; value < 256; value++) { \
+ stored += S_set_regclass_bit( \
+ aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
+ } \
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
+ } \
+ else { \
+ /* For a non-utf8 target string with DEPENDS semantics, all above \
+ * ASCII Latin1 code points match the complement of any of the \
+ * classes. But in utf8, they have their Unicode semantics, so \
+ * can't just set them in the bitmap, or else regexec.c will think \
+ * they matched when they shouldn't. */ \
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ } \
} \
yesno = '!'; \
what = WORD; \
U8 stored = 0;
U8 fold;
- fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+ fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
: PL_fold[value];
/* It assumes the bit for 'value' has already been set */
e = RExC_parse;
n = 1;
}
- if (!SIZE_ONLY) {
+ if (SIZE_ONLY) {
+ if (LOC) {
+ ckWARN2reg(RExC_parse,
+ "\\%c uses Unicode rules, not locale rules",
+ (int) value);
+ }
+ }
+ else {
if (UCHARAT(RExC_parse) == '^') {
RExC_parse++;
n--;
/* The \p could match something in the Latin1 range, hence
* something that isn't utf8 */
ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
- if (FOLD) { /* And one of these could have a multi-char fold */
- OP(ret) = ANYOFV;
- }
namedclass = ANYOF_MAX; /* no official name, but it's named */
+
+ /* \p means they want Unicode semantics */
+ RExC_uni_semantics = 1;
}
break;
case 'n': value = '\n'; break;
* --jhi */
switch ((I32)namedclass) {
- case _C_C_T_(ALNUMC, isALNUMC_L1(value), isALNUMC(value), "XPosixAlnum");
- case _C_C_T_(ALPHA, isALPHA_L1(value), isALPHA(value), "XPosixAlpha");
- case _C_C_T_(BLANK, isBLANK_L1(value), isBLANK(value), "XPosixBlank");
- case _C_C_T_(CNTRL, isCNTRL_L1(value), isCNTRL(value), "XPosixCntrl");
- case _C_C_T_(GRAPH, isGRAPH_L1(value), isGRAPH(value), "XPosixGraph");
- case _C_C_T_(LOWER, isLOWER_L1(value), isLOWER(value), "XPosixLower");
- case _C_C_T_(PRINT, isPRINT_L1(value), isPRINT(value), "XPosixPrint");
- case _C_C_T_(PSXSPC, isPSXSPC_L1(value), isPSXSPC(value), "XPosixSpace");
- case _C_C_T_(PUNCT, isPUNCT_L1(value), isPUNCT(value), "XPosixPunct");
- case _C_C_T_(UPPER, isUPPER_L1(value), isUPPER(value), "XPosixUpper");
+ case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum");
+ case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha");
+ case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank");
+ case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl");
+ case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph");
+ case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower");
+ case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint");
+ case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
+ case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
+ case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
/* \s, \w match all unicode if utf8. */
- case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "SpacePerl");
- case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "Word");
+ case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
#else
/* \s, \w match ascii and locale only */
- case _C_C_T_(SPACE, isSPACE_L1(value), isSPACE(value), "PerlSpace");
- case _C_C_T_(ALNUM, isWORDCHAR_L1(value), isALNUM(value), "PerlWord");
+ case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
+ case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
#endif
- case _C_C_T_(XDIGIT, isXDIGIT_L1(value), isXDIGIT(value), "XPosixXDigit");
+ case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
case ANYOF_ASCII:
stored +=
S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
}
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
yesno = '!';
what = "ASCII";
break;
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
+ if (ASCII_RESTRICTED ) {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ }
break;
case ANYOF_MAX:
/* this is to handle \p and \P */
vFAIL("Invalid [::] class");
break;
}
- if (what) {
+ if (what && ! (ASCII_RESTRICTED)) {
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
/* Currently, we don't look at every value in the range.
* Therefore we have to assume the worst case: that if
- * folding, it will match more than one character */
- if (FOLD) {
+ * folding, it will match more than one character. But in
+ * lookbehind patterns, can only be single character
+ * length, so disallow those folds */
+ if (FOLD && ! RExC_in_lookbehind) {
OP(ret) = ANYOFV;
}
}
#endif
Perl_sv_catpvf(aTHX_ listsv,
"%04"UVxf"\n", f);
- else {
+ else if (! RExC_in_lookbehind) {
/* Any multicharacter foldings
+ * (disallowed in lookbehind patterns)
* require the following transform:
* [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
* where E folds into "pq" and F folds
/* This is the one character in the bitmap that needs special handling
* under non-locale folding, as it folds to two characters 'ss'. This
* happens if it is set and not inverting, or isn't set and are
- * inverting */
+ * inverting (disallowed in lookbehind patterns because they can't be
+ * variable length) */
if (! LOC
+ && ! RExC_in_lookbehind
&& (cBOOL(ANYOF_BITMAP_TEST(ret, LATIN_SMALL_LETTER_SHARP_S))
^ cBOOL(ANYOF_FLAGS(ret) & ANYOF_INVERT)))
{
op = EXACT;
}
} /* else 2 chars in the bit map: the folds of each other */
- else if (UNI_SEMANTICS || !isASCII(value)) {
+ else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
/* To join adjacent nodes, they must be the exact EXACTish type.
* Try to use the most likely type, by using EXACTFU if the regex
case REGEX_LOCALE_CHARSET:
PerlIO_printf(Perl_debug_log, "LOCALE");
break;
+ case REGEX_ASCII_RESTRICTED_CHARSET:
+ PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
+ break;
default:
PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
break;