TRIE_STORE_REVCHAR;
}
if ( set_bit ) {
- /* store the codepoint in the bitmap, and if its ascii
- also store its folded equivelent. */
+ /* store the codepoint in the bitmap, and its folded
+ * equivalent. */
TRIE_BITMAP_SET(trie,uvc);
/* store the folded codepoint */
}
#endif
}
-
- if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
+
+ if (UTF
+ && ( OP(scan) == EXACTF || OP(scan) == EXACTFU)
+ && ( STR_LEN(scan) >= 6 ) )
+ {
/*
Two problematic code points in Unicode casefolding of EXACT nodes:
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ || !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
)
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
+ && !ANYOF_BITMAP_TEST(data->start_class, (UNI_SEMANTICS) ? PL_fold_latin1[uc] : PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
- if (OP(scan) == EXACTFL)
+ if (OP(scan) == EXACTFL) {
data->start_class->flags |= ANYOF_LOCALE;
+ }
+ else {
+
+ /* Also set the other member of the fold pair. Can't
+ * do this for locale, because not known until runtime
+ */
+ ANYOF_BITMAP_SET(data->start_class,
+ (OP(scan) == EXACTFU)
+ ? PL_fold_latin1[uc]
+ : PL_fold[uc]);
+ }
}
}
else if (flags & SCF_DO_STCLASS_OR) {
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- if (uc < 0x100)
+ if (uc < 0x100) {
ANYOF_BITMAP_SET(data->start_class, uc);
+ if (OP(scan) != EXACTFL) {
+
+ /* And set the other member of the fold pair, but
+ * can't do that in locale because not known until
+ * run-time */
+ ANYOF_BITMAP_SET(data->start_class,
+ (OP(scan) == EXACTFU)
+ ? PL_fold_latin1[uc]
+ : PL_fold[uc]);
+ }
+ }
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, and_withp);
f |= SCF_DO_STCLASS_AND;
f &= ~SCF_DO_STCLASS_OR;
}
- /* These are the cases when once a subexpression
- fails at a particular position, it cannot succeed
- even after backtracking at the enclosing scope.
-
- XXXX what if minimal match and we are at the
- initial run of {n,m}? */
- if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
+ /* Exclude from super-linear cache processing any {n,m}
+ regops for which the combination of input pos and regex
+ pos is not enough information to determine if a match
+ will be possible.
+
+ For example, in the regex /foo(bar\s*){4,8}baz/ with the
+ regex pos at the \s*, the prospects for a match depend not
+ only on the input position but also on how many (bar\s*)
+ repeats into the {4,8} we are. */
+ if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
f &= ~SCF_WHILEM_VISITED_POS;
/* This will finish on WHILEM, setting scan, or on NULL: */
if (PL_regkind[OP(first)] == EXACT) {
if (OP(first) == EXACT)
NOOP; /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
+ else
ri->regstclass = first;
}
#ifdef TRIE_STCLASS
ENTER;
Perl_save_re_context(aTHX);
- rop = sv_compile_2op(sv, &sop, "re", &pad);
+ rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
sop->op_private |= OPpREFCOUNTED;
/* re_dup will OpREFCNT_inc */
OpREFCNT_set(sop, 1);
that follow */
has_use_defaults = TRUE;
STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
- RExC_flags &= ~(RXf_PMf_LOCALE|RXf_PMf_UNICODE);
goto parse_flags;
default:
--RExC_parse;
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ ret = reg_node(pRExC_state, (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : UNI_SEMANTICS
+ ? EXACTFU
+ : EXACTF));
s= STRING(ret);
/* Exact nodes can hold only a U8 length's of text = 255. Loop through
defchar:
ender = 0;
ret = reg_node(pRExC_state,
- (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
+ (U8) ((! FOLD) ? EXACT
+ : (LOC)
+ ? EXACTFL
+ : (UNI_SEMANTICS)
+ ? EXACTFU
+ : EXACTF)
+ );
s = STRING(ret);
for (len = 0, p = RExC_parse - 1;
len < 127 && p < RExC_end;
ANYOF_##NAME: \
for (value = 0; value < 256; value++) \
if (TEST) \
- ANYOF_BITMAP_SET(ret, value); \
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
yesno = '+'; \
what = WORD; \
break; \
case ANYOF_N##NAME: \
for (value = 0; value < 256; value++) \
if (!TEST) \
- ANYOF_BITMAP_SET(ret, value); \
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
yesno = '!'; \
what = WORD; \
break
if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
- if (TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ if (TEST_8) stored += \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
} \
} \
else { \
for (value = 0; value < 256; value++) { \
- if (TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ if (TEST_7) stored += \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
} \
} \
yesno = '+'; \
if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
- if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+ if (! TEST_8) stored += \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
} \
} \
else { \
for (value = 0; value < 256; value++) { \
- if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+ if (! TEST_7) stored += \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
} \
} \
yesno = '!'; \
#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
#endif
+STATIC U8
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+{
+
+ /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
+ * Locale folding is done at run-time, so this function should not be
+ * called for nodes that are for locales.
+ *
+ * This function simply sets the bit corresponding to the fold of the input
+ * 'value', if not already set. The fold of 'f' is 'F', and the fold of
+ * 'F' is 'f'.
+ *
+ * It also sets any necessary flags, and returns the number of bits that
+ * actually changed from 0 to 1 (so the result is either 0 or 1, since at
+ * most one bit, the fold partner, is set here).
+ *
+ * Parameters:
+ *   pRExC_state  compilation state; not named directly in this body
+ *                (NOTE(review): presumably read by UNI_SEMANTICS, confirm
+ *                against that macro's definition)
+ *   node         the ANYOF node whose bitmap and flags are updated
+ *   value        the code point, 0..255, whose fold partner is to be added
+ */
+
+ U8 stored = 0;
+ U8 fold;
+
+ fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+ : PL_fold[value];
+
+ /* It assumes the bit for 'value' has already been set by the caller; only
+ * the fold partner is handled here, and only if it differs from 'value'
+ * and is not yet in the bitmap */
+ if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
+ ANYOF_BITMAP_SET(node, fold);
+ stored++;
+ }
+
+ /* The fold of the German sharp s is two ASCII characters, so isn't in the
+ * bitmap and doesn't have to be in utf8, but we only process it if unicode
+ * semantics are called for */
+ if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
+ ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+ }
+ else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
+ || (! UNI_SEMANTICS
+ && ! isASCII(value)
+ && PL_fold_latin1[value] != value))
+ { /* A character that has a fold outside of Latin1 matches outside the
+ bitmap, but only when the target string is utf8. Similarly, when we
+ don't have unicode semantics, the non-ASCII Latin-1 characters that
+ have a Latin-1 fold should match their fold if the target is utf8,
+ and not otherwise */
+ ANYOF_FLAGS(node) |= ANYOF_UTF8;
+ }
+
+ return stored;
+}
+
+
+PERL_STATIC_INLINE U8
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+{
+ /* This inline function sets a bit in the bitmap if not already set, and if
+ * appropriate, its fold, returning the number of bits that actually
+ * changed from 0 to 1.
+ *
+ * If the bit for 'value' is already set, nothing is done and 0 is
+ * returned; in particular the fold handling is skipped in that case.
+ * Otherwise the bit is set (counting 1), and when folding is in effect
+ * and the pattern is not under locale rules, S_set_regclass_bit_fold() is
+ * called and whatever it reports is added to the count.
+ *
+ * NOTE(review): 'value' is a U32 here but is narrowed to U8 when passed to
+ * S_set_regclass_bit_fold(); this looks like it relies on callers passing
+ * only bitmap-range values (below 256), confirm at the call sites. */
+
+ U8 stored;
+
+ if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
+ return 0;
+ }
+
+ ANYOF_BITMAP_SET(node, value);
+ stored = 1;
+
+ if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
+ stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+ }
+
+ return stored;
+}
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or if the pattern matches a single char only and
(value=='p' ? '+' : '!'), (int)n, RExC_parse);
}
RExC_parse = e + 1;
+
+ /* The \p could match something in the Latin1 range, hence
+ * something that isn't utf8 */
ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
namedclass = ANYOF_MAX; /* no official name, but it's named */
}
w, w, rangebegin);
if (prevvalue < 256) {
- ANYOF_BITMAP_SET(ret, prevvalue);
- ANYOF_BITMAP_SET(ret, '-');
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
}
else {
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
else {
#ifndef EBCDIC
for (value = 0; value < 128; value++)
- ANYOF_BITMAP_SET(ret, value);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
#else /* EBCDIC */
for (value = 0; value < 256; value++) {
if (isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
}
#endif /* EBCDIC */
}
else {
#ifndef EBCDIC
for (value = 128; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
#else /* EBCDIC */
for (value = 0; value < 256; value++) {
if (!isASCII(value))
- ANYOF_BITMAP_SET(ret, value);
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
}
#endif /* EBCDIC */
}
else {
/* consecutive digits assumed */
for (value = '0'; value <= '9'; value++)
- ANYOF_BITMAP_SET(ret, value);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
}
yesno = '+';
what = POSIX_CC_UNI_NAME("Digit");
else {
/* consecutive digits assumed */
for (value = 0; value < '0'; value++)
- ANYOF_BITMAP_SET(ret, value);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
for (value = '9' + 1; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
w, w, rangebegin);
}
if (!SIZE_ONLY)
- ANYOF_BITMAP_SET(ret, '-');
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
if (isLOWER(prevvalue)) {
for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored++;
- ANYOF_BITMAP_SET(ret, i);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
}
} else {
for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored++;
- ANYOF_BITMAP_SET(ret, i);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
}
}
}
#endif
for (i = prevvalue; i <= ceilvalue; i++) {
if (!ANYOF_BITMAP_TEST(ret,i)) {
- stored++;
- ANYOF_BITMAP_SET(ret, i);
+ stored +=
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
}
}
}
if( stored == 1 && (value < 128 || (value < 256 && !UTF))
&& !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
) {
- /* optimize single char class to an EXACT node
- but *only* when its not a UTF/high char */
+ /* optimize single char class to an EXACT node but *only* when it's not
+ * a UTF/high char. Note that the information needed to decide to do
+ * this optimization is not currently available until the 2nd pass, and
+ * that the actually used EXACT node takes less space than the
+ * calculated ANYOF node, and hence the amount of space calculated in
+ * the first pass is larger than actually used. Currently we don't
+ * keep track of enough information to do this for nodes which contain
+ * matches outside the bitmap */
const char * cur_parse= RExC_parse;
RExC_emit = (regnode *)orig_emit;
RExC_parse = (char *)orig_parse;
switch (OP(scan)) {
case EXACT:
case EXACTF:
+ case EXACTFU:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
sv_catpvs(sv, "{unicode_all}");
else if (flags & ANYOF_UTF8)
sv_catpvs(sv, "{unicode}");
- else if (flags & ANYOF_NONBITMAP)
+ if (flags & ANYOF_NONBITMAP_NON_UTF8)
sv_catpvs(sv, "{outside bitmap}");
{
ones (binary 1111 1111, hexadecimal FF). It is similar, but not
identical, to the ASCII delete (DEL) or rubout control character.
) So the old condition can be simplified to !isPRINT(c) */
- if (!isPRINT(c))
- Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
+ if (!isPRINT(c)) {
+ if (c < 256) {
+ Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
+ }
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
+ }
+ }
else {
const char string = c;
if (c == '-' || c == ']' || c == '\\' || c == '^')