PERL_ARGS_ASSERT_CL_AND;
assert(and_with->type == ANYOF);
- if (!(and_with->flags & ANYOF_CLASS)
- && !(cl->flags & ANYOF_CLASS)
+
+ if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
+ && !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
&& !(and_with->flags & ANYOF_FOLD)
&& !(cl->flags & ANYOF_FOLD)) {
/* OR char bitmap and class bitmap separately */
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
cl->bitmap[i] |= or_with->bitmap[i];
- if (or_with->flags & ANYOF_CLASS) {
+ if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
cl->classflags[i] |= or_with->classflags[i];
cl->flags |= ANYOF_CLASS;
goto do_default;
if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
- || ((data->start_class->flags & ANYOF_CLASS)
- && ANYOF_CLASS_TEST_ANY_SET(data->start_class)));
+ || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
cl_anything(pRExC_state, data->start_class);
}
if (flags & SCF_DO_STCLASS_AND || !value)
ANYOF_##NAME: \
for (value = 0; value < 256; value++) \
if (TEST) \
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
yesno = '+'; \
what = WORD; \
break; \
case ANYOF_N##NAME: \
for (value = 0; value < 256; value++) \
if (!TEST) \
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
yesno = '!'; \
what = WORD; \
break
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
if (TEST_8) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
if (TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, UNI_TO_NATIVE(value)); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
} \
} \
yesno = '+'; \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
if (! TEST_8) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
if (! TEST_7) stored += \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
for (value = 128; value < 256; value++) { \
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value); \
} \
} \
yesno = '!'; \
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
+#ifdef ANYOF_ADD_LOC_SKIP
+ if (LOC) {
+ RExC_size += ANYOF_ADD_LOC_SKIP;
+ }
+#endif
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
RExC_emit += ANYOF_SKIP;
- if (LOC)
+ if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
+#ifdef ANYOF_ADD_LOC_SKIP
+ RExC_emit += ANYOF_ADD_LOC_SKIP;
+#endif
+ }
ANYOF_BITMAP_ZERO(ret);
listsv = newSVpvs("# comment\n");
}
if (LOC && namedclass < ANYOF_MAX && ! need_class) {
need_class = 1;
if (SIZE_ONLY) {
+#ifdef ANYOF_CLASS_ADD_SKIP
RExC_size += ANYOF_CLASS_ADD_SKIP;
+#endif
}
else {
+#ifdef ANYOF_CLASS_ADD_SKIP
RExC_emit += ANYOF_CLASS_ADD_SKIP;
+#endif
ANYOF_CLASS_ZERO(ret);
}
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
if (prevvalue < 256) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
stored +=
S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
}
else {
ANYOF_FLAGS(ret) |= ANYOF_UTF8;
Perl_sv_catpvf(aTHX_ listsv,
- "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+ "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
}
}
else {
for (value = 0; value < 128; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
}
yesno = '+';
what = NULL; /* Doesn't match outside ascii, so
else {
for (value = 128; value < 256; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, ASCII_TO_NATIVE(value));
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
}
yesno = '!';
what = "ASCII";
/* consecutive digits assumed */
for (value = '0'; value <= '9'; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
}
yesno = '+';
what = POSIX_CC_UNI_NAME("Digit");
/* consecutive digits assumed */
for (value = 0; value < '0'; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
for (value = '9' + 1; value < 256; value++)
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
}
yesno = '!';
what = POSIX_CC_UNI_NAME("Digit");
for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
}
} else {
for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+ S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
}
}
}
else
#endif
for (i = prevvalue; i <= ceilvalue; i++) {
- stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
+ stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
}
}
if (value > 255 || UTF) {
if (FOLD && (LOC || ANYOF_FLAGS(ret) & ANYOF_NONBITMAP)) {
ANYOF_FLAGS(ret) |= ANYOF_FOLD;
}
- if( stored == 1 && (value < 128 || (value < 256 && !UTF))
- && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
- ) {
- /* optimize single char class to an EXACT node but *only* when its not
- * a UTF/high char. Note that the information needed to decide to do
- * this optimization is not currently available until the 2nd pass, and
- * that the actually used EXACT node takes less space than the
- * calculated ANYOF node, and hence the amount of space calculated in
- * the first pass is larger than actually used. Currently we don't
- * keep track of enough information to do this for nodes which contain
- * matches outside the bitmap */
- const char * cur_parse= RExC_parse;
- RExC_emit = (regnode *)orig_emit;
- RExC_parse = (char *)orig_parse;
- ret = reg_node(pRExC_state,
- (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
- RExC_parse = (char *)cur_parse;
- *STRING(ret)= (char)value;
- STR_LEN(ret)= 1;
- RExC_emit += STR_SZ(1);
- SvREFCNT_dec(listsv);
- return ret;
- }
/* Optimize inverted simple patterns (e.g. [^a-z]). Note that this doesn't
* optimize locale. Doing so perhaps could be done as long as there is
* interaction with above 0x100 chars */
if ((ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
- ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
+ ANYOF_BITMAP(ret)[value] ^= 0xFF;
+ stored = 256 - stored;
+
+ /* The inversion means that everything above 255 is matched */
+ ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+ }
+
+ /* A single character class can be "optimized" into an EXACTish node.
+ * Note that since we don't currently count how many characters there are
+ * outside the bitmap, we are XXX missing optimization possibilities for
+ * them. This optimization can't happen unless this is a truly single
+ * character class, which means that it can't be an inversion into a
+ * many-character class, and there must be no possibility of there being
+ * things outside the bitmap. For locales (only), 'stored' doesn't include
+ * \w, etc., so a special test has to be made that they aren't present
+ *
+ * Similarly, a 2-character class of the very special form like [bB] can be
+ * optimized into an EXACTFish node, but only for non-locales, and for
+ * characters which only have the two folds; so things like 'fF' and 'Ii'
+ * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
+ * FI'. */
+ if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+ && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+ || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
+ || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
+ && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
+ /* If the latest code point has a fold whose
+ * bit is set, it must be the only other one */
+ && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
+ && ANYOF_BITMAP_TEST(ret, prevvalue)))))
+ {
+ /* Note that the information needed to decide to do this optimization
+ * is not currently available until the 2nd pass, and that the actually
+ * used EXACTish node takes less space than the calculated ANYOF node,
+ * and hence the amount of space calculated in the first pass is larger
+ * than actually used, so this optimization doesn't gain us any space.
+ * But an EXACT node is faster than an ANYOF node, and can be combined
+ * with any adjacent EXACT nodes later by the optimizer for further
+ * gains. The speed of executing an EXACTF is similar to an ANYOF
+ * node, so the optimization advantage comes from the ability to join
+ * it to adjacent EXACT nodes */
+
+ const char * cur_parse= RExC_parse;
+ U8 op;
+ RExC_emit = (regnode *)orig_emit;
+ RExC_parse = (char *)orig_parse;
+
+ if (stored == 1) {
+
+ /* A locale node with one point can be folded; all the other cases
+ * with folding will have two points, since we calculate them above
+ */
+ if (ANYOF_FLAGS(ret) & ANYOF_FOLD) {
+ op = EXACTFL;
+ }
+ else {
+ op = EXACT;
+ }
+ } /* else 2 chars in the bit map: the folds of each other */
+ else if (UNI_SEMANTICS || !isASCII(value)) {
+
+ /* To join adjacent nodes, they must be the exact EXACTish type.
+ * Try to use the most likely type, by using EXACTFU if the regex
+ * calls for it, or if it is required because the character is
+ * non-ASCII */
+ op = EXACTFU;
+ }
+ else { /* Otherwise, more likely to be EXACTF type */
+ op = EXACTF;
+ }
+
+ ret = reg_node(pRExC_state, op);
+ RExC_parse = (char *)cur_parse;
+ if (UTF && ! NATIVE_IS_INVARIANT(value)) {
+ *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
+ *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
+ STR_LEN(ret)= 2;
+ RExC_emit += STR_SZ(2);
+ }
+ else {
+ *STRING(ret)= (char)value;
+ STR_LEN(ret)= 1;
+ RExC_emit += STR_SZ(1);
+ }
+ SvREFCNT_dec(listsv);
+ return ret;
}
{
}
EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
- /* output any special charclass tests (used mostly under use locale) */
- if (o->flags & ANYOF_CLASS && ANYOF_CLASS_TEST_ANY_SET(o))
+ /* output any special charclass tests (used entirely under use locale) */
+ if (ANYOF_CLASS_TEST_ANY_SET(o))
for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
if (ANYOF_CLASS_TEST(o,i)) {
sv_catpv(sv, anyofs[i]);