regnode **recurse; /* Recurse regops */
I32 recurse_count; /* Number of recurse regops */
I32 in_lookbehind;
+ I32 contains_locale;
+ I32 override_recoding;
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_recurse (pRExC_state->recurse)
#define RExC_recurse_count (pRExC_state->recurse_count)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
+#define RExC_contains_locale (pRExC_state->contains_locale)
+#define RExC_override_recoding (pRExC_state->override_recoding)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
(int)offset, RExC_precomp, RExC_precomp + offset); \
} STMT_END
+#define ckWARN2regdep(loc,m, a1) STMT_START { \
+ const IV offset = loc - RExC_precomp; \
+ Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
+ m REPORT_LOCATION, \
+ a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
+} STMT_END
+
#define ckWARN2reg(loc, m, a1) STMT_START { \
const IV offset = loc - RExC_precomp; \
Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
{
PERL_ARGS_ASSERT_CL_ANYTHING;
- ANYOF_CLASS_ZERO(cl);
ANYOF_BITMAP_SETALL(cl);
- cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL|ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
- if (LOC)
+ cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
+ |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+
+ /* If any portion of the regex is to operate under locale rules,
+ * initialization includes it. The reason this isn't done for all regexes
+ * is that the optimizer was written under the assumption that locale was
+ * all-or-nothing. Given the complexity and lack of documentation in the
+ * optimizer, and that there are inadequate test cases for locale, so many
+ * parts of it may not work properly, it is safest to avoid locale unless
+ * necessary. */
+ if (RExC_contains_locale) {
+ ANYOF_CLASS_SETALL(cl); /* /l uses class */
cl->flags |= ANYOF_LOCALE;
+ }
+ else {
+ ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
+ }
}
/* Can match anything (initialization) */
Zero(cl, 1, struct regnode_charclass_class);
cl->type = ANYOF;
cl_anything(pRExC_state, cl);
+ ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
}
-STATIC void
-S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
-{
- PERL_ARGS_ASSERT_CL_INIT_ZERO;
+/* These two functions currently do the exact same thing */
+#define cl_init_zero S_cl_init
- Zero(cl, 1, struct regnode_charclass_class);
- cl->type = ANYOF;
- cl_anything(pRExC_state, cl);
- if (LOC)
- cl->flags |= ANYOF_LOCALE;
-}
-
-/* 'And' a given class with another one. Can create false positives */
-/* We assume that cl is not inverted */
+/* 'AND' a given class with another one. Can create false positives. 'cl'
+ * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
+ * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
S_cl_and(struct regnode_charclass_class *cl,
const struct regnode_charclass_class *and_with)
assert(and_with->type == ANYOF);
+ /* I (khw) am not sure all these restrictions are necessary XXX */
if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
cl->bitmap[i] &= and_with->bitmap[i];
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
- if (!(and_with->flags & ANYOF_EOS))
- cl->flags &= ~ANYOF_EOS;
- if (!(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD))
- cl->flags &= ~ANYOF_LOC_NONBITMAP_FOLD;
- if (!(and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
- cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+ if (and_with->flags & ANYOF_INVERT) {
- if (cl->flags & ANYOF_UNICODE_ALL
- && and_with->flags & ANYOF_NONBITMAP
- && !(and_with->flags & ANYOF_INVERT))
- {
- if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+ /* Here, the and'ed node is inverted. Get the AND of the flags that
+ * aren't affected by the inversion. Those that are affected are
+ * handled individually below */
+ U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
+ cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
+ cl->flags |= affected_flags;
+
+ /* We currently don't know how to deal with things that aren't in the
+ * bitmap, but we know that the intersection is no greater than what
+ * is already in cl, so let there be false positives that get sorted
+ * out after the synthetic start class succeeds, and the node is
+ * matched for real. */
+
+ /* The inversion of these two flags indicate that the resulting
+ * intersection doesn't have them */
+ if (and_with->flags & ANYOF_UNICODE_ALL) {
cl->flags &= ~ANYOF_UNICODE_ALL;
}
- cl->flags |= and_with->flags & ANYOF_NONBITMAP; /* field is 2 bits; use
- only the one(s)
- actually set */
- ARG_SET(cl, ARG(and_with));
- }
- if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
- !(and_with->flags & ANYOF_INVERT))
- cl->flags &= ~ANYOF_UNICODE_ALL;
- if (!(and_with->flags & (ANYOF_NONBITMAP|ANYOF_UNICODE_ALL)) &&
- !(and_with->flags & ANYOF_INVERT))
- cl->flags &= ~ANYOF_NONBITMAP;
+ if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
+ cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ else { /* and'd node is not inverted */
+ U8 outside_bitmap_but_not_utf8; /* Temp variable */
+
+ if (! ANYOF_NONBITMAP(and_with)) {
+
+ /* Here 'and_with' doesn't match anything outside the bitmap
+ * (except possibly ANYOF_UNICODE_ALL), which means the
+ * intersection can't either, except for ANYOF_UNICODE_ALL, in
+ * which case we don't know what the intersection is, but it's no
+ * greater than what cl already has, so can just leave it alone,
+ * with possible false positives */
+ if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
+ ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
+ cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
+ }
+ }
+ else if (! ANYOF_NONBITMAP(cl)) {
+
+ /* Here, 'and_with' does match something outside the bitmap, and cl
+ * doesn't have a list of things to match outside the bitmap. If
+ * cl can match all code points above 255, the intersection will
+ * be those above-255 code points that 'and_with' matches. If cl
+ * can't match all Unicode code points, it means that it can't
+ * match anything outside the bitmap (since the 'if' that got us
+ * into this block tested for that), so we leave the bitmap empty.
+ */
+ if (cl->flags & ANYOF_UNICODE_ALL) {
+ ARG_SET(cl, ARG(and_with));
+
+ /* and_with's ARG may match things that don't require UTF8.
+ * And now cl's will too, in spite of this being an 'and'. See
+ * the comments below about the kludge */
+ cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
+ }
+ }
+ else {
+ /* Here, both 'and_with' and cl match something outside the
+ * bitmap. Currently we do not do the intersection, so just match
+ * whatever cl had at the beginning. */
+ }
+
+
+ /* Take the intersection of the two sets of flags. However, the
+ * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
+ * kludge around the fact that this flag is not treated like the others
+ * which are initialized in cl_anything(). The way the optimizer works
+ * is that the synthetic start class (SSC) is initialized to match
+ * anything, and then the first time a real node is encountered, its
+ * values are AND'd with the SSC's with the result being the values of
+ * the real node. However, there are paths through the optimizer where
+ * the AND never gets called, so those initialized bits are set
+ * inappropriately, which is not usually a big deal, as they just cause
+ * false positives in the SSC, which will just mean a probably
+ * imperceptible slow down in execution. However this bit has a
+ * higher false positive consequence in that it can cause utf8.pm,
+ * utf8_heavy.pl ... to be loaded when not necessary, which is a much
+ * bigger slowdown and also causes significant extra memory to be used.
+ * In order to prevent this, the code now takes a different tack. The
+ * bit isn't set unless some part of the regular expression needs it,
+ * but once set it won't get cleared. This means that these extra
+ * modules won't get loaded unless there was some path through the
+ * pattern that would have required them anyway, and so any false
+ * positives that occur by not ANDing them out when they could be
+ * aren't as severe as they would be if we treated this bit like all
+ * the others */
+ outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
+ & ANYOF_NONBITMAP_NON_UTF8;
+ cl->flags &= and_with->flags;
+ cl->flags |= outside_bitmap_but_not_utf8;
+ }
}
-/* 'OR' a given class with another one. Can create false positives */
-/* We assume that cl is not inverted */
+/* 'OR' a given class with another one. Can create false positives. 'cl'
+ * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
+ * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
STATIC void
S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
{
PERL_ARGS_ASSERT_CL_OR;
if (or_with->flags & ANYOF_INVERT) {
+
+ /* Here, the or'd node is to be inverted. This means we take the
+ * complement of everything not in the bitmap, but currently we don't
+ * know what that is, so give up and match anything */
+ if (ANYOF_NONBITMAP(or_with)) {
+ cl_anything(pRExC_state, cl);
+ }
/* We do not use
* (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
* <= (B1 | !B2) | (CL1 | !CL2)
* (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
- if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
+ else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
&& !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
&& !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
int i;
else {
cl_anything(pRExC_state, cl);
}
- } else {
+
+ /* And, we can just take the union of the flags that aren't affected
+ * by the inversion */
+ cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
+
+ /* For the remaining flags:
+ ANYOF_UNICODE_ALL and inverted means to not match anything above
+ 255, which means that the union with cl should just be
+ what cl has in it, so can ignore this flag
+ ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
+ is 127-255 to match them, but then invert that, so the
+ union with cl should just be what cl has in it, so can
+ ignore this flag
+ */
+ } else { /* 'or_with' is not inverted */
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
&& (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
else { /* XXXX: logic is complicated, leave it along for a moment. */
cl_anything(pRExC_state, cl);
}
- }
- if (or_with->flags & ANYOF_EOS)
- cl->flags |= ANYOF_EOS;
- if (!(or_with->flags & ANYOF_NON_UTF8_LATIN1_ALL))
- cl->flags |= ANYOF_NON_UTF8_LATIN1_ALL;
- if (or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- cl->flags |= ANYOF_LOC_NONBITMAP_FOLD;
+ if (ANYOF_NONBITMAP(or_with)) {
+
+ /* Use the added node's outside-the-bit-map match if there isn't a
+ * conflict. If there is a conflict (both nodes match something
+ * outside the bitmap, but what they match outside is not the same
+ * pointer, and hence not easily compared until XXX we extend
+ * inversion lists this far), give up and allow the start class to
+ * match everything outside the bitmap. If that stuff is all above
+ * 255, can just set UNICODE_ALL, otherwise caould be anything. */
+ if (! ANYOF_NONBITMAP(cl)) {
+ ARG_SET(cl, ARG(or_with));
+ }
+ else if (ARG(cl) != ARG(or_with)) {
- /* If both nodes match something outside the bitmap, but what they match
- * outside is not the same pointer, and hence not easily compared, give up
- * and allow the start class to match everything outside the bitmap */
- if (cl->flags & ANYOF_NONBITMAP && or_with->flags & ANYOF_NONBITMAP &&
- ARG(cl) != ARG(or_with)) {
- cl->flags |= ANYOF_UNICODE_ALL;
- }
+ if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
+ cl_anything(pRExC_state, cl);
+ }
+ else {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ }
+ }
+ }
- if (or_with->flags & ANYOF_UNICODE_ALL) {
- cl->flags |= ANYOF_UNICODE_ALL;
+ /* Take the union */
+ cl->flags |= or_with->flags;
}
}
}
} else {
/*
- Currently we do not believe that the trie logic can
- handle case insensitive matching properly when the
- pattern is not unicode (thus forcing unicode semantics).
+ Currently the trie logic handles case insensitive matching properly only
+ when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
+ semantics).
If/when this is fixed the following define can be swapped
in below to fully enable trie logic.
#define TRIE_TYPE_IS_SAFE 1
*/
-#define TRIE_TYPE_IS_SAFE (UTF || optype==EXACT)
+#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
if ( last && TRIE_TYPE_IS_SAFE ) {
make_trie( pRExC_state,
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
if (OP(scan) == EXACTFL) {
+ /* XXX This set is probably no longer necessary, and
+ * probably wrong as LOCALE now is on in the initial
+ * state */
data->start_class->flags |= ANYOF_LOCALE;
}
else {
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
- else if (OP(scan) == ALNUMU) {
+
+ /* Even if under locale, set the bits for non-locale
+ * in case it isn't a true locale-node. This will
+ * create false positives if it truly is locale */
+ if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
- else {
- if (OP(scan) == NALNUMU) {
- for (value = 0; value < 256; value++) {
- if (! isWORDCHAR_L1(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
- } else {
- for (value = 0; value < 256; value++) {
- if (! isALNUM(value)) {
- ANYOF_BITMAP_SET(data->start_class, value);
- }
- }
+
+ /* Even if under locale, set the bits for non-locale in
+ * case it isn't a true locale-node. This will create
+ * false positives if it truly is locale */
+ if (OP(scan) == NALNUMU) {
+ for (value = 0; value < 256; value++) {
+ if (! isWORDCHAR_L1(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
+ }
+ } else {
+ for (value = 0; value < 256; value++) {
+ if (! isALNUM(value)) {
+ ANYOF_BITMAP_SET(data->start_class, value);
+ }
}
}
}
if (data->start_class->flags & ANYOF_LOCALE) {
ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
}
- else if (OP(scan) == SPACEU) {
+ if (OP(scan) == SPACEU) {
for (value = 0; value < 256; value++) {
if (isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
- else if (OP(scan) == NSPACEU) {
+ if (OP(scan) == NSPACEU) {
for (value = 0; value < 256; value++) {
if (!isSPACE_L1(value)) {
ANYOF_BITMAP_SET(data->start_class, value);
break;
case DIGIT:
if (flags & SCF_DO_STCLASS_AND) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
- for (value = 0; value < 256; value++)
- if (!isDIGIT(value))
- ANYOF_BITMAP_CLEAR(data->start_class, value);
+ if (!(data->start_class->flags & ANYOF_LOCALE)) {
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
+ for (value = 0; value < 256; value++)
+ if (!isDIGIT(value))
+ ANYOF_BITMAP_CLEAR(data->start_class, value);
+ }
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
- else {
- for (value = 0; value < 256; value++)
- if (isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ for (value = 0; value < 256; value++)
+ if (isDIGIT(value))
+ ANYOF_BITMAP_SET(data->start_class, value);
}
break;
case NDIGIT:
if (flags & SCF_DO_STCLASS_AND) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
+ if (!(data->start_class->flags & ANYOF_LOCALE))
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
for (value = 0; value < 256; value++)
if (isDIGIT(value))
ANYOF_BITMAP_CLEAR(data->start_class, value);
else {
if (data->start_class->flags & ANYOF_LOCALE)
ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
- else {
- for (value = 0; value < 256; value++)
- if (!isDIGIT(value))
- ANYOF_BITMAP_SET(data->start_class, value);
- }
+ for (value = 0; value < 256; value++)
+ if (!isDIGIT(value))
+ ANYOF_BITMAP_SET(data->start_class, value);
}
break;
CASE_SYNST_FNC(VERTWS);
I32 sawplus = 0;
I32 sawopen = 0;
bool used_setjump = FALSE;
+ regex_charset initial_charset = get_regex_charset(orig_pm_flags);
U8 jump_ret = 0;
dJMPENV;
RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
RExC_uni_semantics = 0;
+ RExC_contains_locale = 0;
/****************** LONG JUMP TARGET HERE***********************/
/* Longjmp back to here if have to switch in midstream to utf8 */
restudied = 0;
#endif
- /* Set to use unicode semantics if the pattern is in utf8 and has the
- * 'depends' charset specified, as it means unicode when utf8 */
pm_flags = orig_pm_flags;
- if (RExC_utf8 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET) {
+ if (initial_charset == REGEX_LOCALE_CHARSET) {
+ RExC_contains_locale = 1;
+ }
+ else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+
+ /* Set to use unicode semantics if the pattern is in utf8 and has the
+ * 'depends' charset specified, as it means unicode when utf8 */
set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
}
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_seen_evals = 0;
RExC_extralen = 0;
+ RExC_override_recoding = 0;
/* First pass: determine size, legality. */
RExC_parse = exp;
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
- /* If the synthetic start class were to ever be used when EOS is set,
- * that bit would have to be cleared, as it is shared with another */
if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
&& stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, 1, "f");
+ data.start_class->flags |= ANYOF_IS_SYNTHETIC;
Newx(RExC_rxi->data->data[n], 1,
struct regnode_charclass_class);
r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
= r->float_substr = r->float_utf8 = NULL;
- /* If the synthetic start class were to ever be used when EOS is set,
- * that bit would have to be cleared, as it is shared with another */
if (!(data.start_class->flags & ANYOF_EOS)
&& !cl_is_anything(data.start_class))
{
const U32 n = add_data(pRExC_state, 1, "f");
+ data.start_class->flags |= ANYOF_IS_SYNTHETIC;
Newx(RExC_rxi->data->data[n], 1,
struct regnode_charclass_class);
* this file. An inversion list is here implemented as a malloc'd C array with
* some added info. More will be coming when functionality is added later.
*
+ * It is currently implemented as an SV pointing to an array of UVs that the SV
+ * thinks are bytes. This allows us to have an array of UV whose memory
+ * management is automatically handled by the existing facilities for SV's.
+ *
* Some of the methods should always be private to the implementation, and some
* should eventually be made public */
#define INVLIST_INITIAL_LEN 10
-#define INVLIST_ARRAY_KEY "array"
-#define INVLIST_MAX_KEY "max"
-#define INVLIST_LEN_KEY "len"
PERL_STATIC_INLINE UV*
-S_invlist_array(pTHX_ HV* const invlist)
+S_invlist_array(pTHX_ SV* const invlist)
{
/* Returns the pointer to the inversion list's array. Every time the
* length changes, this needs to be called in case malloc or realloc moved
* it */
- SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
-
PERL_ARGS_ASSERT_INVLIST_ARRAY;
- if (list_ptr == NULL) {
- Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
- INVLIST_ARRAY_KEY);
- }
-
- return INT2PTR(UV *, SvUV(*list_ptr));
-}
-
-PERL_STATIC_INLINE void
-S_invlist_set_array(pTHX_ HV* const invlist, const UV* const array)
-{
- PERL_ARGS_ASSERT_INVLIST_SET_ARRAY;
-
- /* Sets the array stored in the inversion list to the memory beginning with
- * the parameter */
-
- if (hv_stores(invlist, INVLIST_ARRAY_KEY, newSVuv(PTR2UV(array))) == NULL) {
- Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
- INVLIST_ARRAY_KEY);
- }
+ return (UV *) SvPVX(invlist);
}
PERL_STATIC_INLINE UV
-S_invlist_len(pTHX_ HV* const invlist)
+S_invlist_len(pTHX_ SV* const invlist)
{
/* Returns the current number of elements in the inversion list's array */
- SV** len_ptr = hv_fetchs(invlist, INVLIST_LEN_KEY, FALSE);
-
PERL_ARGS_ASSERT_INVLIST_LEN;
- if (len_ptr == NULL) {
- Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
- INVLIST_LEN_KEY);
- }
-
- return SvUV(*len_ptr);
+ return SvCUR(invlist) / sizeof(UV);
}
PERL_STATIC_INLINE UV
-S_invlist_max(pTHX_ HV* const invlist)
+S_invlist_max(pTHX_ SV* const invlist)
{
/* Returns the maximum number of elements storable in the inversion list's
* array, without having to realloc() */
- SV** max_ptr = hv_fetchs(invlist, INVLIST_MAX_KEY, FALSE);
-
PERL_ARGS_ASSERT_INVLIST_MAX;
- if (max_ptr == NULL) {
- Perl_croak(aTHX_ "panic: inversion list without a '%s' element",
- INVLIST_MAX_KEY);
- }
-
- return SvUV(*max_ptr);
+ return SvLEN(invlist) / sizeof(UV);
}
PERL_STATIC_INLINE void
-S_invlist_set_len(pTHX_ HV* const invlist, const UV len)
+S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
{
/* Sets the current number of elements stored in the inversion list */
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- if (len != 0 && len > invlist_max(invlist)) {
- Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' more than %s=%"UVuf" in inversion list", INVLIST_LEN_KEY, len, INVLIST_MAX_KEY, invlist_max(invlist));
- }
-
- if (hv_stores(invlist, INVLIST_LEN_KEY, newSVuv(len)) == NULL) {
- Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
- INVLIST_LEN_KEY);
- }
+ SvCUR_set(invlist, len * sizeof(UV));
}
PERL_STATIC_INLINE void
-S_invlist_set_max(pTHX_ HV* const invlist, const UV max)
+S_invlist_set_max(pTHX_ SV* const invlist, const UV max)
{
/* Sets the maximum number of elements storable in the inversion list
PERL_ARGS_ASSERT_INVLIST_SET_MAX;
if (max < invlist_len(invlist)) {
- Perl_croak(aTHX_ "panic: Can't make '%s=%"UVuf"' less than %s=%"UVuf" in inversion list", INVLIST_MAX_KEY, invlist_len(invlist), INVLIST_LEN_KEY, invlist_max(invlist));
+ Perl_croak(aTHX_ "panic: Can't make max size '%"UVuf"' less than current length %"UVuf" in inversion list", invlist_max(invlist), invlist_len(invlist));
}
- if (hv_stores(invlist, INVLIST_MAX_KEY, newSVuv(max)) == NULL) {
- Perl_croak(aTHX_ "panic: can't store '%s' entry in inversion list",
- INVLIST_LEN_KEY);
- }
+ SvLEN_set(invlist, max * sizeof(UV));
}
#ifndef PERL_IN_XSUB_RE
-HV*
+SV*
Perl__new_invlist(pTHX_ IV initial_size)
{
* space to store 'initial_size' elements. If that number is negative, a
* system default is used instead */
- HV* invlist = newHV();
- UV* list;
-
if (initial_size < 0) {
initial_size = INVLIST_INITIAL_LEN;
}
/* Allocate the initial space */
- Newx(list, initial_size, UV);
- invlist_set_array(invlist, list);
-
- /* set_len has to come before set_max, as the latter inspects the len */
- invlist_set_len(invlist, 0);
- invlist_set_max(invlist, initial_size);
-
- return invlist;
+ return newSV(initial_size * sizeof(UV));
}
#endif
PERL_STATIC_INLINE void
-S_invlist_destroy(pTHX_ HV* const invlist)
+S_invlist_destroy(pTHX_ SV* const invlist)
{
/* Inversion list destructor */
- SV** list_ptr = hv_fetchs(invlist, INVLIST_ARRAY_KEY, FALSE);
-
PERL_ARGS_ASSERT_INVLIST_DESTROY;
- if (list_ptr != NULL) {
- UV *list = INT2PTR(UV *, SvUV(*list_ptr)); /* PERL_POISON needs lvalue */
- Safefree(list);
- }
+ SvREFCNT_dec(invlist);
}
STATIC void
-S_invlist_extend(pTHX_ HV* const invlist, const UV new_max)
+S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
{
- /* Change the maximum size of an inversion list (up or down) */
-
- UV* orig_array;
- UV* array;
- const UV old_max = invlist_max(invlist);
+ /* Grow the maximum size of an inversion list */
PERL_ARGS_ASSERT_INVLIST_EXTEND;
- if (old_max == new_max) { /* If a no-op */
- return;
- }
-
- array = orig_array = invlist_array(invlist);
- Renew(array, new_max, UV);
-
- /* If the size change moved the list in memory, set the new one */
- if (array != orig_array) {
- invlist_set_array(invlist, array);
- }
-
- invlist_set_max(invlist, new_max);
-
+ SvGROW((SV *)invlist, new_max * sizeof(UV));
}
PERL_STATIC_INLINE void
-S_invlist_trim(pTHX_ HV* const invlist)
+S_invlist_trim(pTHX_ SV* const invlist)
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
/* Change the length of the inversion list to how many entries it currently
* has */
- invlist_extend(invlist, invlist_len(invlist));
+ SvPV_shrink_to_cur((SV *) invlist);
}
/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
* etc */
#define ELEMENT_IN_INVLIST_SET(i) (! ((i) & 1))
+#define PREV_ELEMENT_IN_INVLIST_SET(i) ! ELEMENT_IN_INVLIST_SET(i)
#ifndef PERL_IN_XSUB_RE
void
-Perl__append_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
{
/* Subject to change or removal. Append the range from 'start' to 'end' at
* the end of the inversion list. The range must be above any existing
}
#endif
-PERL_STATIC_INLINE HV*
-S_invlist_union(pTHX_ HV* const a, HV* const b)
+STATIC void
+S_invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
{
- /* Return a new inversion list which is the union of two inversion lists.
+ /* Take the union of two inversion lists and point 'result' to it. If
+ * 'result' on input points to one of the two lists, the reference count to
+ * that list will be decremented.
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
* XXX A potential performance improvement is to keep track as we go along
* if only one of the inputs contributes to the result, meaning the other
* is a subset of that one. In that case, we can skip the final copy and
- * return the larger of the input lists */
+ * return the larger of the input lists, but then outside code might need
+ * to keep track of whether to free the input list or not */
UV* array_a = invlist_array(a); /* a's array */
UV* array_b = invlist_array(b);
UV len_a = invlist_len(a); /* length of a's array */
UV len_b = invlist_len(b);
- HV* u; /* the resulting union */
+ SV* u; /* the resulting union */
UV* array_u;
UV len_u;
/* Here, we are finished going through at least one of the lists, which
* means there is something remaining in at most one. We check if the list
* that hasn't been exhausted is positioned such that we are in the middle
- * of a range in its set or not. (We are in the set if the next item in
- * the array marks the beginning of something not in the set) If in the
- * set, we decrement 'count'; if 0, there is potentially more to output.
+ * of a range in its set or not. (i_a and i_b point to the element beyond
+ * the one we care about.) If in the set, we decrement 'count'; if 0, there
+ * is potentially more to output.
* There are four cases:
* 1) Both weren't in their sets, count is 0, and remains 0. What's left
* in the union is entirely from the non-exhausted set.
* that
* 3) the exhausted was in its set, non-exhausted isn't, count is 1.
* Nothing further should be output because the union includes
- * everything from the exhausted set. Not decrementing insures that.
+ * everything from the exhausted set. Not decrementing ensures that.
* 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
* decrementing to 0 insures that we look at the remainder of the
* non-exhausted set */
- if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
- || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+ if ((i_a != len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+ || (i_b != len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
{
count--;
}
}
}
- return u;
+ /* We may be removing a reference to one of the inputs */
+ if (&a == output || &b == output) {
+ SvREFCNT_dec(*output);
+ }
+
+ *output = u;
+ return;
}
-PERL_STATIC_INLINE HV*
-S_invlist_intersection(pTHX_ HV* const a, HV* const b)
+STATIC void
+S_invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
{
- /* Return the intersection of two inversion lists. The basis for this
- * comes from "Unicode Demystified" Chapter 13 by Richard Gillam, published
- * by Addison-Wesley, and explained at some length there. The preface says
- * to incorporate its examples into your code at your own risk.
+ /* Take the intersection of two inversion lists and point 'i' to it. If
+ * 'i' on input points to one of the two lists, the reference count to that
+ * list will be decremented.
+ * The basis for this comes from "Unicode Demystified" Chapter 13 by
+ * Richard Gillam, published by Addison-Wesley, and explained at some
+ * length there. The preface says to incorporate its examples into your
+ * code at your own risk. In fact, it had bugs
*
* The algorithm is like a merge sort, and is essentially the same as the
* union above
UV len_a = invlist_len(a); /* length of a's array */
UV len_b = invlist_len(b);
- HV* r; /* the resulting intersection */
+ SV* r; /* the resulting intersection */
UV* array_r;
UV len_r;
array */
bool cp_in_set; /* Is it in the input list's set or not */
- /* We need to take one or the other of the two inputs for the union.
- * Since we are merging two sorted lists, we take the smaller of the
- * next items. In case of a tie, we take the one that is not in its
- * set first (a difference from the union algorithm). If we took one
- * in the set first, it would increment the count, possibly to 2 which
- * would cause it to be output as starting a range in the intersection,
- * and the next time through we would take that same number, and output
- * it again as ending the set. By doing it the opposite of this, we
- * there is no possibility that the count will be momentarily
- * incremented to 2. (In a tie and both are in the set or both not in
- * the set, it doesn't matter which we take first.) */
+ /* We need to take one or the other of the two inputs for the
+ * intersection. Since we are merging two sorted lists, we take the
+ * smaller of the next items. In case of a tie, we take the one that
+ * is not in its set first (a difference from the union algorithm). If
+ * we took one in the set first, it would increment the count, possibly
+ * to 2 which would cause it to be output as starting a range in the
+ * intersection, and the next time through we would take that same
+ * number, and output it again as ending the set. By doing it the
+ * opposite of this, there is no possibility that the count will be
+ * momentarily incremented to 2. (In a tie and both are in the set or
+ * both not in the set, it doesn't matter which we take first.) */
if (array_a[i_a] < array_b[i_b]
|| (array_a[i_a] == array_b[i_b] && ! ELEMENT_IN_INVLIST_SET(i_a)))
{
}
}
- /* Here, we are finished going through at least one of the sets, which
- * means there is something remaining in at most one. See the comments in
- * the union code */
- if ((i_a != len_a && ! ELEMENT_IN_INVLIST_SET(i_a))
- || (i_b != len_b && ! ELEMENT_IN_INVLIST_SET(i_b)))
+ /* Here, we are finished going through at least one of the lists, which
+ * means there is something remaining in at most one. We check if the list
+ * that has been exhausted is positioned such that we are in the middle
+ * of a range in its set or not. (i_a and i_b point to elements 1 beyond
+ * the ones we care about.) There are four cases:
+ * 1) Both weren't in their sets, count is 0, and remains 0. There's
+ * nothing left in the intersection.
+ * 2) Both were in their sets, count is 2 and perhaps is incremented to
+ * above 2. What should be output is exactly that which is in the
+ * non-exhausted set, as everything it has is also in the intersection
+ * set, and everything it doesn't have can't be in the intersection
+ * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
+ * gets incremented to 2. Like the previous case, the intersection is
+ * everything that remains in the non-exhausted set.
+ * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
+ * remains 1. And the intersection has nothing more. */
+ if ((i_a == len_a && PREV_ELEMENT_IN_INVLIST_SET(i_a))
+ || (i_b == len_b && PREV_ELEMENT_IN_INVLIST_SET(i_b)))
{
- count--;
+ count++;
}
/* The final length is what we've output so far plus what else is in the
- * intersection. Only one of the subexpressions below will be non-zero */
+ * intersection. At most one of the subexpressions below will be non-zero */
len_r = i_r;
- if (count == 2) {
+ if (count >= 2) {
len_r += (len_a - i_a) + (len_b - i_b);
}
}
/* Finish outputting any remaining */
- if (count == 2) { /* Only one of will have a non-zero copy count */
+ if (count >= 2) { /* At most one will have a non-zero copy count */
IV copy_count;
if ((copy_count = len_a - i_a) > 0) {
Copy(array_a + i_a, array_r + i_r, copy_count, UV);
}
}
- return r;
+ /* We may be removing a reference to one of the inputs */
+ if (&a == i || &b == i) {
+ SvREFCNT_dec(*i);
+ }
+
+ *i = r;
+ return;
}
-STATIC HV*
-S_add_range_to_invlist(pTHX_ HV* const invlist, const UV start, const UV end)
+STATIC SV*
+S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
{
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
- * a new list, in which case the passed in one has been destroyed */
+ * a new list, in which case the passed in one has been destroyed. The
+ * passed in inversion list can be NULL, in which case a new one is created
+ * with just the one range in it */
- HV* range_invlist;
- HV* added_invlist;
+ SV* range_invlist;
+ UV len;
- UV len = invlist_len(invlist);
-
- PERL_ARGS_ASSERT_ADD_RANGE_TO_INVLIST;
+ if (invlist == NULL) {
+ invlist = _new_invlist(2);
+ len = 0;
+ }
+ else {
+ len = invlist_len(invlist);
+ }
/* If comes after the final entry, can just append it to the end */
if (len == 0
range_invlist = _new_invlist(2);
_append_range_to_invlist(range_invlist, start, end);
- added_invlist = invlist_union(invlist, range_invlist);
+ invlist_union(invlist, range_invlist, &invlist);
/* The passed in list can be freed, as well as our temporary */
- invlist_destroy(range_invlist);
- if (invlist != added_invlist) {
- invlist_destroy(invlist);
- }
+ SvREFCNT_dec(range_invlist);
- return added_invlist;
+ return invlist;
}
+PERL_STATIC_INLINE SV*
+S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
+ return add_range_to_invlist(invlist, cp, cp);
+}
+
+#undef INVLIST_INITIAL_LENGTH
+
/* End of inversion list object */
/*
SvIV_set(sv_dat, 1);
}
#ifdef DEBUGGING
+ /* Yes this does cause a memory leak in debugging Perls */
if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
SvREFCNT_dec(svname);
#endif
{
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
- bool has_charset_modifier = 0;
- regex_charset cs = REGEX_DEPENDS_CHARSET;
+ char has_charset_modifier = '\0';
+ regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
+ ? REGEX_UNICODE_CHARSET
+ : REGEX_DEPENDS_CHARSET;
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
switch (*RExC_parse) {
CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
case LOCALE_PAT_MOD:
- if (has_charset_modifier || flagsp == &negflags) {
- goto fail_modifiers;
+ if (has_charset_modifier) {
+ goto excess_modifier;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
}
cs = REGEX_LOCALE_CHARSET;
- has_charset_modifier = 1;
+ has_charset_modifier = LOCALE_PAT_MOD;
+ RExC_contains_locale = 1;
break;
case UNICODE_PAT_MOD:
- if (has_charset_modifier || flagsp == &negflags) {
- goto fail_modifiers;
+ if (has_charset_modifier) {
+ goto excess_modifier;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
}
cs = REGEX_UNICODE_CHARSET;
- has_charset_modifier = 1;
+ has_charset_modifier = UNICODE_PAT_MOD;
break;
case ASCII_RESTRICT_PAT_MOD:
- if (has_charset_modifier || flagsp == &negflags) {
- goto fail_modifiers;
+ if (flagsp == &negflags) {
+ goto neg_modifier;
}
- if (*(RExC_parse + 1) == ASCII_RESTRICT_PAT_MOD) {
+ if (has_charset_modifier) {
+ if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
+ goto excess_modifier;
+ }
/* Doubled modifier implies more restricted */
- cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
- RExC_parse++;
- }
+ cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
+ }
else {
cs = REGEX_ASCII_RESTRICTED_CHARSET;
}
- has_charset_modifier = 1;
+ has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
break;
case DEPENDS_PAT_MOD:
- if (has_use_defaults
- || has_charset_modifier
- || flagsp == &negflags)
- {
+ if (has_use_defaults) {
goto fail_modifiers;
+ }
+ else if (flagsp == &negflags) {
+ goto neg_modifier;
+ }
+ else if (has_charset_modifier) {
+ goto excess_modifier;
}
/* The dual charset means unicode semantics if the
cs = (RExC_utf8 || RExC_uni_semantics)
? REGEX_UNICODE_CHARSET
: REGEX_DEPENDS_CHARSET;
- has_charset_modifier = 1;
+ has_charset_modifier = DEPENDS_PAT_MOD;
break;
+ excess_modifier:
+ RExC_parse++;
+ if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
+ vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
+ }
+ else if (has_charset_modifier == *(RExC_parse - 1)) {
+ vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
+ }
+ else {
+ vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
+ }
+ /*NOTREACHED*/
+ neg_modifier:
+ RExC_parse++;
+ vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
+ /*NOTREACHED*/
case ONCE_PAT_MOD: /* 'o' */
case GLOBAL_PAT_MOD: /* 'g' */
if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
parse_start = RExC_parse; /* MJD */
br = regbranch(pRExC_state, &flags, 1,depth+1);
- if (freeze_paren) {
- if (RExC_npar > after_freeze)
- after_freeze = RExC_npar;
- RExC_npar = freeze_paren;
- }
-
/* branch_len = (paren != 0); */
if (br == NULL)
if (RExC_in_lookbehind) {
RExC_in_lookbehind--;
}
- if (after_freeze)
+ if (after_freeze > RExC_npar)
RExC_npar = after_freeze;
return(ret);
}
const char * const origparse = RExC_parse;
I32 min;
I32 max = REG_INFTY;
+#ifdef RE_TRACK_PATTERN_OFFSETS
char *parse_start;
+#endif
const char *maxpos = NULL;
GET_RE_DEBUG_FLAGS_DECL;
if (op == '{' && regcurly(RExC_parse)) {
maxpos = NULL;
+#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse; /* MJD */
+#endif
next = RExC_parse + 1;
while (isDIGIT(*next) || *next == ',') {
if (*next == ',') {
vFAIL("Regexp *+ operand could be empty");
#endif
+#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse;
+#endif
nextchar(pRExC_state);
*flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
}
-/* reg_namedseq(pRExC_state,UVp)
+/* reg_namedseq(pRExC_state,UVp, UV depth)
This is expected to be called by a parser routine that has
recognized '\N' and needs to handle the rest. RExC_parse is
Parsing failures will generate a fatal error via vFAIL(...)
*/
STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp)
+S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
{
char * endbrace; /* '}' following the name */
regnode *ret = NULL;
-#ifdef DEBUGGING
- char* parse_start = RExC_parse - 2; /* points to the '\N' */
-#endif
char* p;
GET_RE_DEBUG_FLAGS_DECL;
ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
else { /* Not a char class */
- char *s; /* String to put in generated EXACT node */
- STRLEN len = 0; /* Its current byte length */
+
+ /* What is done here is to convert this to a sub-pattern of the form
+ * (?:\x{char1}\x{char2}...)
+ * and then call reg recursively. That way, it retains its atomicness,
+ * while not having to worry about special handling that some code
+ * points may have. toke.c has converted the original Unicode values
+ * to native, so that we can just pass on the hex values unchanged. We
+ * do have to set a flag to keep recoding from happening in the
+ * recursion */
+
+ SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
+ STRLEN len;
char *endchar; /* Points to '.' or '}' ending cur char in the input
stream */
- ret = reg_node(pRExC_state,
- (U8) ((! FOLD) ? EXACT
- : (LOC)
- ? EXACTFL
- : (MORE_ASCII_RESTRICTED)
- ? EXACTFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? EXACTFU
- : EXACTF));
- s= STRING(ret);
-
- /* Exact nodes can hold only a U8 length's of text = 255. Loop through
- * the input which is of the form now 'c1.c2.c3...}' until find the
- * ending brace or exceed length 255. The characters that exceed this
- * limit are dropped. The limit could be relaxed should it become
- * desirable by reparsing this as (?:\N{NAME}), so could generate
- * multiple EXACT nodes, as is done for just regular input. But this
- * is primarily a named character, and not intended to be a huge long
- * string, so 255 bytes should be good enough */
- while (1) {
- STRLEN length_of_hex;
- I32 grok_flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX
- | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
- UV cp; /* Ord of current character */
- bool use_this_char_fold = FOLD;
+ char *orig_end = RExC_end;
+
+ while (RExC_parse < endbrace) {
/* Code points are separated by dots. If none, there is only one
* code point, and is terminated by the brace */
endchar = RExC_parse + strcspn(RExC_parse, ".}");
- /* The values are Unicode even on EBCDIC machines */
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- cp = grok_hex(RExC_parse, &length_of_hex, &grok_flags, NULL);
- if ( length_of_hex == 0
- || length_of_hex != (STRLEN)(endchar - RExC_parse) )
- {
- RExC_parse += length_of_hex; /* Includes all the valid */
- RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
- ? UTF8SKIP(RExC_parse)
- : 1;
- /* Guard against malformed utf8 */
- if (RExC_parse >= endchar) RExC_parse = endchar;
- vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
-
- if (FOLD
- && (cp > 255 || ! MORE_ASCII_RESTRICTED)
- && is_TRICKYFOLD_cp(cp))
- {
- }
-
- /* Under /aa, we can't mix ASCII with non- in a fold. If we are
- * folding, and the source isn't ASCII, look through all the
- * characters it folds to. If any one of them is ASCII, forbid
- * this fold. (cp is uni, so the 127 below is correct even for
- * EBCDIC) */
- if (use_this_char_fold && cp > 127 && MORE_ASCII_RESTRICTED) {
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
- U8* s = tmpbuf;
- U8* e;
- STRLEN foldlen;
-
- (void) toFOLD_uni(cp, tmpbuf, &foldlen);
- e = s + foldlen;
-
- while (s < e) {
- if (isASCII(*s)) {
- use_this_char_fold = FALSE;
- break;
- }
- s += UTF8SKIP(s);
- }
- }
-
- if (! use_this_char_fold) { /* Not folding, just append to the
- string */
- STRLEN unilen;
-
- /* Quit before adding this character if would exceed limit */
- if (len + UNISKIP(cp) > U8_MAX) break;
-
- unilen = reguni(pRExC_state, cp, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- } else { /* Folding, output the folded equivalent */
- STRLEN foldlen,numlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- cp = toFOLD_uni(cp, tmpbuf, &foldlen);
-
- /* Quit before exceeding size limit */
- if (len + foldlen > U8_MAX) break;
-
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen)
- {
- cp = utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, cp, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- }
+ /* Convert to notation the rest of the code understands */
+ sv_catpv(substitute_parse, "\\x{");
+ sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
+ sv_catpv(substitute_parse, "}");
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
-
- /* Quit if no more characters */
- if (RExC_parse >= endbrace) break;
}
+ sv_catpv(substitute_parse, ")");
+ RExC_parse = SvPV(substitute_parse, len);
- if (SIZE_ONLY) {
- if (RExC_parse < endbrace) {
- ckWARNreg(RExC_parse - 1,
- "Using just the first characters returned by \\N{}");
- }
-
- RExC_size += STR_SZ(len);
- } else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
+ /* Don't allow empty number */
+ if (len < 8) {
+ vFAIL("Invalid hexadecimal number in \\N{U+...}");
}
+ RExC_end = RExC_parse + len;
- RExC_parse = endbrace + 1;
+ /* The values are Unicode, and therefore not subject to recoding */
+ RExC_override_recoding = 1;
+
+ ret = reg(pRExC_state, 1, flagp, depth+1);
+
+ RExC_parse = endbrace;
+ RExC_end = orig_end;
+ RExC_override_recoding = 0;
- *flagp |= HASWIDTH; /* Not SIMPLE, as that causes the engine to fail
- with malformed in t/re/pat_advanced.t */
- RExC_parse --;
- Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
}
RExC_parse++;
vFAIL("Quantifier follows nothing");
break;
- case LATIN_SMALL_LETTER_SHARP_S:
- case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
- case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
-#if UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T) != UTF8_TWO_BYTE_HI_nocast(IOTA_D_T)
-#error The beginning utf8 byte of IOTA_D_T and UPSILON_D_T unexpectedly differ. Other instances in this code should have the case statement below.
- case UTF8_TWO_BYTE_HI_nocast(UPSILON_D_T):
-#endif
- do_foldchar:
- if (!LOC && FOLD) {
- U32 len,cp;
- len=0; /* silence a spurious compiler warning */
- if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) {
- *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */
- RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */
- ret = reganode(pRExC_state, FOLDCHAR, cp);
- Set_Node_Length(ret, 1); /* MJD */
- nextchar(pRExC_state); /* kill whitespace under /x */
- return ret;
- }
- }
- goto outer_default;
case '\\':
/* Special Escapes
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
- case LATIN_SMALL_LETTER_SHARP_S:
- case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
- case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
- goto do_foldchar;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
Also this makes sure that things like /\N{BLAH}+/ and
\N{BLAH} being multi char Just Happen. dmq*/
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL, flagp);
+ ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
/* FALL THROUGH */
default:
- outer_default:{
+
+ parse_start = RExC_parse - 1;
+
+ RExC_parse++;
+
+ defchar: {
+ typedef enum {
+ generic_char = 0,
+ char_s,
+ upsilon_1,
+ upsilon_2,
+ iota_1,
+ iota_2,
+ } char_state;
+ char_state latest_char_state = generic_char;
register STRLEN len;
register UV ender;
register char *p;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
regnode * orig_emit;
- parse_start = RExC_parse - 1;
-
- RExC_parse++;
-
- defchar:
ender = 0;
orig_emit = RExC_emit; /* Save the original output node position in
case we need to output a different node
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
switch ((U8)*p) {
- case LATIN_SMALL_LETTER_SHARP_S:
- case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
- case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
- if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
- goto normal_default;
case '^':
case '$':
case '.':
switch ((U8)*++p) {
/* These are all the special escapes. */
- case LATIN_SMALL_LETTER_SHARP_S:
- case UTF8_TWO_BYTE_HI_nocast(LATIN_SMALL_LETTER_SHARP_S):
- case UTF8_TWO_BYTE_HI_nocast(IOTA_D_T):
- if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF))
- goto normal_default;
case 'A': /* Start assertion */
case 'b': case 'B': /* Word-boundary assertion*/
case 'C': /* Single char !DANGEROUS! */
goto recode_encoding;
break;
recode_encoding:
- {
+ if (! RExC_override_recoding) {
SV* enc = PL_encoding;
ender = reg_recode((const char)(U8)ender, &enc);
if (!enc && SIZE_ONLY)
* putting it in a special node keeps regexec from having to
* deal with a non-utf8 multi-char fold */
if (FOLD
- && (ender > 255 || ! MORE_ASCII_RESTRICTED)
- && is_TRICKYFOLD_cp(ender))
+ && (ender > 255 || (! MORE_ASCII_RESTRICTED && ! LOC)))
{
- /* If is in middle of outputting characters into an
- * EXACTish node, go output what we have so far, and
- * position the parse so that this will be called again
- * immediately */
- if (len) {
- p = RExC_parse + len - 1;
- goto loopdone;
- }
- else {
+ /* We look for either side of the fold. For example \xDF
+ * folds to 'ss'. We look for both the single character
+ * \xDF and the sequence 'ss'. When we find something that
+ * could be one of those, we stop and flush whatever we
+ * have output so far into the EXACTish node that was being
+ * built. Then restore the input pointer to what it was.
+ * regatom will return that EXACT node, and will be called
+ * again, positioned so the first character is the one in
+ * question, which we return in a different node type.
+ * The multi-char folds are a sequence, so the occurrence
+ * of the first character in that sequence doesn't
+ * necessarily mean that what follows is the rest of the
+ * sequence. We keep track of that with a state machine,
+ * with the state being set to the latest character
+ * processed before the current one. Most characters will
+ * set the state to 0, but if one occurs that is part of a
+ * potential tricky fold sequence, the state is set to that
+ * character, and the next loop iteration sees if the state
+ * should progress towards the final folded-from character,
+ * or if it was a false alarm. If it turns out to be a
+ * false alarm, the character(s) will be output in a new
+ * EXACTish node, and join_exact() will later combine them.
+ * In the case of the 'ss' sequence, which is more common
+ * and more easily checked, some look-ahead is done to
+ * save time by ruling-out some false alarms */
+ switch (ender) {
+ default:
+ latest_char_state = generic_char;
+ break;
+ case 's':
+ case 'S':
+ case 0x17F: /* LATIN SMALL LETTER LONG S */
+ if (AT_LEAST_UNI_SEMANTICS) {
+ if (latest_char_state == char_s) { /* 'ss' */
+ ender = LATIN_SMALL_LETTER_SHARP_S;
+ goto do_tricky;
+ }
+ else if (p < RExC_end) {
+
+ /* Look-ahead at the next character. If it
+ * is also an s, we handle as a sharp s
+ * tricky regnode. */
+ if (*p == 's' || *p == 'S') {
+
+ /* But first flush anything in the
+ * EXACTish buffer */
+ if (len != 0) {
+ p = oldp;
+ goto loopdone;
+ }
+ p++; /* Account for swallowing this
+ 's' up */
+ ender = LATIN_SMALL_LETTER_SHARP_S;
+ goto do_tricky;
+ }
+ /* Here, the next character is not a
+ * literal 's', but still could
+ * evaluate to one if part of a \o{},
+ * \x or \OCTAL-DIGIT. The minimum
+ * length required for that is 4, eg
+ * \x53 or \123 */
+ else if (*p == '\\'
+ && p < RExC_end - 4
+ && (isDIGIT(*(p + 1))
+ || *(p + 1) == 'x'
+ || *(p + 1) == 'o' ))
+ {
- /* Here we are ready to output our tricky fold
- * character. What's done is to pretend it's in a
- * [bracketed] class, and let the code that deals with
- * those handle it, as that code has all the
- * intelligence necessary. First save the current
- * parse state, get rid of the already allocated EXACT
- * node that the ANYOFV node will replace, and point
- * the parse to a buffer which we fill with the
- * character we want the regclass code to think is
- * being parsed */
- char* const oldregxend = RExC_end;
- char tmpbuf[2];
- RExC_emit = orig_emit;
- RExC_parse = tmpbuf;
- if (UTF) {
- tmpbuf[0] = UTF8_TWO_BYTE_HI(ender);
- tmpbuf[1] = UTF8_TWO_BYTE_LO(ender);
- RExC_end = RExC_parse + 2;
- }
- else {
- tmpbuf[0] = (char) ender;
- RExC_end = RExC_parse + 1;
- }
+ /* Here, it could be an 's', too much
+ * bother to figure it out here. Flush
+ * the buffer if any; when come back
+ * here, set the state so know that the
+ * previous char was an 's' */
+ if (len != 0) {
+ latest_char_state = generic_char;
+ p = oldp;
+ goto loopdone;
+ }
+ latest_char_state = char_s;
+ break;
+ }
+ }
+ }
- ret = regclass(pRExC_state,depth+1);
+ /* Here, can't be an 'ss' sequence, or at least not
+ * one that could fold to/from the sharp ss */
+ latest_char_state = generic_char;
+ break;
+ case 0x03C5: /* First char in upsilon series */
+ case 0x03A5: /* Also capital UPSILON, which folds to
+ 03C5, and hence exhibits the same
+ problem */
+ if (p < RExC_end - 4) { /* Need >= 4 bytes left */
+ latest_char_state = upsilon_1;
+ if (len != 0) {
+ p = oldp;
+ goto loopdone;
+ }
+ }
+ else {
+ latest_char_state = generic_char;
+ }
+ break;
+ case 0x03B9: /* First char in iota series */
+ case 0x0399: /* Also capital IOTA */
+ case 0x1FBE: /* GREEK PROSGEGRAMMENI folds to 3B9 */
+ case 0x0345: /* COMBINING GREEK YPOGEGRAMMENI folds
+ to 3B9 */
+ if (p < RExC_end - 4) {
+ latest_char_state = iota_1;
+ if (len != 0) {
+ p = oldp;
+ goto loopdone;
+ }
+ }
+ else {
+ latest_char_state = generic_char;
+ }
+ break;
+ case 0x0308:
+ if (latest_char_state == upsilon_1) {
+ latest_char_state = upsilon_2;
+ }
+ else if (latest_char_state == iota_1) {
+ latest_char_state = iota_2;
+ }
+ else {
+ latest_char_state = generic_char;
+ }
+ break;
+ case 0x301:
+ if (latest_char_state == upsilon_2) {
+ ender = GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS;
+ goto do_tricky;
+ }
+ else if (latest_char_state == iota_2) {
+ ender = GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS;
+ goto do_tricky;
+ }
+ latest_char_state = generic_char;
+ break;
- /* Here, have parsed the buffer. Reset the parse to
- * the actual input, and return */
- RExC_end = oldregxend;
- RExC_parse = p - 1;
+ /* These are the tricky fold characters. Flush any
+ * buffer first. (When adding to this list, also should
+ * add them to fold_grind.t to make sure get tested) */
+ case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS:
+ case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS:
+ case LATIN_SMALL_LETTER_SHARP_S:
+ case LATIN_CAPITAL_LETTER_SHARP_S:
+ case 0x1FD3: /* GREEK SMALL LETTER IOTA WITH DIALYTIKA AND OXIA */
+ case 0x1FE3: /* GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND OXIA */
+ if (len != 0) {
+ p = oldp;
+ goto loopdone;
+ }
+ /* FALL THROUGH */
+ do_tricky: {
+ char* const oldregxend = RExC_end;
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+
+ /* Here, we know we need to generate a special
+ * regnode, and 'ender' contains the tricky
+ * character. What's done is to pretend it's in a
+ * [bracketed] class, and let the code that deals
+ * with those handle it, as that code has all the
+ * intelligence necessary. First save the current
+ * parse state, get rid of the already allocated
+ * but empty EXACT node that the ANYOFV node will
+ * replace, and point the parse to a buffer which
+ * we fill with the character we want the regclass
+ * code to think is being parsed */
+ RExC_emit = orig_emit;
+ RExC_parse = (char *) tmpbuf;
+ if (UTF) {
+ U8 *d = uvchr_to_utf8(tmpbuf, ender);
+ *d = '\0';
+ RExC_end = (char *) d;
+ }
+ else { /* ender above 255 already excluded */
+ tmpbuf[0] = (U8) ender;
+ tmpbuf[1] = '\0';
+ RExC_end = RExC_parse + 1;
+ }
- Set_Node_Offset(ret, RExC_parse);
- Set_Node_Cur_Length(ret);
- nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
- return ret;
+ ret = regclass(pRExC_state,depth+1);
+
+ /* Here, have parsed the buffer. Reset the parse to
+ * the actual input, and return */
+ RExC_end = oldregxend;
+ RExC_parse = p - 1;
+
+ Set_Node_Offset(ret, RExC_parse);
+ Set_Node_Cur_Length(ret);
+ nextchar(pRExC_state);
+ *flagp |= HASWIDTH|SIMPLE;
+ return ret;
+ }
}
}
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
if (UTF && FOLD) {
- /* Prime the casefolded buffer. */
- if (isASCII(ender)) {
+ /* Prime the casefolded buffer. Locale rules, which apply
+ * only to code points < 256, aren't known until execution,
+ * so for them, just output the original character using
+ * utf8 */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *tmpbuf = (U8) ender;
+ foldlen = 1;
+ } else {
+ *tmpbuf = UTF8_TWO_BYTE_HI(ender);
+ *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else if (isASCII(ender)) { /* Note: Here can't also be LOC
+ */
ender = toLOWER(ender);
*tmpbuf = (U8) ender;
foldlen = 1;
}
- else if (! MORE_ASCII_RESTRICTED) {
+ else if (! MORE_ASCII_RESTRICTED && ! LOC) {
+
+ /* Locale and /aa require more selectivity about the
+ * fold, so are handled below. Otherwise, here, just
+ * use the fold */
ender = toFOLD_uni(ender, tmpbuf, &foldlen);
}
else {
- /* When not to mix ASCII with non-, reject folds that
- * mix them, using only the non-folded code point. So
- * do the fold to a temporary, and inspect each
- * character in it. */
+ /* Under locale rules or /aa we are not to mix,
+ * respectively, ords < 256 or ASCII with non-. So
+ * reject folds that mix them, using only the
+ * non-folded code point. So do the fold to a
+ * temporary, and inspect each character in it. */
U8 trialbuf[UTF8_MAXBYTES_CASE+1];
U8* s = trialbuf;
UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
bool fold_ok = TRUE;
while (s < e) {
- if (isASCII(*s)) {
+ if (isASCII(*s)
+ || (LOC && (UTF8_IS_INVARIANT(*s)
+ || UTF8_IS_DOWNGRADEABLE_START(*s))))
+ {
fold_ok = FALSE;
break;
}
}
len--;
}
- else
+ else {
REGC((char)ender, s++);
+ }
}
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
ANYOF_##NAME: \
for (value = 0; value < 256; value++) \
if (TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
yesno = '+'; \
what = WORD; \
break; \
case ANYOF_N##NAME: \
for (value = 0; value < 256; value++) \
if (!TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
yesno = '!'; \
what = WORD; \
break
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
if (TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
if (TEST_7(UNI_TO_NATIVE(value))) stored += \
set_regclass_bit(pRExC_state, ret, \
- (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
} \
} \
yesno = '+'; \
else if (UNI_SEMANTICS) { \
for (value = 0; value < 256; value++) { \
if (! TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap); \
+ set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
} \
} \
else { \
for (value = 0; value < 128; value++) { \
if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
} \
if (AT_LEAST_ASCII_RESTRICTED) { \
for (value = 128; value < 256; value++) { \
stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
+ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
} \
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8; \
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
} \
else { \
/* For a non-ut8 target string with DEPENDS semantics, all above \
* classes. But in utf8, they have their Unicode semantics, so \
* can't just set them in the bitmap, or else regexec.c will think \
* they matched when they shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL|ANYOF_UTF8; \
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
} \
} \
yesno = '!'; \
what = WORD; \
break
-/*
- We dont use PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS as the direct test
- so that it is possible to override the option here without having to
- rebuild the entire core. as we are required to do if we change regcomp.h
- which is where PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS is defined.
-*/
-#if PERL_LEGACY_UNICODE_CHARCLASS_MAPPINGS
-#define BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#endif
-
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
-#define POSIX_CC_UNI_NAME(CCNAME) CCNAME
-#else
-#define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
-#endif
-
STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
{
/* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
* Locale folding is done at run-time, so this function should not be
* called for nodes that are for locales.
*
- * This function simply sets the bit corresponding to the fold of the input
+ * This function sets the bit corresponding to the fold of the input
* 'value', if not already set. The fold of 'f' is 'F', and the fold of
* 'F' is 'f'.
*
- * It also sets any necessary flags, and returns the number of bits that
- * actually changed from 0 to 1 */
+ * It also knows about the characters that are in the bitmap that have
+ * folds that are matchable only outside it, and sets the appropriate lists
+ * and flags.
+ *
+ * It returns the number of bits that actually changed from 0 to 1 */
U8 stored = 0;
U8 fold;
ANYOF_BITMAP_SET(node, fold);
stored++;
}
- if ((_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED))
- || (! UNI_SEMANTICS
- && ! isASCII(value)
- && PL_fold_latin1[value] != value))
- { /* A character that has a fold outside of Latin1 matches outside the
- bitmap, but only when the target string is utf8. Similarly when we
- don't have unicode semantics for the above ASCII Latin-1 characters,
- and they have a fold, they should match if the target is utf8, and
- not otherwise */
- if (! *nonbitmap_ptr) {
- *nonbitmap_ptr = _new_invlist(2);
+ if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
+ /* Certain Latin1 characters have matches outside the bitmap. To get
+ * here, 'value' is one of those characters. None of these matches is
+ * valid for ASCII characters under /aa, which have been excluded by
+ * the 'if' above. The matches fall into three categories:
+ * 1) They are singly folded-to or -from an above 255 character, as
+ * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
+ * WITH DIAERESIS;
+ * 2) They are part of a multi-char fold with another character in the
+ * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
+ * 3) They are part of a multi-char fold with a character not in the
+ * bitmap, such as various ligatures.
+ * We aren't dealing fully with multi-char folds, except we do deal
+ * with the pattern containing a character that has a multi-char fold
+ * (not so much the inverse).
+ * For types 1) and 3), the matches only happen when the target string
+ * is utf8; that's not true for 2), and we set a flag for it.
+ *
+ * The code below adds to the passed in inversion list the single fold
+ * closures for 'value'. The values are hard-coded here so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out
+ * to disk to find the possible matches. XXX It would be better to
+ * generate these via regen, in case a new version of the Unicode
+ * standard adds new mappings, though that is not really likely. */
+ switch (value) {
+ case 'k':
+ case 'K':
+ /* KELVIN SIGN */
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
+ break;
+ case 's':
+ case 'S':
+ /* LATIN SMALL LETTER LONG S */
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
+ break;
+ case MICRO_SIGN:
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+ GREEK_SMALL_LETTER_MU);
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+ GREEK_CAPITAL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ /* ANGSTROM SIGN */
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
+ if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+ PL_fold_latin1[value]);
+ }
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
+ LATIN_CAPITAL_LETTER_SHARP_S);
+
+ /* Under /a, /d, and /u, this can match the two chars "ss" */
+ if (! MORE_ASCII_RESTRICTED) {
+ add_alternate(alternate_ptr, (U8 *) "ss", 2);
+
+ /* And under /u or /a, it can match even if the target is
+ * not utf8 */
+ if (AT_LEAST_UNI_SEMANTICS) {
+ ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+ }
+ }
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character folds from code
+ * points that require UTF8 to express, so they can't match
+ * unless the target string is in UTF-8, so no action here is
+ * necessary, as regexec.c properly handles the general case
+ * for UTF-8 matching */
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this
+ * being output */
+ ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
+ break;
}
- *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
- ANYOF_FLAGS(node) |= ANYOF_UTF8;
+ }
+ else if (DEPENDS_SEMANTICS
+ && ! isASCII(value)
+ && PL_fold_latin1[value] != value)
+ {
+ /* Under DEPENDS rules, non-ASCII Latin1 characters match their
+ * folds only when the target string is in UTF-8. We add the fold
+ * here to the list of things to match outside the bitmap, which
+ * won't be looked at unless it is UTF8 (or else if something else
+ * says to look even if not utf8, but those things better not happen
+ * under DEPENDS semantics. */
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
}
return stored;
PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
{
/* This inline function sets a bit in the bitmap if not already set, and if
* appropriate, its fold, returning the number of bits that actually
stored = 1;
if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
- stored += set_regclass_bit_fold(pRExC_state, node, value, nonbitmap_ptr);
+ stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
}
return stored;
}
+STATIC void
+S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
+{
+ /* Adds input 'string' with length 'len' to the ANYOF node's unicode
+ * alternate list, pointed to by 'alternate_ptr'. This is an array of
+ * the multi-character folds of characters in the node */
+ SV *sv;
+
+ PERL_ARGS_ASSERT_ADD_ALTERNATE;
+
+ if (! *alternate_ptr) {
+ *alternate_ptr = newAV();
+ }
+ sv = newSVpvn_utf8((char*)string, len, TRUE);
+ av_push(*alternate_ptr, sv);
+ return;
+}
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or perhaps will be optimized into an EXACTish node
- instead. */
+ instead. The node contains a bit map for the first 256 characters, with the
+ corresponding bit set if that character is in the list. For characters
+ above 255, a range list is used */
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
IV namedclass;
char *rangebegin = NULL;
bool need_class = 0;
+ bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
SV *listsv = NULL;
+ STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
+ than just initialized. */
UV n;
- HV* nonbitmap = NULL;
+
+ /* code points this node matches that can't be stored in the bitmap */
+ SV* nonbitmap = NULL;
+
+ /* The items that are to match that aren't stored in the bitmap, but are a
+ * result of things that are stored there. This is the fold closure of
+ * such a character, either because it has DEPENDS semantics and shouldn't
+ * be matched unless the target string is utf8, or is a code point that is
+ * too large for the bit map, as for example, the fold of the MICRO SIGN is
+ * above 255. This all is solely for performance reasons. By having this
+ * code know the outside-the-bitmap folds that the bitmapped characters are
+ * involved with, we don't have to go out to disk to find the list of
+ * matches, unless the character class includes code points that aren't
+ * storable in the bit map. That means that a character class with an 's'
+ * in it, for example, doesn't need to go out to disk to find everything
+ * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
+ * empty unless there is something whose fold we don't know about, and will
+ * have to go out to the disk to find. */
+ SV* l1_fold_invlist = NULL;
+
+ /* List of multi-character folds that are matched by this node */
AV* unicode_alternate = NULL;
#ifdef EBCDIC
UV literal_endpoint = 0;
RExC_parse++;
if (!SIZE_ONLY)
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+
+ /* We have decided to not allow multi-char folds in inverted character
+ * classes, due to the confusion that can happen, especially with
+ * classes that are designed for a non-Unicode world: You have the
+ * peculiar case that:
+ "s s" =~ /^[^\xDF]+$/i => Y
+ "ss" =~ /^[^\xDF]+$/i => N
+ *
+ * See [perl #89750] */
+ allow_full_fold = FALSE;
}
if (SIZE_ONLY) {
RExC_size += ANYOF_SKIP;
-#ifdef ANYOF_ADD_LOC_SKIP
- if (LOC) {
- RExC_size += ANYOF_ADD_LOC_SKIP;
- }
-#endif
listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
}
else {
RExC_emit += ANYOF_SKIP;
if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
-#ifdef ANYOF_ADD_LOC_SKIP
- RExC_emit += ANYOF_ADD_LOC_SKIP;
-#endif
}
ANYOF_BITMAP_ZERO(ret);
listsv = newSVpvs("# comment\n");
+ initial_listsv_len = SvCUR(listsv);
}
nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
from earlier versions, OTOH that behaviour was broken
as well. */
UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL)) {
+ if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
goto parseit;
}
value= v;
e = RExC_parse;
n = 1;
}
- if (SIZE_ONLY) {
- if (LOC) {
- ckWARN2reg(RExC_parse,
- "\\%c uses Unicode rules, not locale rules",
- (int) value);
- }
- }
- else {
+ if (!SIZE_ONLY) {
if (UCHARAT(RExC_parse) == '^') {
RExC_parse++;
n--;
/* The \p could match something in the Latin1 range, hence
* something that isn't utf8 */
- ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP;
+ ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
namedclass = ANYOF_MAX; /* no official name, but it's named */
/* \p means they want Unicode semantics */
break;
}
recode_encoding:
- {
+ if (! RExC_override_recoding) {
SV* enc = PL_encoding;
value = reg_recode((const char)(U8)value, &enc);
if (!enc && SIZE_ONLY)
if (LOC && namedclass < ANYOF_MAX && ! need_class) {
need_class = 1;
if (SIZE_ONLY) {
-#ifdef ANYOF_CLASS_ADD_SKIP
- RExC_size += ANYOF_CLASS_ADD_SKIP;
-#endif
+ RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
}
else {
-#ifdef ANYOF_CLASS_ADD_SKIP
- RExC_emit += ANYOF_CLASS_ADD_SKIP;
-#endif
+ RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
ANYOF_CLASS_ZERO(ret);
}
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
}
/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
- * literal */
+ * literal, as is the character that began the false range, i.e.
+ * the 'a' in the examples */
if (range) {
if (!SIZE_ONLY) {
const int w =
"False [] range \"%*.*s\"",
w, w, rangebegin);
+ stored +=
+ set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
if (prevvalue < 256) {
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &nonbitmap);
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
}
else {
- ANYOF_FLAGS(ret) |= ANYOF_UTF8;
- Perl_sv_catpvf(aTHX_ listsv,
- "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
+ nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
}
}
case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace");
case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct");
case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper");
-#ifdef BROKEN_UNICODE_CHARCLASS_MAPPINGS
/* \s, \w match all unicode if utf8. */
case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl");
case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word");
-#else
- /* \s, \w match ascii and locale only */
- case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "PerlSpace");
- case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "PerlWord");
-#endif
case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit");
case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
else {
for (value = 0; value < 128; value++)
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
}
yesno = '+';
what = NULL; /* Doesn't match outside ascii, so
else {
for (value = 128; value < 256; value++)
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
}
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
yesno = '!';
/* consecutive digits assumed */
for (value = '0'; value <= '9'; value++)
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
}
yesno = '+';
- what = POSIX_CC_UNI_NAME("Digit");
+ what = "Digit";
break;
case ANYOF_NDIGIT:
if (LOC)
/* consecutive digits assumed */
for (value = 0; value < '0'; value++)
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
for (value = '9' + 1; value < 256; value++)
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
}
yesno = '!';
- what = POSIX_CC_UNI_NAME("Digit");
+ what = "Digit";
if (AT_LEAST_ASCII_RESTRICTED ) {
ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
}
if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
/* Strings such as "+utf8::isWord\n" */
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
- ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
continue;
}
else {
prevvalue = value; /* save the beginning of the range */
- if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
- RExC_parse[1] != ']') {
+ if (RExC_parse+1 < RExC_end
+ && *RExC_parse == '-'
+ && RExC_parse[1] != ']')
+ {
RExC_parse++;
/* a bad range like \w-, [:word:]- ? */
}
if (!SIZE_ONLY)
stored +=
- set_regclass_bit(pRExC_state, ret, '-', &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
for (i = prevvalue; i <= ceilvalue; i++)
if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
}
} else {
for (i = prevvalue; i <= ceilvalue; i++)
if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
+ set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
}
}
}
else
#endif
for (i = prevvalue; i <= ceilvalue; i++) {
- stored += set_regclass_bit(pRExC_state, ret, (U8) i, &nonbitmap);
+ stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
}
}
if (value > 255) {
const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
const UV natvalue = NATIVE_TO_UNI(value);
- if (! nonbitmap) {
- nonbitmap = _new_invlist(2);
- }
nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
- ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
-#if 0
-
- /* If the code point requires utf8 to represent, and we are not
- * folding, it can't match unless the target is in utf8. Only
- * a few code points above 255 fold to below it, so XXX an
- * optimization would be to know which ones and set the flag
- * appropriately. */
- ANYOF_FLAGS(ret) |= (FOLD || value < 256)
- ? ANYOF_NONBITMAP
- : ANYOF_UTF8;
- if (prevnatvalue < natvalue) { /* '>' case is fatal error above */
-
- /* The \t sets the whole range */
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- prevnatvalue, natvalue);
-
- /* Currently, we don't look at every value in the range.
- * Therefore we have to assume the worst case: that if
- * folding, it will match more than one character. But in
- * lookbehind patterns, can only be single character
- * length, so disallow those folds */
- if (FOLD && ! RExC_in_lookbehind) {
- OP(ret) = ANYOFV;
- }
- }
- else if (prevnatvalue == natvalue) {
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
- if (FOLD) {
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- STRLEN foldlen;
- const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
-
-#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
- if (RExC_precomp[0] == ':' &&
- RExC_precomp[1] == '[' &&
- (f == 0xDF || f == 0x92)) {
- f = NATIVE_TO_UNI(f);
- }
-#endif
- /* If folding and foldable and a single
- * character, insert also the folded version
- * to the charclass. */
- if (f != value) {
-#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
- if ((RExC_precomp[0] == ':' &&
- RExC_precomp[1] == '[' &&
- (f == 0xA2 &&
- (value == 0xFB05 || value == 0xFB06))) ?
- foldlen == ((STRLEN)UNISKIP(f) - 1) :
- foldlen == (STRLEN)UNISKIP(f) )
-#else
- if (foldlen == (STRLEN)UNISKIP(f))
-#endif
- Perl_sv_catpvf(aTHX_ listsv,
- "%04"UVxf"\n", f);
- else if (! RExC_in_lookbehind) {
- /* Any multicharacter foldings
- * (disallowed in lookbehind patterns)
- * require the following transform:
- * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
- * where E folds into "pq" and F folds
- * into "rst", all other characters
- * fold to single characters. We save
- * away these multicharacter foldings,
- * to be later saved as part of the
- * additional "s" data. */
- SV *sv;
-
- if (!unicode_alternate)
- unicode_alternate = newAV();
- sv = newSVpvn_utf8((char*)foldbuf, foldlen,
- TRUE);
- av_push(unicode_alternate, sv);
- OP(ret) = ANYOFV;
- }
- }
-
- /* If folding and the value is one of the Greek
- * sigmas insert a few more sigmas to make the
- * folding rules of the sigmas to work right.
- * Note that not all the possible combinations
- * are handled here: some of them are handled
- * by the standard folding rules, and some of
- * them (literal or EXACTF cases) are handled
- * during runtime in regexec.c:S_find_byclass(). */
- if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
- }
- else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
- (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
- }
- }
- }
-#endif
#ifdef EBCDIC
literal_endpoint = 0;
#endif
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
- /* Finish up the non-bitmap entries */
- if (nonbitmap) {
- UV* nonbitmap_array;
+ /* If folding and there are code points above 255, we calculate all
+ * characters that could fold to or from the ones already on the list */
+ if (FOLD && nonbitmap) {
UV i;
- /* If folding, we add to the list all characters that could fold to or
- * from the ones already on the list */
- if (FOLD) {
- HV* fold_intersection;
- UV* fold_list;
-
- /* This is a list of all the characters that participate in folds
- * (except marks, etc in multi-char folds */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
- PL_utf8_foldable = _swash_to_invlist(swash);
- }
+ SV* fold_intersection;
+ UV* fold_list;
- /* This is a hash that for a particular fold gives all characters
- * that are involved in it */
- if (! PL_utf8_foldclosures) {
-
- /* If we were unable to find any folds, then we likely won't be
- * able to find the closures. So just create an empty list.
- * Folding will effectively be restricted to the non-Unicode
- * rules hard-coded into Perl. (This case happens legitimately
- * during compilation of Perl itself before the Unicode tables
- * are generated) */
- if (invlist_len(PL_utf8_foldable) == 0) {
- PL_utf8_foldclosures = _new_invlist(0);
- } else {
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
- to_utf8_fold((U8*) "A", dummy, &dummy_len);
- }
- PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+ /* This is a list of all the characters that participate in folds
+ * (except marks, etc in multi-char folds */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _swash_to_invlist(swash);
+ }
+
+ /* This is a hash that for a particular fold gives all characters
+ * that are involved in it */
+ if (! PL_utf8_foldclosures) {
+
+ /* If we were unable to find any folds, then we likely won't be
+ * able to find the closures. So just create an empty list.
+ * Folding will effectively be restricted to the non-Unicode rules
+ * hard-coded into Perl. (This case happens legitimately during
+ * compilation of Perl itself before the Unicode tables are
+ * generated) */
+ if (invlist_len(PL_utf8_foldable) == 0) {
+ PL_utf8_foldclosures = newHV();
+ } else {
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+ to_utf8_fold((U8*) "A", dummy, &dummy_len);
}
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
}
+ }
+
+ /* Only the characters in this class that participate in folds need
+ * be checked. Get the intersection of this class and all the
+ * possible characters that are foldable. This can quickly narrow
+ * down a large class */
+ invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
+
+ /* Now look at the foldable characters in this class individually */
+ fold_list = invlist_array(fold_intersection);
+ for (i = 0; i < invlist_len(fold_intersection); i++) {
+ UV j;
- /* Only the characters in this class that participate in folds need
- * be checked. Get the intersection of this class and all the
- * possible characters that are foldable. This can quickly narrow
- * down a large class */
- fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
-
- /* Now look at the foldable characters in this class individually */
- fold_list = invlist_array(fold_intersection);
- for (i = 0; i < invlist_len(fold_intersection); i++) {
- UV j;
-
- /* The next entry is the beginning of the range that is in the
- * class */
- UV start = fold_list[i++];
-
-
- /* The next entry is the beginning of the next range, which
- * isn't in the class, so the end of the current range is one
- * less than that */
- UV end = fold_list[i] - 1;
-
- /* Look at every character in the range */
- for (j = start; j <= end; j++) {
-
- /* Get its fold */
- U8 foldbuf[UTF8_MAXBYTES_CASE+1];
- STRLEN foldlen;
- const UV f = to_uni_fold(j, foldbuf, &foldlen);
-
- if (foldlen > (STRLEN)UNISKIP(f)) {
-
- /* Any multicharacter foldings (disallowed in
- * lookbehind patterns) require the following
- * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
- * E folds into "pq" and F folds into "rst", all other
- * characters fold to single characters. We save away
- * these multicharacter foldings, to be later saved as
- * part of the additional "s" data. */
- if (! RExC_in_lookbehind) {
- SV *sv;
- U8* loc = foldbuf;
- U8* e = foldbuf + foldlen;
-
- /* If any of the folded characters of this are in
- * the Latin1 range, tell the regex engine that
- * this can match a non-utf8 target string. The
- * multi-byte fold whose source is in the
- * Latin1 range (U+00DF) applies only when the
- * target string is utf8, or under unicode rules */
- if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
- while (loc < e) {
- if (MORE_ASCII_RESTRICTED && (isASCII(*loc) != isASCII(j))) {
+ /* The next entry is the beginning of the range that is in the
+ * class */
+ UV start = fold_list[i++];
+
+
+ /* The next entry is the beginning of the next range, which
+ * isn't in the class, so the end of the current range is one
+ * less than that */
+ UV end = fold_list[i] - 1;
+
+ /* Look at every character in the range */
+ for (j = start; j <= end; j++) {
+
+ /* Get its fold */
+ U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+ STRLEN foldlen;
+ const UV f =
+ _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
+
+ if (foldlen > (STRLEN)UNISKIP(f)) {
+
+ /* Any multicharacter foldings (disallowed in
+ * lookbehind patterns) require the following
+ * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+ * E folds into "pq" and F folds into "rst", all other
+ * characters fold to single characters. We save away
+ * these multicharacter foldings, to be later saved as
+ * part of the additional "s" data. */
+ if (! RExC_in_lookbehind) {
+ U8* loc = foldbuf;
+ U8* e = foldbuf + foldlen;
+
+ /* If any of the folded characters of this are in
+ * the Latin1 range, tell the regex engine that
+ * this can match a non-utf8 target string. The
+ * only multi-byte fold whose source is in the
+ * Latin1 range (U+00DF) applies only when the
+ * target string is utf8, or under unicode rules */
+ if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+ while (loc < e) {
+
+ /* Can't mix ascii with non- under /aa */
+ if (MORE_ASCII_RESTRICTED
+ && (isASCII(*loc) != isASCII(j)))
+ {
+ goto end_multi_fold;
+ }
+ if (UTF8_IS_INVARIANT(*loc)
+ || UTF8_IS_DOWNGRADEABLE_START(*loc))
+ {
+ /* Can't mix above and below 256 under
+ * LOC */
+ if (LOC) {
goto end_multi_fold;
}
- /* XXX Discard this fold if any are latin1
- * and LOC */
- if (UTF8_IS_INVARIANT(*loc)
- || UTF8_IS_DOWNGRADEABLE_START(*loc))
- {
- ANYOF_FLAGS(ret)
- |= ANYOF_NONBITMAP_NON_UTF8;
- break;
- }
- loc += UTF8SKIP(loc);
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NONBITMAP_NON_UTF8;
+ break;
}
+ loc += UTF8SKIP(loc);
}
- ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+ }
- if (!unicode_alternate) {
- unicode_alternate = newAV();
- }
- sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
- av_push(unicode_alternate, sv);
+ add_alternate(&unicode_alternate, foldbuf, foldlen);
+ end_multi_fold: ;
+ }
- /* This node is variable length */
- OP(ret) = ANYOFV;
- end_multi_fold: ;
- }
+ /* This is special-cased, as it is the only letter which
+ * has both a multi-fold and single-fold in Latin1. All
+ * the other chars that have single and multi-folds are
+ * always in utf8, and the utf8 folding algorithm catches
+ * them */
+ if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
+ stored += set_regclass_bit(pRExC_state,
+ ret,
+ LATIN_SMALL_LETTER_SHARP_S,
+ &l1_fold_invlist, &unicode_alternate);
}
- else { /* Single character fold */
- SV** listp;
-
- /* Consider "k" =~ /[K]/i. The line above would have
- * just folded the 'k' to itself, and that isn't going
- * to match 'K'. So we look through the closure of
- * everything that folds to 'k'. That will find the
- * 'K'. Initialize the list, if necessary */
-
- /* The data structure is a hash with the keys every
- * character that is folded to, like 'k', and the
- * values each an array of everything that folds to its
- * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_len(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c = SvUV(*c_p);
- if (MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j))) {
- continue;
- }
+ }
+ else {
+ /* Single character fold. Add everything in its fold
+ * closure to the list that this node should match */
+ SV** listp;
+
+ /* The fold closures data structure is a hash with the
+ * keys being every character that is folded to, like
+ * 'k', and the values each an array of everything that
+ * folds to its key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) foldbuf, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_len(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ if (c_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and
+ * non-; /l doesn't allow them between above
+ * and below 256 */
+ if ((MORE_ASCII_RESTRICTED
+ && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
+ {
+ continue;
+ }
- if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
- stored += set_regclass_bit(pRExC_state, ret, (U8) c, &nonbitmap);
- }
- /* It may be that the code point is already
- * in this range or already in the bitmap,
- * XXX THink about LOC
- * in which case we need do nothing */
- else if ((c < start || c > end)
- && (c > 255
- || ! ANYOF_BITMAP_TEST(ret, c)))
- {
- nonbitmap = add_range_to_invlist(nonbitmap, c, c);
- }
+ if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+ stored += set_regclass_bit(pRExC_state,
+ ret,
+ (U8) c,
+ &l1_fold_invlist, &unicode_alternate);
+ }
+ /* It may be that the code point is already
+ * in this range or already in the bitmap,
+ * in which case we need do nothing */
+ else if ((c < start || c > end)
+ && (c > 255
+ || ! ANYOF_BITMAP_TEST(ret, c)))
+ {
+ nonbitmap = add_cp_to_invlist(nonbitmap, c);
}
}
}
}
}
- invlist_destroy(fold_intersection);
- } /* End of processing all the folds */
-
- /* Here have the full list of items to match that aren't in the
- * bitmap. Convert to the structure that the rest of the code is
- * expecting. XXX That rest of the code should convert to this
- * structure */
- nonbitmap_array = invlist_array(nonbitmap);
- for (i = 0; i < invlist_len(nonbitmap); i++) {
-
- /* The next entry is the beginning of the range that is in the
- * class */
- UV start = nonbitmap_array[i++];
-
- /* The next entry is the beginning of the next range, which isn't
- * in the class, so the end of the current range is one less than
- * that */
- UV end = nonbitmap_array[i] - 1;
+ }
+ SvREFCNT_dec(fold_intersection);
+ }
- if (start == end) {
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
- }
- else {
- /* The \t sets the whole range */
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- /* XXX EBCDIC */
- start, end);
- }
+ /* Combine the two lists into one. */
+ if (l1_fold_invlist) {
+ if (nonbitmap) {
+ invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
+ SvREFCNT_dec(l1_fold_invlist);
+ }
+ else {
+ nonbitmap = l1_fold_invlist;
}
- invlist_destroy(nonbitmap);
}
/* Here, we have calculated what code points should be in the character
* optimize locale. Doing so perhaps could be done as long as there is
* nothing like \w in it; some thought also would have to be given to the
* interaction with above 0x100 chars */
- if (! LOC && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ if (! LOC
+ && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT
+ && ! unicode_alternate
+ && ! nonbitmap
+ && SvCUR(listsv) == initial_listsv_len)
+ {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= 0xFF;
stored = 256 - stored;
/* The inversion means that everything above 255 is matched; and at the
* same time we clear the invert flag */
- ANYOF_FLAGS(ret) = ANYOF_UTF8|ANYOF_UNICODE_ALL;
+ ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
/* Folding in the bitmap is taken care of above, but not for locale (for
* which we have to wait to see what folding is in effect at runtime), and
* for things not in the bitmap. Set run-time fold flag for these */
- if (FOLD && (LOC || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP))) {
+ if (FOLD && (LOC || nonbitmap || unicode_alternate)) {
ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
}
* characters which only have the two folds; so things like 'fF' and 'Ii'
* wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
* FI'. */
- if (! (ANYOF_FLAGS(ret) & (ANYOF_NONBITMAP|ANYOF_INVERT|ANYOF_UNICODE_ALL))
+ if (! nonbitmap
+ && ! unicode_alternate
+ && SvCUR(listsv) == initial_listsv_len
+ && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
&& (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
|| (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
|| (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
return ret;
}
- {
+ if (nonbitmap) {
+ UV* nonbitmap_array = invlist_array(nonbitmap);
+ UV nonbitmap_len = invlist_len(nonbitmap);
+ UV i;
+
+ /* Here have the full list of items to match that aren't in the
+ * bitmap. Convert to the structure that the rest of the code is
+ * expecting. XXX That rest of the code should convert to this
+ * structure */
+ for (i = 0; i < nonbitmap_len; i++) {
+
+ /* The next entry is the beginning of the range that is in the
+ * class */
+ UV start = nonbitmap_array[i++];
+ UV end;
+
+ /* The next entry is the beginning of the next range, which isn't
+ * in the class, so the end of the current range is one less than
+ * that. But if there is no next range, it means that the range
+ * begun by 'start' extends to infinity, which for this platform
+ * ends at UV_MAX */
+ if (i == nonbitmap_len) {
+ end = UV_MAX;
+ }
+ else {
+ end = nonbitmap_array[i] - 1;
+ }
+
+ if (start == end) {
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+ }
+ else {
+ /* The \t sets the whole range */
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ /* XXX EBCDIC */
+ start, end);
+ }
+ }
+ SvREFCNT_dec(nonbitmap);
+ }
+
+ if (SvCUR(listsv) == initial_listsv_len && ! unicode_alternate) {
+ ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
+ SvREFCNT_dec(listsv);
+ SvREFCNT_dec(unicode_alternate);
+ }
+ else {
+
AV * const av = newAV();
SV *rv;
/* The 0th element stores the character class description
* used later (regexec.c:S_reginclass()). */
av_store(av, 0, listsv);
av_store(av, 1, NULL);
- av_store(av, 2, MUTABLE_SV(unicode_alternate));
+
+ /* Store any computed multi-char folds only if we are allowing
+ * them */
+ if (allow_full_fold) {
+ av_store(av, 2, MUTABLE_SV(unicode_alternate));
+ if (unicode_alternate) { /* This node is variable length */
+ OP(ret) = ANYOFV;
+ }
+ }
+ else {
+ av_store(av, 2, NULL);
+ }
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, 1, "s");
RExC_rxi->data->data[n] = (void*)rv;
/* output information about the unicode matching */
if (flags & ANYOF_UNICODE_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (flags & ANYOF_UTF8)
+ else if (ANYOF_NONBITMAP(o))
sv_catpvs(sv, "{unicode}");
if (flags & ANYOF_NONBITMAP_NON_UTF8)
sv_catpvs(sv, "{outside bitmap}");
- {
+ if (ANYOF_NONBITMAP(o)) {
SV *lv;
SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
dVAR;
struct regexp *const r = (struct regexp *)SvANY(rx);
regexp_internal *reti;
- int len, npar;
+ int len;
RXi_GET_DECL(r,ri);
PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
- npar = r->nparens+1;
len = ProgLen(ri);
Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);