#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character. Note that this is not the same thing as REGNODE_SIMPLE */
+/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
+ * character. (There needs to be a case: in the switch statement in regexec.c
+ * for any node marked SIMPLE.) Note that this is not the same thing as
+ * REGNODE_SIMPLE */
#define SIMPLE 0x02
-#define SPSTART 0x04 /* Starts with * or +. */
+#define SPSTART 0x04 /* Starts with * or + */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
- |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+ |ANYOF_NON_UTF8_LATIN1_ALL;
/* If any portion of the regex is to operate under locale rules,
* initialization includes it. The reason this isn't done for all regexes
* necessary. */
if (RExC_contains_locale) {
ANYOF_CLASS_SETALL(cl); /* /l uses class */
- cl->flags |= ANYOF_LOCALE;
+ cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
+ && !(and_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD)) {
int i;
if (and_with->flags & ANYOF_INVERT)
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
+ && !(or_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD) ) {
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} else { /* 'or_with' is not inverted */
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
+ && (!(or_with->flags & ANYOF_LOC_FOLD)
+ || (cl->flags & ANYOF_LOC_FOLD)) ) {
int i;
/* OR char bitmap and class bitmap separately */
/* Here, the pattern is not UTF-8. Look for the multi-char folds
* that are all ASCII. As in the above case, EXACTFL and EXACTFA
* nodes can't have multi-char folds to this range (and there are
- * no existing ones to the upper latin1 range). In the EXACTF
+ * no existing ones in the upper latin1 range). In the EXACTF
* case we look also for the sharp s, which can be in the final
* position. Otherwise we can stop looking 1 byte earlier because
* have to find at least two characters for a multi-fold */
const U8 s_masked = 's' & S_or_s_mask;
while (s < upper) {
- int len = is_MULTI_CHAR_FOLD_low_safe(s, s_end);
+ int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
{
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && (!(data->start_class->flags & ANYOF_LOC_FOLD)
|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
)
{
if (compat) {
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
- data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
if (OP(scan) == EXACTFL) {
/* XXX This set is probably no longer necessary, and
* probably wrong as LOCALE now is on in the initial
* state */
- data->start_class->flags |= ANYOF_LOCALE;
+ data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
}
}
else if (flags & SCF_DO_STCLASS_OR) {
- if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
+ if (data->start_class->flags & ANYOF_LOC_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
if (uc < 0x100) {
if (in_char_class && has_multiple_chars) {
ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
}
+
RExC_parse = endbrace + 1;
}
else if (! node_p || ! has_multiple_chars) {
dVAR;
UV nextvalue;
- UV prevvalue, save_prevvalue = OOB_UNICODE;
+ UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
- UV value, save_value = 0;
+ UV value = OOB_UNICODE, save_value = OOB_UNICODE;
regnode *ret;
STRLEN numlen;
IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
- bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
-
if (!SIZE_ONLY) {
ANYOF_FLAGS(ret) = 0;
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_parse++;
- if (! RExC_in_multi_char_class) {
- invert = TRUE;
- RExC_naughty++;
-
- /* We have decided to not allow multi-char folds in inverted
- * character classes, due to the confusion that can happen,
- * especially with classes that are designed for a non-Unicode
- * world: You have the peculiar case that:
- "s s" =~ /^[^\xDF]+$/i => Y
- "ss" =~ /^[^\xDF]+$/i => N
- *
- * See [perl #89750] */
- allow_full_fold = FALSE;
- }
+ invert = TRUE;
+ RExC_naughty++;
}
if (SIZE_ONLY) {
SV* scratch_list = NULL;
/* Include all above-Latin1 non-blanks */
- _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
/* Add them to the running total of posix classes */
- _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank, &scratch_list);
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
if (! posixes) {
posixes = scratch_list;
}
/* Get the list of all non-ASCII-blanks in Latin 1, and
* add them to the running total */
- _invlist_subtract(PL_Latin1, PL_PosixBlank, &scratch_list);
+ _invlist_subtract(PL_Latin1, PL_PosixBlank,
+ &scratch_list);
_invlist_union(posixes, scratch_list, &posixes);
SvREFCNT_dec(scratch_list);
}
* For single-valued non-inverted ranges, we consider the possibility
* of multi-char folds. (We made a conscious decision to not do this
* for the other cases because it can often lead to non-intuitive
- * results) */
+ * results. For example, you have the peculiar case that:
+ * "s s" =~ /^[^\xDF]+$/i => Y
+ * "ss" =~ /^[^\xDF]+$/i => N
+ *
+ * See [perl #89750] */
if (FOLD && ! invert && value == prevvalue) {
if (value == LATIN_SMALL_LETTER_SHARP_S
|| (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
if (! RExC_in_multi_char_class) {
AV** this_array_ptr;
AV* this_array;
- STRLEN cp_count = utf8_length(foldbuf, foldbuf + foldlen);
+ STRLEN cp_count = utf8_length(foldbuf,
+ foldbuf + foldlen);
SV* multi_fold = sv_2mortal(newSVpvn("", 0));
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
* ok. This makes the test for the ligature 'ffi' come
* before the test for 'ff' */
if (av_exists(multi_char_matches, cp_count)) {
- this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
this_array = *this_array_ptr;
}
else {
this_array = newAV();
- av_store(multi_char_matches, cp_count, (SV*) this_array);
+ av_store(multi_char_matches, cp_count,
+ (SV*) this_array);
}
av_push(this_array, multi_fold);
}
- /* This element should not be processed further in this class */
+ /* This element should not be processed further in this
+ * class */
element_count--;
value = save_value;
prevvalue = save_prevvalue;
AV** this_array_ptr;
SV* this_sequence;
- this_array_ptr = (AV**) av_fetch(multi_char_matches, cp_count, FALSE);
- while ((this_sequence = av_pop(*this_array_ptr)) != &PL_sv_undef) {
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
+ while ((this_sequence = av_pop(*this_array_ptr)) !=
+ &PL_sv_undef)
+ {
if (! first_time) {
sv_catpv(substitute_parse, "|");
}
ret = reg(pRExC_state, 1, &reg_flags, depth+1);
- *flagp |= reg_flags&(HASWIDTH|SPSTART|POSTPONED);
+ *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
RExC_parse = save_parse;
RExC_end = save_end;
* to force that */
if (! PL_utf8_tofold) {
U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
/* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
assert(PL_utf8_tofold); /* Verify that worked */
}
PL_utf8_foldclosures =
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- UV f;
SV** listp;
if (j < 256) {
* hard-coded for it. First, get its fold. This is the simple
* fold, as the multi-character folds have been handled earlier
* and separated out */
- f = _to_uni_fold_flags(j, foldbuf, &foldlen,
- ((LOC)
- ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
-
- /* Single character fold of above Latin1. Add everything
- * in its fold closure to the list that this node should
- * match */
- /* The fold closures data structure is a hash with the keys
- * being every character that is folded to, like 'k', and
- * the values each an array of everything that folds to its
- * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_len(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non-;
- * /l doesn't allow them between above and below
- * 256 */
- if ((ASCII_FOLD_RESTRICTED
- && (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
- continue;
- }
+ _to_uni_fold_flags(j, foldbuf, &foldlen,
+ ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+
+ /* Single character fold of above Latin1. Add everything in
+ * its fold closure to the list that this node should match.
+ * The fold closures data structure is a hash with the keys
+ * being the UTF-8 of every character that is folded to, like
+ * 'k', and the values each an array of all code points that
+ * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
+ * Multi-character folds are not included */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) foldbuf, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_len(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ if (c_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+ }
+ c = SvUV(*c_p);
- /* Folds involving non-ascii Latin1 characters
- * under /d are added to a separate list */
- if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
- {
- cp_list = add_cp_to_invlist(cp_list, c);
- }
- else {
- depends_list = add_cp_to_invlist(depends_list, c);
- }
- }
- }
+ /* /aa doesn't allow folds between ASCII and non-ASCII;
+ * /l doesn't allow them between above and below 256 */
+ if ((ASCII_FOLD_RESTRICTED
+ && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
+ {
+ continue;
+ }
+
+ /* Folds involving non-ascii Latin1 characters
+ * under /d are added to a separate list */
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
+ cp_list = add_cp_to_invlist(cp_list, c);
+ }
+ else {
+ depends_list = add_cp_to_invlist(depends_list, c);
+ }
+ }
+ }
}
}
SvREFCNT_dec(fold_intersection);
* folded until runtime */
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
- * at compile time. Besides not inverting folded locale now, we can't invert
- * if there are things such as \w, which aren't known until runtime */
+ * at compile time. Besides not inverting folded locale now, we can't
+ * invert if there are things such as \w, which aren't known until
+ * runtime */
if (invert
&& ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
&& ! depends_list
* fetching) */
if (FOLD && LOC)
{
- ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
/* Some character classes are equivalent to other nodes. Such nodes take
if (flags & ANYOF_LOCALE)
sv_catpvs(sv, "{loc}");
- if (flags & ANYOF_LOC_NONBITMAP_FOLD)
+ if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
if (!ret_x)
ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
+ /* This ensures that SvTHINKFIRST(sv) is true, and hence that
+ sv_force_normal(sv) is called. */
+ SvFAKE_on(ret_x);
ret = (struct regexp *)SvANY(ret_x);
(void)ReREFCNT_inc(rx);
by pointing directly at the buffer, but flagging that the allocated
space in the copy is zero. As we've just done a struct copy, it's now
a case of zero-ing that, rather than copying the current length. */
+ if (SvPOKp(ret_x)) SvPV_free(ret_x);
SvPV_set(ret_x, RX_WRAPPED(rx));
SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
memcpy(&(ret->xpv_cur), &(r->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
SvLEN_set(ret_x, 0);
- SvSTASH_set(ret_x, NULL);
- SvMAGIC_set(ret_x, NULL);
if (r->offs) {
const I32 npar = r->nparens+1;
Newx(ret->offs, npar, regexp_paren_pair);