#define REG_COMP_C
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
+extern const struct regexp_engine my_reg_engine;
#else
# include "regcomp.h"
#endif
#include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-# include "charclass_invlists.h"
+#include "charclass_invlists.h"
+#include "inline_invlist.c"
+#include "unicode_constants.h"
+
+#ifdef HAS_ISBLANK
+# define hasISBLANK 1
+#else
+# define hasISBLANK 0
#endif
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
+#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#ifdef op
#undef op
I32 in_lookbehind;
I32 contains_locale;
I32 override_recoding;
+ I32 in_multi_char_class;
struct reg_code_block *code_blocks; /* positions of literal (?{})
within pattern */
int num_code_blocks; /* size of code_blocks[] */
#define RExC_recurse_count (pRExC_state->recurse_count)
#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
#define RExC_contains_locale (pRExC_state->contains_locale)
-#define RExC_override_recoding (pRExC_state->override_recoding)
+#define RExC_override_recoding (pRExC_state->override_recoding)
+#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
- * character, and if utf8, must be invariant. Note that this is not the same
- * thing as REGNODE_SIMPLE */
+/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
+ * character. (There needs to be a case: in the switch statement in regexec.c
+ * for any node marked SIMPLE.) Note that this is not the same thing as
+ * REGNODE_SIMPLE */
#define SIMPLE 0x02
-#define SPSTART 0x04 /* Starts with * or +. */
+#define SPSTART 0x04 /* Starts with * or + */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
string can occur infinitely far to the right.
- minlenp
- A pointer to the minimum length of the pattern that the string
- was found inside. This is important as in the case of positive
+ A pointer to the minimum number of characters of the pattern that the
+ string was found inside. This is important as in the case of positive
lookahead or positive lookbehind we can have multiple patterns
involved. Consider
ANYOF_BITMAP_SETALL(cl);
cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
- |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
+ |ANYOF_NON_UTF8_LATIN1_ALL;
/* If any portion of the regex is to operate under locale rules,
* initialization includes it. The reason this isn't done for all regexes
* necessary. */
if (RExC_contains_locale) {
ANYOF_CLASS_SETALL(cl); /* /l uses class */
- cl->flags |= ANYOF_LOCALE;
+ cl->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
&& !(ANYOF_CLASS_TEST_ANY_SET(cl))
&& (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
+ && !(and_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD)) {
int i;
if (and_with->flags & ANYOF_INVERT)
* (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
*/
else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
+ && !(or_with->flags & ANYOF_LOC_FOLD)
+ && !(cl->flags & ANYOF_LOC_FOLD) ) {
int i;
for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
} else { /* 'or_with' is not inverted */
/* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
- && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
- || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
+ && (!(or_with->flags & ANYOF_LOC_FOLD)
+ || (cl->flags & ANYOF_LOC_FOLD)) ) {
int i;
/* OR char bitmap and class bitmap separately */
/* The below joins as many adjacent EXACTish nodes as possible into a single
- * one, and looks for problematic sequences of characters whose folds vs.
- * non-folds have sufficiently different lengths, that the optimizer would be
- * fooled into rejecting legitimate matches of them, and the trie construction
- * code can't cope with them. The joining is only done if:
+ * one. The regop may be changed if the node(s) contain certain sequences that
+ * require special handling. The joining is only done if:
* 1) there is room in the current conglomerated node to entirely contain the
* next one.
* 2) they are the exact same node type
*
- * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
*
- * If there are problematic code sequences, *min_subtract is set to the delta
- * that the minimum size of the node can be less than its actual size. And,
- * the node type of the result is changed to reflect that it contains these
- * sequences.
+ * If a node is to match under /i (folded), the number of characters it matches
+ * can be different than its character length if it contains a multi-character
+ * fold. *min_subtract is set to the total delta of the input nodes.
*
* And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
* and contains LATIN SMALL LETTER SHARP S
*
* This is as good a place as any to discuss the design of handling these
- * problematic sequences. It's been wrong in Perl for a very long time. There
- * are three code points in Unicode whose folded lengths differ so much from
- * the un-folded lengths that it causes problems for the optimizer and trie
- * construction. Why only these are problematic, and not others where lengths
- * also differ is something I (khw) do not understand. New versions of Unicode
- * might add more such code points. Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by verifying that each size-combination
- * gets tested) will catch any that do come along, so they can be added to the
- * special handling below. The chances of new ones are actually rather small,
- * as most, if not all, of the world's scripts that have casefolding have
- * already been encoded by Unicode. Also, a number of Unicode's decisions were
- * made to allow compatibility with pre-existing standards, and almost all of
- * those have already been dealt with. These would otherwise be the most
- * likely candidates for generating further tricky sequences. In other words,
- * Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards, and there aren't many of those left.
- *
- * The previous designs for dealing with these involved assigning a special
- * node for them. This approach doesn't work, as evidenced by this example:
+ * multi-character fold sequences. It's been wrong in Perl for a very long
+ * time. There are three code points in Unicode whose multi-character folds
+ * were long ago discovered to mess things up. The previous designs for
+ * dealing with these involved assigning a special node for them. This
+ * approach doesn't work, as evidenced by this example:
* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
- * Both these fold to "sss", but if the pattern is parsed to create a node of
- * that would match just the \xDF, it won't be able to handle the case where a
+ * Both these fold to "sss", but if the pattern is parsed to create a node that
+ * would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
* that hopefully generally solves the problem generates an EXACTFU_SS node
* that is "sss".
*
- * There are a number of components to the approach (a lot of work for just
- * three code points!):
- * 1) This routine examines each EXACTFish node that could contain the
- * problematic sequences. It returns in *min_subtract how much to
+ * It turns out that there are problems with all multi-character folds, and not
+ * just these three. Now the code is general, for all such cases, but the
+ * three still have some special handling. The approach taken is:
+ * 1) This routine examines each EXACTFish node that could contain multi-
+ * character fold sequences. It returns in *min_subtract how much to
* subtract from the the actual length of the string to get a real minimum
- * for one that could match it. This number is usually 0 except for the
- * problematic sequences. This delta is used by the caller to adjust the
- * min length of the match, and the delta between min and max, so that the
- * optimizer doesn't reject these possibilities based on size constraints.
- * 2) These sequences require special handling by the trie code, so it
- * changes the joined node type to ops for the trie's benefit, those new
- * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
- * 3) This is sufficient for the two Greek sequences (described below), but
- * the one involving the Sharp s (\xDF) needs more. The node type
- * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
- * sequence in it. For non-UTF-8 patterns and strings, this is the only
- * case where there is a possible fold length change. That means that a
- * regular EXACTFU node without UTF-8 involvement doesn't have to concern
- * itself with length changes, and so can be processed faster. regexec.c
- * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
- * is pre-folded by regcomp.c. This saves effort in regex matching.
- * However, probably mostly for historical reasons, the pre-folding isn't
- * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
- * nodes, as what they fold to isn't known until runtime.) The fold
+ * match length; it is 0 if there are no multi-char folds. This delta is
+ * used by the caller to adjust the min length of the match, and the delta
+ * between min and max, so that the optimizer doesn't reject these
+ * possibilities based on size constraints.
+ * 2) Certain of these sequences require special handling by the trie code,
+ * so, if found, this code changes the joined node type to special ops:
+ * EXACTFU_TRICKYFOLD and EXACTFU_SS.
+ * 3) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
+ * is used for an EXACTFU node that contains at least one "ss" sequence in
+ * it. For non-UTF-8 patterns and strings, this is the only case where
+ * there is a possible fold length change. That means that a regular
+ * EXACTFU node without UTF-8 involvement doesn't have to concern itself
+ * with length changes, and so can be processed faster. regexec.c takes
+ * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
+ * pre-folded by regcomp.c. This saves effort in regex matching.
+ * However, the pre-folding isn't done for non-UTF8 patterns because the
+ * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
+ * down by forcing the pattern into UTF8 unless necessary. Also what
+ * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
* possibilities for the non-UTF8 patterns are quite simple, except for
- * the sharp s. All the ones that don't involve a UTF-8 target string
- * are members of a fold-pair, and arrays are set up for all of them
- * that quickly find the other member of the pair. It might actually
- * be faster to pre-fold these, but it isn't currently done, except for
- * the sharp s. Code elsewhere in this file makes sure that it gets
- * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
- * issues described in the next item.
+ * the sharp s. All the ones that don't involve a UTF-8 target string are
+ * members of a fold-pair, and arrays are set up for all of them so that
+ * the other member of the pair can be found quickly. Code elsewhere in
+ * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
+ * 'ss', even if the pattern isn't UTF-8. This avoids the issues
+ * described in the next item.
* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does as
+ * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
* described in item 3). An assumption that the optimizer part of
* regexec.c (probably unwittingly) makes is that a character in the
* pattern corresponds to at most a single character in the target string.
const unsigned int oldl = STR_LEN(scan);
regnode * const nnext = regnext(n);
+ /* XXX I (khw) kind of doubt that this works on platforms where
+ * U8_MAX is above 255 because of lots of other assumptions */
if (oldl + STR_LEN(n) > U8_MAX)
break;
* hence missed). The sequences only happen in folding, hence for any
* non-EXACT EXACTish node */
if (OP(scan) != EXACT) {
- U8 *s;
- U8 * s0 = (U8*) STRING(scan);
- U8 * const s_end = s0 + STR_LEN(scan);
-
- /* The below is perhaps overboard, but this allows us to save a test
- * each time through the loop at the expense of a mask. This is
- * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
- * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
- * This uses an exclusive 'or' to find that bit and then inverts it to
- * form a mask, with just a single 0, in the bit position where 'S' and
- * 's' differ. */
- const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
- const U8 s_masked = 's' & S_or_s_mask;
+ const U8 * const s0 = (U8*) STRING(scan);
+ const U8 * s = s0;
+ const U8 * const s_end = s0 + STR_LEN(scan);
/* One pass is made over the node's string looking for all the
* possibilities. to avoid some tests in the loop, there are two main
* non-UTF-8 */
if (UTF) {
- /* There are two problematic Greek code points in Unicode
- * casefolding
- *
- * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
- * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
- *
- * which casefold to
- *
- * Unicode UTF-8
- *
- * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
- * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
- *
- * This means that in case-insensitive matching (or "loose
- * matching", as Unicode calls it), an EXACTF of length six (the
- * UTF-8 encoded byte length of the above casefolded versions) can
- * match a target string of length two (the byte length of UTF-8
- * encoded U+0390 or U+03B0). This would rather mess up the
- * minimum length computation. (there are other code points that
- * also fold to these two sequences, but the delta is smaller)
- *
- * If these sequences are found, the minimum length is decreased by
- * four (six minus two).
- *
- * Similarly, 'ss' may match the single char and byte LATIN SMALL
- * LETTER SHARP S. We decrease the min length by 1 for each
- * occurrence of 'ss' found */
-
-#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-# define U390_first_byte 0xb4
- const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
-# define U3B0_first_byte 0xb5
- const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
-#else
-# define U390_first_byte 0xce
- const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
-# define U3B0_first_byte 0xcf
- const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
-#endif
- const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
- yields a net of 0 */
- /* Examine the string for one of the problematic sequences */
- for (s = s0;
- s < s_end - 1; /* Can stop 1 before the end, as minimum length
- * sequence we are looking for is 2 */
- s += UTF8SKIP(s))
+ /* Examine the string for a multi-character fold sequence. UTF-8
+ * patterns have all characters pre-folded by the time this code is
+ * executed */
+ while (s < s_end - 1) /* Can stop 1 before the end, as minimum
+ length sequence we are looking for is 2 */
{
+ int count = 0;
+ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
+ if (! len) { /* Not a multi-char fold: get next char */
+ s += UTF8SKIP(s);
+ continue;
+ }
- /* Look for the first byte in each problematic sequence */
- switch (*s) {
- /* We don't have to worry about other things that fold to
- * 's' (such as the long s, U+017F), as all above-latin1
- * code points have been pre-folded */
- case 's':
- case 'S':
-
- /* Current character is an 's' or 'S'. If next one is
- * as well, we have the dreaded sequence */
- if (((*(s+1) & S_or_s_mask) == s_masked)
- /* These two node types don't have special handling
- * for 'ss' */
- && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
- {
- *min_subtract += 1;
- OP(scan) = EXACTFU_SS;
- s++; /* No need to look at this character again */
- }
- break;
-
- case U390_first_byte:
- if (s_end - s >= len
-
- /* The 1's are because are skipping comparing the
- * first byte */
- && memEQ(s + 1, U390_tail, len - 1))
- {
- goto greek_sequence;
- }
- break;
+ /* Nodes with 'ss' require special handling, except for EXACTFL
+ * and EXACTFA for which there is no multi-char fold to this */
+ if (len == 2 && *s == 's' && *(s+1) == 's'
+ && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ {
+ count = 2;
+ OP(scan) = EXACTFU_SS;
+ s += 2;
+ }
+ else if (len == 6 /* len is the same in both ASCII and EBCDIC for these */
+ && (memEQ(s, GREEK_SMALL_LETTER_IOTA_UTF8
+ COMBINING_DIAERESIS_UTF8
+ COMBINING_ACUTE_ACCENT_UTF8,
+ 6)
+ || memEQ(s, GREEK_SMALL_LETTER_UPSILON_UTF8
+ COMBINING_DIAERESIS_UTF8
+ COMBINING_ACUTE_ACCENT_UTF8,
+ 6)))
+ {
+ count = 3;
+
+ /* These two folds require special handling by trie's, so
+ * change the node type to indicate this. If EXACTFA and
+ * EXACTFL were ever to be handled by trie's, this would
+ * have to be changed. If this node has already been
+ * changed to EXACTFU_SS in this loop, leave it as is. (I
+ * (khw) think it doesn't matter in regexec.c for UTF
+ * patterns, but no need to change it */
+ if (OP(scan) == EXACTFU) {
+ OP(scan) = EXACTFU_TRICKYFOLD;
+ }
+ s += 6;
+ }
+ else { /* Here is a generic multi-char fold. */
+ const U8* multi_end = s + len;
+
+ /* Count how many characters in it. In the case of /l and
+ * /aa, no folds which contain ASCII code points are
+ * allowed, so check for those, and skip if found. (In
+ * EXACTFL, no folds are allowed to any Latin1 code point,
+ * not just ASCII. But there aren't any of these
+ * currently, nor ever likely, so don't take the time to
+ * test for them. The code that generates the
+ * is_MULTI_foo() macros croaks should one actually get put
+ * into Unicode .) */
+ if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
+ count = utf8_length(s, multi_end);
+ s = multi_end;
+ }
+ else {
+ while (s < multi_end) {
+ if (isASCII(*s)) {
+ s++;
+ goto next_iteration;
+ }
+ else {
+ s += UTF8SKIP(s);
+ }
+ count++;
+ }
+ }
+ }
- case U3B0_first_byte:
- if (! (s_end - s >= len
- && memEQ(s + 1, U3B0_tail, len - 1)))
- {
- break;
- }
- greek_sequence:
- *min_subtract += 4;
-
- /* This can't currently be handled by trie's, so change
- * the node type to indicate this. If EXACTFA and
- * EXACTFL were ever to be handled by trie's, this
- * would have to be changed. If this node has already
- * been changed to EXACTFU_SS in this loop, leave it as
- * is. (I (khw) think it doesn't matter in regexec.c
- * for UTF patterns, but no need to change it */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_TRICKYFOLD;
- }
- s += 6; /* We already know what this sequence is. Skip
- the rest of it */
- break;
- }
+ /* The delta is how long the sequence is minus 1 (1 is how long
+ * the character that folds to the sequence is) */
+ *min_subtract += count - 1;
+ next_iteration: ;
}
}
else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
- /* Here, the pattern is not UTF-8. We need to look only for the
- * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
- * in the final position. Otherwise we can stop looking 1 byte
- * earlier because have to find both the first and second 's' */
+ /* Here, the pattern is not UTF-8. Look for the multi-char folds
+ * that are all ASCII. As in the above case, EXACTFL and EXACTFA
+ * nodes can't have multi-char folds to this range (and there are
+ * no existing ones in the upper latin1 range). In the EXACTF
+ * case we look also for the sharp s, which can be in the final
+ * position. Otherwise we can stop looking 1 byte earlier because
+ * have to find at least two characters for a multi-fold */
const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
- for (s = s0; s < upper; s++) {
- switch (*s) {
- case 'S':
- case 's':
- if (s_end - s > 1
- && ((*(s+1) & S_or_s_mask) == s_masked))
- {
- *min_subtract += 1;
-
- /* EXACTF nodes need to know that the minimum
- * length changed so that a sharp s in the string
- * can match this ss in the pattern, but they
- * remain EXACTF nodes, as they are not trie'able,
- * so don't have to invent a new node type to
- * exclude them from the trie code */
- if (OP(scan) != EXACTF) {
- OP(scan) = EXACTFU_SS;
- }
- s++;
- }
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- if (OP(scan) == EXACTF) {
- *has_exactf_sharp_s = TRUE;
- }
- break;
+ /* The below is perhaps overboard, but this allows us to save a
+ * test each time through the loop at the expense of a mask. This
+ * is because on both EBCDIC and ASCII machines, 'S' and 's' differ
+ * by a single bit. On ASCII they are 32 apart; on EBCDIC, they
+ * are 64. This uses an exclusive 'or' to find that bit and then
+ * inverts it to form a mask, with just a single 0, in the bit
+ * position where 'S' and 's' differ. */
+ const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
+ const U8 s_masked = 's' & S_or_s_mask;
+
+ while (s < upper) {
+ int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
+ if (! len) { /* Not a multi-char fold. */
+ if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
+ {
+ *has_exactf_sharp_s = TRUE;
+ }
+ s++;
+ continue;
+ }
+
+ if (len == 2
+ && ((*s & S_or_s_mask) == s_masked)
+ && ((*(s+1) & S_or_s_mask) == s_masked))
+ {
+
+ /* EXACTF nodes need to know that the minimum length
+ * changed so that a sharp s in the string can match this
+ * ss in the pattern, but they remain EXACTF nodes, as they
+ * won't match this unless the target string is is UTF-8,
+ * which we don't know until runtime */
+ if (OP(scan) != EXACTF) {
+ OP(scan) = EXACTFU_SS;
+ }
}
+
+ *min_subtract += len - 1;
+ s += len;
}
}
}
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
dVAR;
- I32 min = 0, pars = 0, code;
+ I32 min = 0; /* There must be at least this number of characters to match */
+ I32 pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- UV min_subtract = 0; /* How much to subtract from the minimum node
- length to get a real minimum (because the
- folded version may be shorter) */
+ UV min_subtract = 0; /* How mmany chars to subtract from the minimum
+ node length to get a real minimum (because
+ the folded version may be shorter) */
bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
* trietype so we can turn them into a trie. If/when we
* allow NOTHING to start a trie sequence this condition will be
* required, and it isn't expensive so we leave it in for now. */
- if ( trietype != NOTHING )
+ if ( trietype && trietype != NOTHING )
make_trie( pRExC_state,
startbranch, first, cur, tail, count,
trietype, depth+1 );
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
- if ( last ) {
+ if ( last && trietype ) {
if ( trietype != NOTHING ) {
/* the last branch of the sequence was part of a trie,
* so we have to construct it here outside of the loop
if (uc >= 0x100 ||
(!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
&& !ANYOF_BITMAP_TEST(data->start_class, uc)
- && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
+ && (!(data->start_class->flags & ANYOF_LOC_FOLD)
|| !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
)
{
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
- else if (has_exactf_sharp_s) {
+ if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
- if (min < 0) {
- min = 0;
- }
+ assert (min >= 0);
delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
data->pos_min += l - min_subtract;
if (compat) {
ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
- data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
if (OP(scan) == EXACTFL) {
/* XXX This set is probably no longer necessary, and
* probably wrong as LOCALE now is on in the initial
* state */
- data->start_class->flags |= ANYOF_LOCALE;
+ data->start_class->flags |= ANYOF_LOCALE|ANYOF_LOC_FOLD;
}
else {
}
}
else if (flags & SCF_DO_STCLASS_OR) {
- if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
+ if (data->start_class->flags & ANYOF_LOC_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
if (uc < 0x100) {
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext /* atom is fixed width */
&& minnext != 0 /* CURLYM can't handle zero width */
+ && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
cl_and(data->start_class, and_withp);
flags &= ~SCF_DO_STCLASS;
}
- min += 1;
- delta += 1;
+ min++;
+ delta++; /* Because of the 2 char string cr-lf */
if (flags & SCF_DO_SUBSTR) {
SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
data->pos_min += 1;
case ALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NWORDCHAR);
if (OP(scan) == ALNUMU) {
for (value = 0; value < 256; value++) {
if (!isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_WORDCHAR);
/* Even if under locale, set the bits for non-locale
* in case it isn't a true locale-node. This will
case NALNUM:
if (flags & SCF_DO_STCLASS_AND) {
if (!(data->start_class->flags & ANYOF_LOCALE)) {
- ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
+ ANYOF_CLASS_CLEAR(data->start_class,ANYOF_WORDCHAR);
if (OP(scan) == NALNUMU) {
for (value = 0; value < 256; value++) {
if (isWORDCHAR_L1(value)) {
}
else {
if (data->start_class->flags & ANYOF_LOCALE)
- ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
+ ANYOF_CLASS_SET(data->start_class,ANYOF_NWORDCHAR);
/* Even if under locale, set the bits for non-locale in
* case it isn't a true locale-node. This will create
}
#endif
-/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
- * pattern rather than a list of OPs */
+/* public(ish) entry point for the perl core's own regex compiling code.
+ * It's actually a wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs, and uses the internal engine rather
+ * than the current one */
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
SV *pat = pattern; /* defeat constness! */
PERL_ARGS_ASSERT_RE_COMPILE;
- return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
- NULL, NULL, rx_flags, 0);
+ return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
+#ifdef PERL_IN_XSUB_RE
+ &my_reg_engine,
+#else
+ &PL_core_reg_engine,
+#endif
+ NULL, NULL, rx_flags, 0);
}
/* see if there are any run-time code blocks in the pattern.
/* merge the main (r1) and run-time (r2) code blocks into one */
{
- RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
+ RXi_GET_DECL(ReANY((REGEXP *)qr), r2);
struct reg_code_block *new_block, *dst;
RExC_state_t * const r1 = pRExC_state; /* convenient alias */
int i1 = 0, i2 = 0;
if (!r2->num_code_blocks) /* we guessed wrong */
+ {
+ SvREFCNT_dec(qr);
return 1;
+ }
Newx(new_block,
r1->num_code_blocks + r2->num_code_blocks,
}
+STATIC bool
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+{
+ /* This is the common code for setting up the floating and fixed length
+ * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * as to whether succeeded or not */
+
+ I32 t,ml;
+
+ if (! (longest_length
+ || (eol /* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
+ )
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+ || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
+ {
+ return FALSE;
+ }
+
+ /* copy the information about the longest from the reg_scan_data
+ over to the program. */
+ if (SvUTF8(sv_longest)) {
+ *rx_utf8 = sv_longest;
+ *rx_substr = NULL;
+ } else {
+ *rx_substr = sv_longest;
+ *rx_utf8 = NULL;
+ }
+ /* end_shift is how many chars that must be matched that
+ follow this item. We calculate it ahead of time as once the
+ lookbehind offset is added in we lose the ability to correctly
+ calculate it.*/
+ ml = minlen ? *(minlen) : (I32)longest_length;
+ *rx_end_shift = ml - offset
+ - longest_length + (SvTAIL(sv_longest) != 0)
+ + lookbehind;
+
+ t = (eol/* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
+ fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+
+ return TRUE;
+}
+
/*
* Perl_re_op_compile - the perl internal RE engine's function to compile a
* regular expression into internal code.
dVAR;
REGEXP *rx;
struct regexp *r;
- register regexp_internal *ri;
+ regexp_internal *ri;
STRLEN plen;
char * VOL exp;
char* xend;
I32 minlen = 0;
U32 rx_flags;
SV * VOL pat;
+ SV * VOL code_blocksv = NULL;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+
+ PL_HasMultiCharFold = _new_invlist_C_array(_Perl_Multi_Char_Folds_invlist);
}
#endif
else {
while (SvAMAGIC(msv)
&& (sv = AMG_CALLunary(msv, string_amg))
- && sv != msv)
- {
+ && sv != msv
+ && !( SvROK(msv)
+ && SvROK(sv)
+ && SvRV(msv) == SvRV(sv))
+ ) {
msv = sv;
SvGETMAGIC(msv);
}
&& RX_ENGINE((REGEXP*)rx)->op_comp)
{
- RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
+ RXi_GET_DECL(ReANY((REGEXP *)rx), ri);
if (ri->num_code_blocks) {
int i;
/* the presence of an embedded qr// with code means
for (i=0; i < ri->num_code_blocks; i++) {
struct reg_code_block *src, *dst;
STRLEN offset = orig_patlen
- + ((struct regexp *)SvANY(rx))->pre_prefix;
+ + ReANY((REGEXP *)rx)->pre_prefix;
assert(n < pRExC_state->num_code_blocks);
src = &ri->code_blocks[i];
dst = &pRExC_state->code_blocks[n];
runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
exp, plen);
if (!runtime_code) {
- ReREFCNT_inc(old_re);
if (used_setjump) {
JMPENV_POP;
}
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
RExC_extralen = 0;
RExC_override_recoding = 0;
+ RExC_in_multi_char_class = 0;
/* First pass: determine size, legality. */
RExC_parse = exp;
RExC_lastnum=0;
RExC_lastparse=NULL;
);
+ /* reg may croak on us, not giving us a chance to free
+ pRExC_state->code_blocks. We cannot SAVEFREEPV it now, as we may
+ need it to survive as long as the regexp (qr/(?{})/).
+ We must check that code_blocksv is not already set, because we may
+ have longjmped back. */
+ if (pRExC_state->code_blocks && !code_blocksv) {
+ code_blocksv = newSV_type(SVt_PV);
+ SAVEFREESV(code_blocksv);
+ SvPV_set(code_blocksv, (char *)pRExC_state->code_blocks);
+ SvLEN_set(code_blocksv, 1); /*sufficient to make sv_clear free it*/
+ }
if (reg(pRExC_state, 0, &flags,1) == NULL) {
RExC_precomp = NULL;
- Safefree(pRExC_state->code_blocks);
return(NULL);
}
+ if (code_blocksv)
+ SvLEN_set(code_blocksv,0); /* no you can't have it, sv_clear */
/* Here, finished first pass. Get rid of any added setjmp */
if (used_setjump) {
of zeroing when in debug mode, thus anything assigned has to
happen after that */
rx = (REGEXP*) newSV_type(SVt_REGEXP);
- r = (struct regexp*)SvANY(rx);
+ r = ReANY(rx);
Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
char, regexp_internal);
if ( r == NULL || ri == NULL )
ri->num_code_blocks = pRExC_state->num_code_blocks;
}
else
+ {
+ int n;
+ for (n = 0; n < pRExC_state->num_code_blocks; n++)
+ if (pRExC_state->code_blocks[n].src_regex)
+ SAVEFREESV(pRExC_state->code_blocks[n].src_regex);
SAVEFREEPV(pRExC_state->code_blocks);
+ }
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
+ (sizeof(STD_PAT_MODS) - 1)
+ (sizeof("(?:)") - 1);
- p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
- SvPOK_on(rx);
+ Newx(p, wraplen + 1, char); /* +1 for the ending NUL */
+ r->xpv_len_u.xpvlenu_pv = p;
if (RExC_utf8)
SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
*p++ = '\n';
*p++ = ')';
*p = 0;
- SvCUR_set(rx, p - SvPVX_const(rx));
+ SvCUR_set(rx, p - RX_WRAPPED(rx));
}
r->intflags = 0;
scan_commit(pRExC_state, &data,&minlen,0);
SvREFCNT_dec(data.last_found);
- /* Note that code very similar to this but for anchored string
- follows immediately below, changes may need to be made to both.
- Be careful.
- */
longest_float_length = CHR_SVLEN(data.longest_float);
- if (longest_float_length
- || (data.flags & SF_FL_BEFORE_EOL
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
- {
- I32 t,ml;
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- goto remove_float; /* As in (a)+. */
-
- /* copy the information about the longest float from the reg_scan_data
- over to the program. */
- if (SvUTF8(data.longest_float)) {
- r->float_utf8 = data.longest_float;
- r->float_substr = NULL;
- } else {
- r->float_substr = data.longest_float;
- r->float_utf8 = NULL;
- }
- /* float_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_float ? *(data.minlen_float)
- : (I32)longest_float_length;
- r->float_end_shift = ml - data.offset_float_min
- - longest_float_length + (SvTAIL(data.longest_float) != 0)
- + data.lookbehind_float;
+ if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
+ && S_setup_longest (aTHX_ pRExC_state,
+ data.longest_float,
+ &(r->float_utf8),
+ &(r->float_substr),
+ &(r->float_end_shift),
+ data.lookbehind_float,
+ data.offset_float_min,
+ data.minlen_float,
+ longest_float_length,
+ data.flags & SF_FL_BEFORE_EOL,
+ data.flags & SF_FL_BEFORE_MEOL))
+ {
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
-
- t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
- remove_float:
r->float_substr = r->float_utf8 = NULL;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
- /* Note that code very similar to this but for floating string
- is immediately above, changes may need to be made to both.
- Be careful.
- */
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- && (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)))) )
+ if (S_setup_longest (aTHX_ pRExC_state,
+ data.longest_fixed,
+ &(r->anchored_utf8),
+ &(r->anchored_substr),
+ &(r->anchored_end_shift),
+ data.lookbehind_fixed,
+ data.offset_fixed,
+ data.minlen_fixed,
+ longest_fixed_length,
+ data.flags & SF_FIX_BEFORE_EOL,
+ data.flags & SF_FIX_BEFORE_MEOL))
{
- I32 t,ml;
-
- /* copy the information about the longest fixed
- from the reg_scan_data over to the program. */
- if (SvUTF8(data.longest_fixed)) {
- r->anchored_utf8 = data.longest_fixed;
- r->anchored_substr = NULL;
- } else {
- r->anchored_substr = data.longest_fixed;
- r->anchored_utf8 = NULL;
- }
- /* fixed_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_fixed ? *(data.minlen_fixed)
- : (I32)longest_fixed_length;
- r->anchored_end_shift = ml - data.offset_fixed
- - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
- + data.lookbehind_fixed;
r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
-
- t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
r->anchored_substr = r->anchored_utf8 = NULL;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
+
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
if (RExC_seen & REG_SEEN_CANY)
r->extflags |= RXf_CANY_SEEN;
if (RExC_seen & REG_SEEN_VERBARG)
+ {
r->intflags |= PREGf_VERBARG_SEEN;
+ r->extflags |= RXf_MODIFIES_VARS;
+ }
if (RExC_seen & REG_SEEN_CUTGROUP)
r->intflags |= PREGf_CUTGROUP_SEEN;
if (pm_flags & PMf_USE_RE_EVAL)
#ifdef STUPID_PATTERN_CHECKS
if (RX_PRELEN(rx) == 0)
r->extflags |= RXf_NULL;
- if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
- /* XXX: this should happen BEFORE we compile */
- r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
- else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
+ if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
r->extflags |= RXf_WHITE;
else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
r->extflags |= RXf_START_ONLY;
#else
- if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
- /* XXX: this should happen BEFORE we compile */
- r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
- else {
+ {
regnode *first = ri->program + 1;
U8 fop = OP(first);
{
AV *retarray = NULL;
SV *ret;
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
SV*
Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
SV*
Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
SV *ret;
AV *av;
I32 length;
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
SV*
Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
AV *av = newAV();
PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
SV * const sv)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
char *s = NULL;
I32 i = 0;
I32 s1, t1;
+ I32 n = paren;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
- if (!rx->subbeg) {
- sv_setsv(sv,&PL_sv_undef);
- return;
- }
- else
- if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
- /* $` */
+ if ( ( n == RX_BUFF_IDX_CARET_PREMATCH
+ || n == RX_BUFF_IDX_CARET_FULLMATCH
+ || n == RX_BUFF_IDX_CARET_POSTMATCH
+ )
+ && !(rx->extflags & RXf_PMf_KEEPCOPY)
+ )
+ goto ret_undef;
+
+ if (!rx->subbeg)
+ goto ret_undef;
+
+ if (n == RX_BUFF_IDX_CARET_FULLMATCH)
+ /* no need to distinguish between them any more */
+ n = RX_BUFF_IDX_FULLMATCH;
+
+ if ((n == RX_BUFF_IDX_PREMATCH || n == RX_BUFF_IDX_CARET_PREMATCH)
+ && rx->offs[0].start != -1)
+ {
+ /* $`, ${^PREMATCH} */
i = rx->offs[0].start;
s = rx->subbeg;
}
else
- if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
- /* $' */
- s = rx->subbeg + rx->offs[0].end;
- i = rx->sublen - rx->offs[0].end;
+ if ((n == RX_BUFF_IDX_POSTMATCH || n == RX_BUFF_IDX_CARET_POSTMATCH)
+ && rx->offs[0].end != -1)
+ {
+ /* $', ${^POSTMATCH} */
+ s = rx->subbeg - rx->suboffset + rx->offs[0].end;
+ i = rx->sublen + rx->suboffset - rx->offs[0].end;
}
else
- if ( 0 <= paren && paren <= (I32)rx->nparens &&
- (s1 = rx->offs[paren].start) != -1 &&
- (t1 = rx->offs[paren].end) != -1)
+ if ( 0 <= n && n <= (I32)rx->nparens &&
+ (s1 = rx->offs[n].start) != -1 &&
+ (t1 = rx->offs[n].end) != -1)
{
- /* $& $1 ... */
+ /* $&, ${^MATCH}, $1 ... */
i = t1 - s1;
- s = rx->subbeg + s1;
+ s = rx->subbeg + s1 - rx->suboffset;
} else {
- sv_setsv(sv,&PL_sv_undef);
- return;
+ goto ret_undef;
}
+
+ assert(s >= rx->subbeg);
assert(rx->sublen >= (s - rx->subbeg) + i );
if (i >= 0) {
const int oldtainted = PL_tainted;
SvTAINTED_off(sv);
}
} else {
+ ret_undef:
sv_setsv(sv,&PL_sv_undef);
return;
}
Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
const I32 paren)
{
- struct regexp *const rx = (struct regexp *)SvANY(r);
+ struct regexp *const rx = ReANY(r);
I32 i;
I32 s1, t1;
PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
/* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
- switch (paren) {
- /* $` / ${^PREMATCH} */
- case RX_BUFF_IDX_PREMATCH:
+ switch (paren) {
+ case RX_BUFF_IDX_CARET_PREMATCH: /* ${^PREMATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ /*FALLTHROUGH*/
+
+ case RX_BUFF_IDX_PREMATCH: /* $` */
if (rx->offs[0].start != -1) {
i = rx->offs[0].start;
if (i > 0) {
}
}
return 0;
- /* $' / ${^POSTMATCH} */
- case RX_BUFF_IDX_POSTMATCH:
+
+ case RX_BUFF_IDX_CARET_POSTMATCH: /* ${^POSTMATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ case RX_BUFF_IDX_POSTMATCH: /* $' */
if (rx->offs[0].end != -1) {
i = rx->sublen - rx->offs[0].end;
if (i > 0) {
}
}
return 0;
+
+ case RX_BUFF_IDX_CARET_FULLMATCH: /* ${^MATCH} */
+ if (!(rx->extflags & RXf_PMf_KEEPCOPY))
+ goto warn_undef;
+ /*FALLTHROUGH*/
+
/* $& / ${^MATCH}, $1, $2, ... */
default:
if (paren <= (I32)rx->nparens &&
i = t1 - s1;
goto getlen;
} else {
+ warn_undef:
if (ckWARN(WARN_UNINITIALIZED))
report_uninit((const SV *)sv);
return 0;
}
getlen:
if (i > 0 && RXp_MATCH_UTF8(rx)) {
- const char * const s = rx->subbeg + s1;
+ const char * const s = rx->subbeg - rx->suboffset + s1;
const U8 *ep;
STRLEN el;
* list.)
* Taking the complement (inverting) an inversion list is quite simple, if the
* first element is 0, remove it; otherwise add a 0 element at the beginning.
- * This implementation reserves an element at the beginning of each inversion list
- * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
- * beginning of the list is either that element if 0, or the next one if 1.
+ * This implementation reserves an element at the beginning of each inversion
+ * list to contain 0 when the list contains 0, and contains 1 otherwise. The
+ * actual beginning of the list is either that element if 0, or the next one if
+ * 1.
*
* More about inversion lists can be found in "Unicode Demystified"
* Chapter 13 by Richard Gillam, published by Addison-Wesley.
* Some of the methods should always be private to the implementation, and some
* should eventually be made public */
-#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
-#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-
-/* This is a combination of a version and data structure type, so that one
- * being passed in can be validated to be an inversion list of the correct
- * vintage. When the structure of the header is changed, a new random number
- * in the range 2**31-1 should be generated and the new() method changed to
- * insert that at this location. Then, if an auxiliary program doesn't change
- * correspondingly, it will be discovered immediately */
-#define INVLIST_VERSION_ID_OFFSET 2
-#define INVLIST_VERSION_ID 1064334010
-
-/* For safety, when adding new elements, remember to #undef them at the end of
- * the inversion list code section */
+/* The header definitions are in F<inline_invlist.c> */
-#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
-/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
- * contains the code point U+00000, and begins here. If 1, the inversion list
- * doesn't contain U+0000, and it begins at the next UV in the array.
- * Inverting an inversion list consists of adding or removing the 0 at the
- * beginning of it. By reserving a space for that 0, inversion can be made
- * very fast */
-
-#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
-
-/* Internally things are UVs */
#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *get_invlist_len_addr(invlist));
+ assert(! *_get_invlist_len_addr(invlist));
/* 1^1 = 0; 1^0 = 1 */
*zero = 1 ^ will_have_0;
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*get_invlist_len_addr(invlist));
+ assert(*_get_invlist_len_addr(invlist));
assert(*get_invlist_zero_addr(invlist) == 0
|| *get_invlist_zero_addr(invlist) == 1);
+ *get_invlist_zero_addr(invlist));
}
-PERL_STATIC_INLINE UV*
-S_get_invlist_len_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the current number
- * of used elements in the inversion list */
-
- PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
-}
-
-PERL_STATIC_INLINE UV
-S_invlist_len(pTHX_ SV* const invlist)
-{
- /* Returns the current number of elements stored in the inversion list's
- * array */
-
- PERL_ARGS_ASSERT_INVLIST_LEN;
-
- return *get_invlist_len_addr(invlist);
-}
-
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
{
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *get_invlist_len_addr(invlist) = len;
+ *_get_invlist_len_addr(invlist) = len;
assert(len <= SvLEN(invlist));
* Note that when inverting, SvCUR shouldn't change */
}
+PERL_STATIC_INLINE IV*
+S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that is reserved to hold the cached index
+ * */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
+
+ return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+}
+
+PERL_STATIC_INLINE IV
+S_invlist_previous_index(pTHX_ SV* const invlist)
+{
+ /* Returns cached index of previous search */
+
+ PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
+
+ return *get_invlist_previous_index_addr(invlist);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+{
+ /* Caches <index> for later retrieval */
+
+ PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
+
+ assert(index == 0 || index < (int) _invlist_len(invlist));
+
+ *get_invlist_previous_index_addr(invlist) = index;
+}
+
PERL_STATIC_INLINE UV
S_invlist_max(pTHX_ SV* const invlist)
{
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_previous_index_addr(new_list) = 0;
*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 4
+#if HEADER_LENGTH != 5
# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
#endif
SvPV_set(invlist, (char *) list);
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
SvPV_shrink_to_cur((SV *) invlist);
}
-/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
- * etc */
-#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
-#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
-
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
STATIC void
UV* array;
UV max = invlist_max(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
}
}
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+#ifndef PERL_IN_XSUB_RE
+
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
* contains <cp> */
IV low = 0;
- IV high = invlist_len(invlist);
- const UV * const array = invlist_array(invlist);
+ IV mid;
+ IV high = _invlist_len(invlist);
+ const IV highest_element = high - 1;
+ const UV* array;
- PERL_ARGS_ASSERT_INVLIST_SEARCH;
+ PERL_ARGS_ASSERT__INVLIST_SEARCH;
- /* If list is empty or the code point is before the first element, return
- * failure. */
- if (high == 0 || cp < array[0]) {
+ /* If list is empty, return failure. */
+ if (high == 0) {
return -1;
}
+ /* If the code point is before the first element, return failure. (We
+ * can't combine this with the test above, because we can't get the array
+ * unless we know the list is non-empty) */
+ array = invlist_array(invlist);
+
+ mid = invlist_previous_index(invlist);
+ assert(mid >=0 && mid <= highest_element);
+
+ /* <mid> contains the cache of the result of the previous call to this
+ * function (0 the first time). See if this call is for the same result,
+ * or if it is for mid-1. This is under the theory that calls to this
+ * function will often be for related code points that are near each other.
+ * And benchmarks show that caching gives better results. We also test
+ * here if the code point is within the bounds of the list. These tests
+ * replace others that would have had to be made anyway to make sure that
+ * the array bounds were not exceeded, and give us extra information at the
+ * same time */
+ if (cp >= array[mid]) {
+ if (cp >= array[highest_element]) {
+ return highest_element;
+ }
+
+ /* Here, array[mid] <= cp < array[highest_element]. This means that
+ * the final element is not the answer, so can exclude it; it also
+ * means that <mid> is not the final element, so can refer to 'mid + 1'
+ * safely */
+ if (cp < array[mid + 1]) {
+ return mid;
+ }
+ high--;
+ low = mid + 1;
+ }
+ else { /* cp < aray[mid] */
+ if (cp < array[0]) { /* Fail if outside the array */
+ return -1;
+ }
+ high = mid;
+ if (cp >= array[mid - 1]) {
+ goto found_entry;
+ }
+ }
+
/* Binary search. What we are looking for is <i> such that
* array[i] <= cp < array[i+1]
- * The loop below converges on the i+1. */
+ * The loop below converges on the i+1. Note that there may not be an
+ * (i+1)th element in the array, and things work nonetheless */
while (low < high) {
- IV mid = (low + high) / 2;
- if (array[mid] <= cp) {
+ mid = (low + high) / 2;
+ assert(mid <= highest_element);
+ if (array[mid] <= cp) { /* cp >= array[mid] */
low = mid + 1;
/* We could do this extra test to exit the loop early.
}
}
- return high - 1;
+ found_entry:
+ high--;
+ invlist_set_previous_index(invlist, high);
+ return high;
}
-#ifndef PERL_IN_XSUB_RE
-
void
Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
{
* that <swatch> is all 0's on input */
UV current = start;
- const IV len = invlist_len(invlist);
+ const IV len = _invlist_len(invlist);
IV i;
const UV * array;
array = invlist_array(invlist);
/* Find which element it is */
- i = invlist_search(invlist, start);
+ i = _invlist_search(invlist, start);
/* We populate from <start> to <end> */
while (current < end) {
current = array[i];
if (current >= end) { /* Finished if beyond the end of what we
are populating */
- return;
+ if (LIKELY(end < UV_MAX)) {
+ return;
+ }
+
+ /* We get here when the upper bound is the maximum
+ * representable on the machine, and we are looking for just
+ * that code point. Have to special case it */
+ i = len;
+ goto join_end_of_list;
}
}
assert(current >= start);
swatch[offset >> 3] |= 1 << (offset & 7);
}
+ join_end_of_list:
+
/* Quit if at the end of the list */
if (i >= len) {
assert(a != b);
/* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
if (*output == a) {
if (a != NULL) {
SvREFCNT_dec(a);
} /* else *output already = b; */
return;
}
- else if ((len_b = invlist_len(b)) == 0) {
+ else if ((len_b = _invlist_len(b)) == 0) {
if (*output == b) {
SvREFCNT_dec(b);
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
- if (len_u != invlist_len(u)) {
+ if (len_u != _invlist_len(u)) {
invlist_set_len(u, len_u);
invlist_trim(u);
array_u = invlist_array(u);
assert(a != b);
/* Special case if either one is empty */
- len_a = invlist_len(a);
- if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
+ len_a = _invlist_len(a);
+ if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
if (len_a != 0 && complement_b) {
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
- if (len_r != invlist_len(r)) {
+ if (len_r != _invlist_len(r)) {
invlist_set_len(r, len_r);
invlist_trim(r);
array_r = invlist_array(r);
len = 0;
}
else {
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
}
/* If comes after the final entry, can just append it to the end */
if (len == 0
|| start >= invlist_array(invlist)
- [invlist_len(invlist) - 1])
+ [_invlist_len(invlist) - 1])
{
_append_range_to_invlist(invlist, start, end);
return invlist;
#endif
-STATIC bool
-S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
-{
- /* Does <invlist> contain code point <cp> as part of the set? */
-
- IV index = invlist_search(invlist, cp);
-
- PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
-
- return index >= 0 && ELEMENT_RANGE_MATCHES_INVLIST(index);
-}
-
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
return _add_range_to_invlist(invlist, cp, cp);
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = get_invlist_len_addr(invlist);
+ UV* len_pos = _get_invlist_len_addr(invlist);
PERL_ARGS_ASSERT__INVLIST_INVERT;
_invlist_invert(invlist);
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
if (len != 0) { /* If empty do nothing */
array = invlist_array(invlist);
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
- SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
+ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
STRLEN length = SvCUR(invlist);
PERL_ARGS_ASSERT_INVLIST_CLONE;
* will start over at the beginning of the list */
UV* pos = get_invlist_iter_addr(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
* 0, or if the list is empty. If this distinction matters to you, check
* for emptiness before calling this function */
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_HIGHEST;
UV* array_a = invlist_array(a);
UV* array_b = invlist_array(b);
- UV len_a = invlist_len(a);
- UV len_b = invlist_len(b);
+ UV len_a = _invlist_len(a);
+ UV len_b = _invlist_len(b);
UV i = 0; /* current index into the arrays */
bool retval = TRUE; /* Assume are identical until proven otherwise */
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
dVAR;
- register regnode *ret; /* Will be the head of the group. */
- register regnode *br;
- register regnode *lastbr;
- register regnode *ender = NULL;
- register I32 parno = 0;
+ regnode *ret; /* Will be the head of the group. */
+ regnode *br;
+ regnode *lastbr;
+ regnode *ender = NULL;
+ I32 parno = 0;
I32 flags;
U32 oregflags = RExC_flags;
bool have_branch = 0;
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
dVAR;
- register regnode *ret;
- register regnode *chain = NULL;
- register regnode *latest;
+ regnode *ret;
+ regnode *chain = NULL;
+ regnode *latest;
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register regnode *ret;
- register char op;
- register char *next;
+ regnode *ret;
+ char op;
+ char *next;
I32 flags;
const char * const origparse = RExC_parse;
I32 min;
char *parse_start;
#endif
const char *maxpos = NULL;
+
+ /* Save the original in case we change the emitted regop to a FAIL. */
+ regnode * const orig_emit = RExC_emit;
+
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGPIECE;
vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
RExC_parse = next;
nextchar(pRExC_state);
+ if (max < min) { /* If can't match, warn and optimize to fail
+ unconditionally */
+ if (SIZE_ONLY) {
+ ckWARNreg(RExC_parse, "Quantifier {n,m} with n > m can't match");
+
+ /* We can't back off the size because we have to reserve
+ * enough space for all the things we are about to throw
+ * away, but we can shrink it by the ammount we are about
+ * to re-use here */
+ RExC_size = PREVOPER(RExC_size) - regarglen[(U8)OPFAIL];
+ }
+ else {
+ RExC_emit = orig_emit;
+ }
+ ret = reg_node(pRExC_state, OPFAIL);
+ return ret;
+ }
do_curly:
if ((flags&SIMPLE)) {
*flagp = WORST;
if (max > 0)
*flagp |= HASWIDTH;
- if (max < min)
- vFAIL("Can't do {n,m} with n > m");
if (!SIZE_ONLY) {
ARG1_SET(ret, (U16)min);
ARG2_SET(ret, (U16)max);
return(ret);
}
-
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+{
- This is expected to be called by a parser routine that has
- recognized '\N' and needs to handle the rest. RExC_parse is
- expected to point at the first char following the N at the time
- of the call.
+ /* This is expected to be called by a parser routine that has recognized '\N'
+ and needs to handle the rest. RExC_parse is expected to point at the first
+ char following the N at the time of the call. On successful return,
+ RExC_parse has been updated to point to just after the sequence identified
+ by this routine, and <*flagp> has been updated.
- The \N may be inside (indicated by valuep not being NULL) or outside a
+ The \N may be inside (indicated by the boolean <in_char_class>) or outside a
character class.
\N may begin either a named sequence, or if outside a character class, mean
to match a non-newline. For non single-quoted regexes, the tokenizer has
- attempted to decide which, and in the case of a named sequence converted it
+ attempted to decide which, and in the case of a named sequence, converted it
into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
where c1... are the characters in the sequence. For single-quoted regexes,
the tokenizer passes the \N sequence through unchanged; this code will not
- attempt to determine this nor expand those. The net effect is that if the
- beginning of the passed-in pattern isn't '{U+' or there is no '}', it
- signals that this \N occurrence means to match a non-newline.
-
+ attempt to determine this nor expand those, instead raising a syntax error.
+ The net effect is that if the beginning of the passed-in pattern isn't '{U+'
+ or there is no '}', it signals that this \N occurrence means to match a
+ non-newline.
+
Only the \N{U+...} form should occur in a character class, for the same
reason that '.' inside a character class means to just match a period: it
just doesn't make sense.
-
- If valuep is non-null then it is assumed that we are parsing inside
- of a charclass definition and the first codepoint in the resolved
- string is returned via *valuep and the routine will return NULL.
- In this mode if a multichar string is returned from the charnames
- handler, a warning will be issued, and only the first char in the
- sequence will be examined. If the string returned is zero length
- then the value of *valuep is undefined and NON-NULL will
- be returned to indicate failure. (This will NOT be a valid pointer
- to a regnode.)
-
- If valuep is null then it is assumed that we are parsing normal text and a
- new EXACT node is inserted into the program containing the resolved string,
- and a pointer to the new node is returned. But if the string is zero length
- a NOTHING node is emitted instead.
- On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal error via vFAIL(...)
+ The function raises an error (via vFAIL), and doesn't return for various
+ syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
+ success; it returns FALSE otherwise.
+
+ If <valuep> is non-null, it means the caller can accept an input sequence
+ consisting of a just a single code point; <*valuep> is set to that value
+ if the input is such.
+
+ If <node_p> is non-null it signifies that the caller can accept any other
+ legal sequence (i.e., one that isn't just a single code point). <*node_p>
+ is set as follows:
+ 1) \N means not-a-NL: points to a newly created REG_ANY node;
+ 2) \N{}: points to a new NOTHING node;
+ 3) otherwise: points to a new EXACT node containing the resolved
+ string.
+ Note that FALSE is returned for single code point sequences if <valuep> is
+ null.
*/
-STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
-{
+
char * endbrace; /* '}' following the name */
- regnode *ret = NULL;
char* p;
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+ bool has_multiple_chars; /* true if the input stream contains a sequence of
+ more than one character */
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+ PERL_ARGS_ASSERT_GROK_BSLASH_N;
GET_RE_DEBUG_FLAGS;
+ assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
+
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meaning does not */
p = (RExC_flags & RXf_PMf_EXTENDED)
? regwhite( pRExC_state, RExC_parse )
: RExC_parse;
-
+
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
- if (valuep) {
+ if (! node_p) {
/* no bare \N in a charclass */
- vFAIL("\\N in a character class must be a named character: \\N{...}");
- }
+ if (in_char_class) {
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ return FALSE;
+ }
nextchar(pRExC_state);
- ret = reg_node(pRExC_state, REG_ANY);
+ *node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
RExC_parse--;
- Set_Node_Length(ret, 1); /* MJD */
- return ret;
+ Set_Node_Length(*node_p, 1); /* MJD */
+ return TRUE;
}
- /* Here, we have decided it should be a named sequence */
+ /* Here, we have decided it should be a named character or sequence */
/* The test above made sure that the next real character is a '{', but
* under the /x modifier, it could be separated by space (or a comment and
}
if (endbrace == RExC_parse) { /* empty: \N{} */
- if (! valuep) {
- RExC_parse = endbrace + 1;
- return reg_node(pRExC_state,NOTHING);
- }
-
- if (SIZE_ONLY) {
- ckWARNreg(RExC_parse,
- "Ignoring zero length \\N{} in character class"
- );
- RExC_parse = endbrace + 1;
+ bool ret = TRUE;
+ if (node_p) {
+ *node_p = reg_node(pRExC_state,NOTHING);
+ }
+ else if (in_char_class) {
+ if (SIZE_ONLY && in_char_class) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ }
+ ret = FALSE;
}
- *valuep = 0;
- return (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ else {
+ return FALSE;
+ }
+ nextchar(pRExC_state);
+ return ret;
}
- REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
- if (valuep) { /* In a bracketed char class */
- /* We only pay attention to the first char of
- multichar strings being returned. I kinda wonder
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+ /* Code points are separated by dots. If none, there is only one code
+ * point, and is terminated by the brace */
+ has_multiple_chars = (endchar < endbrace);
+
+ if (valuep && (! has_multiple_chars || in_char_class)) {
+ /* We only pay attention to the first char of
+ multichar strings being returned in char classes. I kinda wonder
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. XXX Solution is to recharacterize as
[rest-of-class]|multi1|multi2... */
- STRLEN length_of_hex;
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
+ I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-
- char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
- if (endchar < endbrace) {
- ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
- }
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check */
? UTF8SKIP(RExC_parse)
: 1;
/* Guard against malformed utf8 */
- if (RExC_parse >= endchar) RExC_parse = endchar;
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
+ }
vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
+ }
+
+ if (in_char_class && has_multiple_chars) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
- RExC_parse = endbrace + 1;
- if (endchar == endbrace) return NULL;
+ RExC_parse = endbrace + 1;
+ }
+ else if (! node_p || ! has_multiple_chars) {
- ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ /* Here, the input is legal, but not according to the caller's
+ * options. We fail without advancing the parse, so that the
+ * caller can try again */
+ RExC_parse = p;
+ return FALSE;
}
- else { /* Not a char class */
+ else {
/* What is done here is to convert this to a sub-pattern of the form
* (?:\x{char1}\x{char2}...)
SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
STRLEN len;
- char *endchar; /* Points to '.' or '}' ending cur char in the input
- stream */
char *orig_end = RExC_end;
+ I32 flags;
while (RExC_parse < endbrace) {
- /* Code points are separated by dots. If none, there is only one
- * code point, and is terminated by the brace */
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
/* Convert to notation the rest of the code understands */
sv_catpv(substitute_parse, "\\x{");
sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
}
sv_catpv(substitute_parse, ")");
/* The values are Unicode, and therefore not subject to recoding */
RExC_override_recoding = 1;
- ret = reg(pRExC_state, 1, flagp, depth+1);
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
- nextchar(pRExC_state);
+ nextchar(pRExC_state);
}
- return ret;
+ return TRUE;
}
}
PERL_STATIC_INLINE void
-S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, STRLEN len, UV code_point)
+S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
{
- /* This knows the details about sizing an EXACTish node, and potentially
- * populating it with a single character. If <len> is non-zero, it assumes
- * that the node has already been populated, and just does the sizing,
- * ignoring <code_point>. Otherwise it looks at <code_point> and
- * calculates what <len> should be. In pass 1, it sizes the node
- * appropriately. In pass 2, it additionally will populate the node's
- * STRING with <code_point>, if <len> is 0.
+ /* This knows the details about sizing an EXACTish node, setting flags for
+ * it (by setting <*flagp>, and potentially populating it with a single
+ * character.
+ *
+ * If <len> (the length in bytes) is non-zero, this function assumes that
+ * the node has already been populated, and just does the sizing. In this
+ * case <code_point> should be the final code point that has already been
+ * placed into the node. This value will be ignored except that under some
+ * circumstances <*flagp> is set based on it.
+ *
+ * If <len> is zero, the function assumes that the node is to contain only
+ * the single character given by <code_point> and calculates what <len>
+ * should be. In pass 1, it sizes the node appropriately. In pass 2, it
+ * additionally will populate the node's STRING with <code_point>, if <len>
+ * is 0. In both cases <*flagp> is appropriately set
*
* It knows that under FOLD, UTF characters and the Latin Sharp S must be
* folded (the latter only when the rules indicate it can match 'ss') */
Copy((char *) character, STRING(node), len, char);
}
}
+
+ *flagp |= HASWIDTH;
+
+ /* A single character node is SIMPLE, except for the special-cased SHARP S
+ * under /di. */
+ if ((len == 1 || (UTF && len == UNISKIP(code_point)))
+ && (code_point != LATIN_SMALL_LETTER_SHARP_S
+ || ! FOLD || ! DEPENDS_SEMANTICS))
+ {
+ *flagp |= SIMPLE;
+ }
}
/*
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register regnode *ret = NULL;
+ regnode *ret = NULL;
I32 flags;
char *parse_start = RExC_parse;
U8 op;
case '[':
{
char * const oregcomp_parse = ++RExC_parse;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
}
RExC_parse--;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
RExC_end = oldregxend;
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
Set_Node_Cur_Length(ret);
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
}
break;
case 'N':
- /* Handle \N and \N{NAME} here and not below because it can be
- multicharacter. join_exact() will join them up later on.
- Also this makes sure that things like /\N{BLAH}+/ and
- \N{BLAH} being multi char Just Happen. dmq*/
+ /* Handle \N and \N{NAME} with multiple code points here and not
+ * below because it can be multicharacter. join_exact() will join
+ * them up later on. Also this makes sure that things like
+ * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
+ * The options to the grok function call causes it to fail if the
+ * sequence is just a single code point. We then go treat it as
+ * just another character in the current EXACT node, and hence it
+ * gets uniform treatment with all the other characters. The
+ * special treatment for quantifiers is not needed for such single
+ * character sequences */
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ RExC_parse--;
+ goto defchar;
+ }
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
RExC_parse++;
defchar: {
- register STRLEN len;
- register UV ender;
- register char *p;
+ STRLEN len = 0;
+ UV ender;
+ char *p;
char *s;
+#define MAX_NODE_STRING_SIZE 127
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+ char *s0;
+ U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
U8 node_type;
+ bool next_is_quantifier;
+ char * oldp = NULL;
- /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
- * it is folded to 'ss' even if not utf8 */
- bool is_exactfu_sharp_s;
+ /* If a folding node contains only code points that don't
+ * participate in folds, it can be changed into an EXACT node,
+ * which allows the optimizer more things to look for */
+ bool maybe_exact;
ender = 0;
node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
- s = STRING(ret);
+
+ /* In pass1, folded, we use a temporary buffer instead of the
+ * actual node, as the node doesn't exist yet */
+ s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
+
+ s0 = s;
+
+ reparse:
+
+ /* We do the EXACTFish to EXACT node only if folding, and not if in
+ * locale, as whether a character folds or not isn't known until
+ * runtime */
+ maybe_exact = FOLD && ! LOC;
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (len = 0, p = RExC_parse - 1;
- len < 127 && p < RExC_end;
+ for (p = RExC_parse - 1;
+ len < upper_parse && p < RExC_end;
len++)
{
- char * const oldp = p;
+ oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
case 'g': case 'G': /* generic-backref, pos assertion */
case 'h': case 'H': /* HORIZWS */
case 'k': case 'K': /* named backref, keep marker */
- case 'N': /* named char sequence */
case 'p': case 'P': /* Unicode property */
case 'R': /* LNBREAK */
case 's': case 'S': /* space class */
ender = '\n';
p++;
break;
+ case 'N': /* Handle a single-code point named character. */
+ /* The options cause it to fail if a multiple code
+ * point sequence. Handle those in the switch() above
+ * */
+ RExC_parse = p + 1;
+ if (! grok_bslash_N(pRExC_state, NULL, &ender,
+ flagp, depth, FALSE))
+ {
+ RExC_parse = p = oldp;
+ goto loopdone;
+ }
+ p = RExC_parse;
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
case 'r':
ender = '\r';
p++;
break;
} /* End of switch on the literal */
- is_exactfu_sharp_s = (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S);
+ /* Here, have looked at the literal character and <ender>
+ * contains its ordinal, <p> points to the character after it
+ */
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if ((UTF && FOLD) || is_exactfu_sharp_s) {
- /* Prime the casefolded buffer. Locale rules, which apply
- * only to code points < 256, aren't known until execution,
- * so for them, just output the original character using
- * utf8. If we start to fold non-UTF patterns, be sure to
- * update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *tmpbuf = (U8) ender;
- foldlen = 1;
- } else {
- *tmpbuf = UTF8_TWO_BYTE_HI(ender);
- *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
- }
- else if (isASCII(ender)) { /* Note: Here can't also be LOC
- */
- ender = toLOWER(ender);
- *tmpbuf = (U8) ender;
- foldlen = 1;
- }
- else if (! ASCII_FOLD_RESTRICTED && ! LOC) {
- /* Locale and /aa require more selectivity about the
- * fold, so are handled below. Otherwise, here, just
- * use the fold */
- ender = toFOLD_uni(ender, tmpbuf, &foldlen);
- }
- else {
- /* Under locale rules or /aa we are not to mix,
- * respectively, ords < 256 or ASCII with non-. So
- * reject folds that mix them, using only the
- * non-folded code point. So do the fold to a
- * temporary, and inspect each character in it. */
- U8 trialbuf[UTF8_MAXBYTES_CASE+1];
- U8* s = trialbuf;
- UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
- U8* e = s + foldlen;
- bool fold_ok = TRUE;
-
- while (s < e) {
- if (isASCII(*s)
- || (LOC && (UTF8_IS_INVARIANT(*s)
- || UTF8_IS_DOWNGRADEABLE_START(*s))))
- {
- fold_ok = FALSE;
- break;
- }
- s += UTF8SKIP(s);
- }
- if (fold_ok) {
- Copy(trialbuf, tmpbuf, foldlen, U8);
- ender = tmpender;
- }
- else {
- uvuni_to_utf8(tmpbuf, ender);
- foldlen = UNISKIP(ender);
- }
- }
- }
- if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
- if (len)
- p = oldp;
- else if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
-
- /* tmpbuf has been constructed by us, so we
- * know it is valid utf8 */
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- }
- else {
- len++;
- REGC((char)ender, s++);
- }
- break;
+ /* If the next thing is a quantifier, it applies to this
+ * character only, which means that this character has to be in
+ * its own node and can't just be appended to the string in an
+ * existing node, so if there are already other characters in
+ * the node, close the node with just them, and set up to do
+ * this character again next time through, when it will be the
+ * only thing in its new node */
+ if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+ {
+ p = oldp;
+ goto loopdone;
+ }
+
+ if (FOLD) {
+ if (UTF
+ /* See comments for join_exact() as to why we fold
+ * this non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S))
+ {
+
+
+ /* Prime the casefolded buffer. Locale rules, which
+ * apply only to code points < 256, aren't known until
+ * execution, so for them, just output the original
+ * character using utf8. If we start to fold non-UTF
+ * patterns, be sure to update join_exact() */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ UV folded = _to_uni_fold_flags(
+ ender,
+ (U8 *) s,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* If this node only contains non-folding code
+ * points so far, see if this new one is also
+ * non-folding */
+ if (maybe_exact) {
+ if (folded != ender) {
+ maybe_exact = FALSE;
+ }
+ else {
+ /* Here the fold is the original; we have
+ * to check further to see if anything
+ * folds to it */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8",
+ "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable =
+ _get_swash_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
+ if (_invlist_contains_cp(PL_utf8_foldable,
+ ender))
+ {
+ maybe_exact = FALSE;
+ }
+ }
+ }
+ ender = folded;
+ }
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and the one just below for UTF) through it add
+ * a single byte to the EXACTish node. But this one
+ * has changed len to be the correct final value, so
+ * subtract one to cancel out the increment that
+ * follows */
+ len += foldlen - 1;
+ }
+ else {
+ *(s++) = ender;
+ maybe_exact &= ! IS_IN_SOME_FOLD_L1(ender);
+ }
}
- if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- len += unilen;
- s += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break;
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- len--;
+ else if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+
+ /* See comment just above for - 1 */
+ len--;
}
else {
REGC((char)ender, s++);
+ }
+
+ if (next_is_quantifier) {
+
+ /* Here, the next input is a quantifier, and to get here,
+ * the current character is the only one in the node.
+ * Also, here <len> doesn't include the final byte for this
+ * character */
+ len++;
+ goto loopdone;
}
- }
+
+ } /* End of loop through literal characters */
+
+ /* Here we have either exhausted the input or ran out of room in
+ * the node. (If we encountered a character that can't be in the
+ * node, transfer is made directly to <loopdone>, and so we
+ * wouldn't have fallen off the end of the loop.) In the latter
+ * case, we artificially have to split the node into two, because
+ * we just don't have enough space to hold everything. This
+ * creates a problem if the final character participates in a
+ * multi-character fold in the non-final position, as a match that
+ * should have occurred won't, due to the way nodes are matched,
+ * and our artificial boundary. So back off until we find a non-
+ * problematic character -- one that isn't at the beginning or
+ * middle of such a fold. (Either it doesn't participate in any
+ * folds, or appears only in the final position of all the folds it
+ * does participate in.) A better solution with far fewer false
+ * positives, and that would fill the nodes more completely, would
+ * be to actually have available all the multi-character folds to
+ * test against, and to back-off only far enough to be sure that
+ * this node isn't ending with a partial one. <upper_parse> is set
+ * further below (if we need to reparse the node) to include just
+ * up through that final non-problematic character that this code
+ * identifies, so when it is set to less than the full node, we can
+ * skip the rest of this */
+ if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+ const STRLEN full_len = len;
+
+ assert(len >= MAX_NODE_STRING_SIZE);
+
+ /* Here, <s> points to the final byte of the final character.
+ * Look backwards through the string until find a non-
+ * problematic character */
+
+ if (! UTF) {
+
+ /* These two have no multi-char folds to non-UTF characters
+ */
+ if (ASCII_FOLD_RESTRICTED || LOC) {
+ goto loopdone;
+ }
+
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ len = s - s0 + 1;
+ }
+ else {
+ if (! PL_NonL1NonFinalFold) {
+ PL_NonL1NonFinalFold = _new_invlist_C_array(
+ NonL1_Perl_Non_Final_Folds_invlist);
+ }
+
+ /* Point to the first byte of the final character */
+ s = (char *) utf8_hop((U8 *) s, -1);
+
+ while (s >= s0) { /* Search backwards until find
+ non-problematic char */
+ if (UTF8_IS_INVARIANT(*s)) {
+
+ /* There are no ascii characters that participate
+ * in multi-char folds under /aa. In EBCDIC, the
+ * non-ascii invariants are all control characters,
+ * so don't ever participate in any folds. */
+ if (ASCII_FOLD_RESTRICTED
+ || ! IS_NON_FINAL_FOLD(*s))
+ {
+ break;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* No Latin1 characters participate in multi-char
+ * folds under /l */
+ if (LOC
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+ *s, *(s+1))))
+ {
+ break;
+ }
+ }
+ else if (! _invlist_contains_cp(
+ PL_NonL1NonFinalFold,
+ valid_utf8_to_uvchr((U8 *) s, NULL)))
+ {
+ break;
+ }
+
+ /* Here, the current character is problematic in that
+ * it does occur in the non-final position of some
+ * fold, so try the character before it, but have to
+ * special case the very first byte in the string, so
+ * we don't read outside the string */
+ s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ } /* End of loop backwards through the string */
+
+ /* If there were only problematic characters in the string,
+ * <s> will point to before s0, in which case the length
+ * should be 0, otherwise include the length of the
+ * non-problematic character just found */
+ len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+ }
+
+ /* Here, have found the final character, if any, that is
+ * non-problematic as far as ending the node without splitting
+ * it across a potential multi-char fold. <len> contains the
+ * number of bytes in the node up-to and including that
+ * character, or is 0 if there is no such character, meaning
+ * the whole node contains only problematic characters. In
+ * this case, give up and just take the node as-is. We can't
+ * do any better */
+ if (len == 0) {
+ len = full_len;
+ } else {
+
+ /* Here, the node does contain some characters that aren't
+ * problematic. If one such is the final character in the
+ * node, we are done */
+ if (len == full_len) {
+ goto loopdone;
+ }
+ else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+ /* If the final character is problematic, but the
+ * penultimate is not, back-off that last character to
+ * later start a new node with it */
+ p = oldp;
+ goto loopdone;
+ }
+
+ /* Here, the final non-problematic character is earlier
+ * in the input than the penultimate character. What we do
+ * is reparse from the beginning, going up only as far as
+ * this final ok one, thus guaranteeing that the node ends
+ * in an acceptable character. The reason we reparse is
+ * that we know how far in the character is, but we don't
+ * know how to correlate its position with the input parse.
+ * An alternate implementation would be to build that
+ * correlation as we go along during the original parse,
+ * but that would entail extra work for every node, whereas
+ * this code gets executed only when the string is too
+ * large for the node, and the final two characters are
+ * problematic, an infrequent occurrence. Yet another
+ * possible strategy would be to save the tail of the
+ * string, and the next time regatom is called, initialize
+ * with that. The problem with this is that unless you
+ * back off one more character, you won't be guaranteed
+ * regatom will get called again, unless regbranch,
+ * regpiece ... are also changed. If you do back off that
+ * extra character, so that there is input guaranteed to
+ * force calling regatom, you can't handle the case where
+ * just the first character in the node is acceptable. I
+ * (khw) decided to try this method which doesn't have that
+ * pitfall; if performance issues are found, we can do a
+ * combination of the current approach plus that one */
+ upper_parse = len;
+ len = 0;
+ s = s0;
+ goto reparse;
+ }
+ } /* End of verifying node ends with an appropriate char */
+
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
+
+ /* If 'maybe_exact' is still set here, means there are no
+ * code points in the node that participate in folds */
+ if (FOLD && maybe_exact) {
+ OP(ret) = EXACT;
+ }
+
+ /* I (khw) don't know if you can get here with zero length, but the
+ * old code handled this situation by creating a zero-length EXACT
+ * node. Might as well be NOTHING instead */
+ if (len == 0) {
+ OP(ret) = NOTHING;
+ }
+ else{
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+ }
+
RExC_parse = p - 1;
Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
if (iv < 0)
vFAIL("Internal disaster");
}
- if (len > 0)
- *flagp |= HASWIDTH;
- if (len == 1 && UNI_IS_INVARIANT(ender))
- *flagp |= SIMPLE;
- alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
- }
+ } /* End of label 'defchar:' */
break;
- }
+ } /* End of giant switch on input character */
return(ret);
}
switch (skip) {
case 4:
if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
- namedclass = ANYOF_ALNUM;
+ namedclass = ANYOF_WORDCHAR;
break;
case 5:
/* Names all of length 5. */
} \
}
-STATIC void
-S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
-{
- /* Adds input 'string' with length 'len' to the ANYOF node's unicode
- * alternate list, pointed to by 'alternate_ptr'. This is an array of
- * the multi-character folds of characters in the node */
- SV *sv;
-
- PERL_ARGS_ASSERT_ADD_ALTERNATE;
-
- if (! *alternate_ptr) {
- *alternate_ptr = newAV();
- }
- sv = newSVpvn_utf8((char*)string, len, TRUE);
- av_push(*alternate_ptr, sv);
- return;
-}
-
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
* number defined in handy.h. */
#define namedclass_to_classnum(class) ((class) / 2)
-/*
- parse a class specification and produce either an ANYOF node that
- matches the pattern or perhaps will be optimized into an EXACTish node
- instead. The node contains a bit map for the first 256 characters, with the
- corresponding bit set if that character is in the list. For characters
- above 255, a range list is used */
-
STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
+ /* parse a bracketed class specification. Most of these will produce an ANYOF node;
+ * but something like [a] will produce an EXACT node; [aA], an EXACTFish
+ * node; [[:ascii:]], a POSIXA node; etc. It is more complex under /i with
+ * multi-character folds: it will be rewritten following the paradigm of
+ * this example, where the <multi-fold>s are characters which fold to
+ * multiple character sequences:
+ * /[abc\x{multi-fold1}def\x{multi-fold2}ghi]/i
+ * gets effectively rewritten as:
+ * /(?:\x{multi-fold1}|\x{multi-fold2}|[abcdefghi]/i
+ * reg() gets called (recursively) on the rewritten version, and this
+ * function will return what it constructs. (Actually the <multi-fold>s
+ * aren't physically removed from the [abcdefghi], it's just that they are
+ * ignored in the recursion by means of a a flag:
+ * <RExC_in_multi_char_class>.)
+ *
+ * ANYOF nodes contain a bit map for the first 256 characters, with the
+ * corresponding bit set if that character is in the list. For characters
+ * above 255, a range list or swash is used. There are extra bits for \w,
+ * etc. in locale ANYOFs, as what these match is not determinable at
+ * compile time */
+
dVAR;
- register UV nextvalue;
- register UV prevvalue = OOB_UNICODE;
- register IV range = 0;
- UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
- register regnode *ret;
+ UV nextvalue;
+ UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
+ IV range = 0;
+ UV value = OOB_UNICODE, save_value = OOB_UNICODE;
+ regnode *ret;
STRLEN numlen;
IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
- bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
SV *listsv = NULL;
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
extended beyond the Latin1 range */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
+ AV * multi_char_matches = NULL; /* Code points that fold to more than one
+ character; used under /i */
UV n;
/* Unicode properties are stored in a swash; this holds the current one
* of the target string */
SV* cp_list = NULL;
- /* List of multi-character folds that are matched by this node */
- AV* unicode_alternate = NULL;
#ifdef EBCDIC
/* In a range, counts how many 0-2 of the ends of it came from literals,
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
/* Assume we are going to generate an ANYOF node. */
ret = reganode(pRExC_state, ANYOF, 0);
-
if (!SIZE_ONLY) {
ANYOF_FLAGS(ret) = 0;
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
- RExC_naughty++;
RExC_parse++;
invert = TRUE;
-
- /* We have decided to not allow multi-char folds in inverted character
- * classes, due to the confusion that can happen, especially with
- * classes that are designed for a non-Unicode world: You have the
- * peculiar case that:
- "s s" =~ /^[^\xDF]+$/i => Y
- "ss" =~ /^[^\xDF]+$/i => N
- *
- * See [perl #89750] */
- allow_full_fold = FALSE;
+ RExC_naughty++;
}
if (SIZE_ONLY) {
charclassloop:
namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+ save_value = value;
+ save_prevvalue = prevvalue;
if (!range) {
rangebegin = RExC_parse;
* A similar issue a little bit later when switching on
* namedclass. --jhi */
switch ((I32)value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
+ case 'w': namedclass = ANYOF_WORDCHAR; break;
+ case 'W': namedclass = ANYOF_NWORDCHAR; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. */
- UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+ if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
+ TRUE /* => charclass */))
+ {
goto parseit;
}
- value= v;
}
break;
case 'p':
case 'P':
{
char *e;
+
+ /* This routine will handle any undefined properties */
+ U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
+
if (RExC_parse >= RExC_end)
vFAIL2("Empty \\%c{}", (U8)value);
if (*RExC_parse == '{') {
swash = _core_swash_init("utf8", name, &PL_sv_undef,
1, /* binary */
0, /* not tr/// */
- TRUE, /* this routine will handle
- undefined properties */
- NULL, FALSE /* No inversion list */
+ NULL, /* No inversion list */
+ &swash_init_flags
);
- if ( ! swash
- || ! SvROK(swash)
- || ! SvTYPE(SvRV(swash)) == SVt_PVHV
- || ! (invlist = _get_swash_invlist(swash)))
- {
+ if (! swash || ! (invlist = _get_swash_invlist(swash))) {
if (swash) {
SvREFCNT_dec(swash);
swash = NULL;
* the swash is from a user-defined property, then this
* whole character class should be regarded as such */
has_user_defined_property =
- _is_swash_user_defined(swash);
+ (swash_init_flags
+ & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
/* Invert if asking for the complement */
if (value == 'P') {
runtime_posix_matches_above_Unicode);
break;
case ANYOF_ASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
- else {
+ else
+#endif /* Not isascii(); just use the hard-coded definition for it */
_invlist_union(posixes, PL_ASCII, &posixes);
- }
break;
case ANYOF_NASCII:
+#ifdef HAS_ISASCII
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
else {
+#endif
_invlist_union_complement_2nd(posixes,
PL_ASCII, &posixes);
if (DEPENDS_SEMANTICS) {
ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
}
+#ifdef HAS_ISASCII
}
+#endif
break;
case ANYOF_BLANK:
- DO_POSIX(ret, namedclass, posixes,
+ if (hasISBLANK || ! LOC) {
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale: We
+ use the ASCII range and the above-Latin1 range
+ code points */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 blanks */
+ _invlist_intersection(PL_AboveLatin1,
+ PL_XPosixBlank,
+ &scratch_list);
+ /* Add it to the running total of posix classes */
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+ /* Add the ASCII-range blanks to the running total. */
+ _invlist_union(posixes, PL_PosixBlank, &posixes);
+ }
break;
case ANYOF_NBLANK:
- DO_N_POSIX(ret, namedclass, posixes,
- PL_PosixBlank, PL_XPosixBlank);
+ if (hasISBLANK || ! LOC) {
+ DO_N_POSIX(ret, namedclass, posixes,
+ PL_PosixBlank, PL_XPosixBlank);
+ }
+ else { /* There is no isblank() and we are in locale */
+ SV* scratch_list = NULL;
+
+ /* Include all above-Latin1 non-blanks */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
+
+ /* Add them to the running total of posix classes */
+ _invlist_subtract(PL_AboveLatin1, PL_XPosixBlank,
+ &scratch_list);
+ if (! posixes) {
+ posixes = scratch_list;
+ }
+ else {
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
+
+ /* Get the list of all non-ASCII-blanks in Latin 1, and
+ * add them to the running total */
+ _invlist_subtract(PL_Latin1, PL_PosixBlank,
+ &scratch_list);
+ _invlist_union(posixes, scratch_list, &posixes);
+ SvREFCNT_dec(scratch_list);
+ }
break;
case ANYOF_CNTRL:
DO_POSIX(ret, namedclass, posixes,
}
break;
}
- case ANYOF_ALNUM: /* Really is 'Word' */
+ case ANYOF_WORDCHAR:
DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
break;
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
runtime_posix_matches_above_Unicode);
}
if (!SIZE_ONLY) {
cp_list = add_cp_to_invlist(cp_list, '-');
- element_count++;
}
+ element_count++;
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
RExC_uni_semantics = 1;
}
- /* Ready to process either the single value, or the completed range */
- if (!SIZE_ONLY) {
+ /* Ready to process either the single value, or the completed range.
+ * For single-valued non-inverted ranges, we consider the possibility
+ * of multi-char folds. (We made a conscious decision to not do this
+ * for the other cases because it can often lead to non-intuitive
+ * results. For example, you have the peculiar case that:
+ * "s s" =~ /^[^\xDF]+$/i => Y
+ * "ss" =~ /^[^\xDF]+$/i => N
+ *
+ * See [perl #89750] */
+ if (FOLD && ! invert && value == prevvalue) {
+ if (value == LATIN_SMALL_LETTER_SHARP_S
+ || (value > 255 && _invlist_contains_cp(PL_HasMultiCharFold,
+ value)))
+ {
+ /* Here <value> is indeed a multi-char fold. Get what it is */
+
+ U8 foldbuf[UTF8_MAXBYTES_CASE];
+ STRLEN foldlen;
+
+ UV folded = _to_uni_fold_flags(
+ value,
+ foldbuf,
+ &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+
+ /* Here, <folded> should be the first character of the
+ * multi-char fold of <value>, with <foldbuf> containing the
+ * whole thing. But, if this fold is not allowed (because of
+ * the flags), <fold> will be the same as <value>, and should
+ * be processed like any other character, so skip the special
+ * handling */
+ if (folded != value) {
+
+ /* Skip if we are recursed, currently parsing the class
+ * again. Otherwise add this character to the list of
+ * multi-char folds. */
+ if (! RExC_in_multi_char_class) {
+ AV** this_array_ptr;
+ AV* this_array;
+ STRLEN cp_count = utf8_length(foldbuf,
+ foldbuf + foldlen);
+ SV* multi_fold = sv_2mortal(newSVpvn("", 0));
+
+ Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
+
+
+ if (! multi_char_matches) {
+ multi_char_matches = newAV();
+ }
+
+ /* <multi_char_matches> is actually an array of arrays.
+ * There will be one or two top-level elements: [2],
+ * and/or [3]. The [2] element is an array, each
+ * element thereof is a character which folds to two
+ * characters; likewise for [3]. (Unicode guarantees a
+ * maximum of 3 characters in any fold.) When we
+ * rewrite the character class below, we will do so
+ * such that the longest folds are written first, so
+ * that it prefers the longest matching strings first.
+ * This is done even if it turns out that any
+ * quantifier is non-greedy, out of programmer
+ * laziness. Tom Christiansen has agreed that this is
+ * ok. This makes the test for the ligature 'ffi' come
+ * before the test for 'ff' */
+ if (av_exists(multi_char_matches, cp_count)) {
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
+ this_array = *this_array_ptr;
+ }
+ else {
+ this_array = newAV();
+ av_store(multi_char_matches, cp_count,
+ (SV*) this_array);
+ }
+ av_push(this_array, multi_fold);
+ }
+
+ /* This element should not be processed further in this
+ * class */
+ element_count--;
+ value = save_value;
+ prevvalue = save_prevvalue;
+ continue;
+ }
+ }
+ }
+
+ /* Deal with this element of the class */
+ if (! SIZE_ONLY) {
#ifndef EBCDIC
cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
#else
range = 0; /* this range (if it was one) is done now */
} /* End of loop through all the text within the brackets */
+ /* If anything in the class expands to more than one character, we have to
+ * deal with them by building up a substitute parse string, and recursively
+ * calling reg() on it, instead of proceeding */
+ if (multi_char_matches) {
+ SV * substitute_parse = newSVpvn_flags("?:", 2, SVs_TEMP);
+ I32 cp_count;
+ STRLEN len;
+ char *save_end = RExC_end;
+ char *save_parse = RExC_parse;
+ bool first_time = TRUE; /* First multi-char occurrence doesn't get
+ a "|" */
+ I32 reg_flags;
+
+ assert(! invert);
+#if 0 /* Have decided not to deal with multi-char folds in inverted classes,
+ because too confusing */
+ if (invert) {
+ sv_catpv(substitute_parse, "(?:");
+ }
+#endif
+
+ /* Look at the longest folds first */
+ for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
+
+ if (av_exists(multi_char_matches, cp_count)) {
+ AV** this_array_ptr;
+ SV* this_sequence;
+
+ this_array_ptr = (AV**) av_fetch(multi_char_matches,
+ cp_count, FALSE);
+ while ((this_sequence = av_pop(*this_array_ptr)) !=
+ &PL_sv_undef)
+ {
+ if (! first_time) {
+ sv_catpv(substitute_parse, "|");
+ }
+ first_time = FALSE;
+
+ sv_catpv(substitute_parse, SvPVX(this_sequence));
+ }
+ }
+ }
+
+ /* If the character class contains anything else besides these
+ * multi-character folds, have to include it in recursive parsing */
+ if (element_count) {
+ sv_catpv(substitute_parse, "|[");
+ sv_catpvn(substitute_parse, orig_parse, RExC_parse - orig_parse);
+ sv_catpv(substitute_parse, "]");
+ }
+
+ sv_catpv(substitute_parse, ")");
+#if 0
+ if (invert) {
+ /* This is a way to get the parse to skip forward a whole named
+ * sequence instead of matching the 2nd character when it fails the
+ * first */
+ sv_catpv(substitute_parse, "(*THEN)(*SKIP)(*FAIL)|.)");
+ }
+#endif
+
+ RExC_parse = SvPV(substitute_parse, len);
+ RExC_end = RExC_parse + len;
+ RExC_in_multi_char_class = 1;
+ RExC_emit = (regnode *)orig_emit;
+
+ ret = reg(pRExC_state, 1, ®_flags, depth+1);
+
+ *flagp |= reg_flags&(HASWIDTH|SIMPLE|SPSTART|POSTPONED);
+
+ RExC_parse = save_parse;
+ RExC_end = save_end;
+ RExC_in_multi_char_class = 0;
+ SvREFCNT_dec(multi_char_matches);
+ return ret;
+ }
+
/* If the character class contains only a single element, it may be
* optimizable into another node type which is smaller and runs faster.
* Check if this is the case for this class */
* modifier to the regex. We first calculate the base node
* type, and if it should be inverted */
- case ANYOF_NALNUM:
+ case ANYOF_NWORDCHAR:
invert = ! invert;
/* FALLTHROUGH */
- case ANYOF_ALNUM:
+ case ANYOF_WORDCHAR:
op = ALNUM;
goto join_charset_classes;
if (invert) {
op += NALNUM - ALNUM;
}
+ *flagp |= HASWIDTH|SIMPLE;
break;
/* The second group doesn't depend of the charset modifiers.
case ANYOF_HORIZWS:
is_horizws:
op = (invert) ? NHORIZWS : HORIZWS;
+ *flagp |= HASWIDTH|SIMPLE;
break;
case ANYOF_NVERTWS:
/* FALLTHROUGH */
case ANYOF_VERTWS:
op = (invert) ? NVERTWS : VERTWS;
+ *flagp |= HASWIDTH|SIMPLE;
break;
case ANYOF_MAX:
if (invert) {
if (! LOC && value == '\n') {
op = REG_ANY; /* Optimize [^\n] */
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
}
else if (value < 256 || UTF) {
if (prevvalue == '0') {
if (value == '9') {
op = (invert) ? NDIGITA : DIGITA;
+ *flagp |= HASWIDTH|SIMPLE;
}
}
}
if (! SIZE_ONLY) {
FLAGS(ret) = arg;
}
+ *flagp |= HASWIDTH|SIMPLE;
}
else if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
}
RExC_parse = (char *) cur_parse;
if (SIZE_ONLY)
return ret;
- /****** !SIZE_ONLY AFTER HERE *********/
+ /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
/* If folding, we calculate all characters that could fold to or from the
* ones already on the list */
SV* fold_intersection = NULL;
- /* In the Latin1 range, the characters that can be folded-to or -from
- * are precisely the alphabetic characters. If the highest code point
- * is within Latin1, we can use the compiled-in list, and not have to
- * go out to disk. */
+ /* If the highest code point is within Latin1, we can use the
+ * compiled-in Alphas list, and not have to go out to disk. This
+ * yields two false positives, the masculine and feminine oridinal
+ * indicators, which are weeded out below using the
+ * IS_IN_SOME_FOLD_L1() macro */
if (invlist_highest(cp_list) < 256) {
_invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
}
* rules hard-coded into Perl. (This case happens legitimately
* during compilation of Perl itself before the Unicode tables
* are generated) */
- if (invlist_len(PL_utf8_foldable) == 0) {
+ if (_invlist_len(PL_utf8_foldable) == 0) {
PL_utf8_foldclosures = newHV();
}
else {
* to force that */
if (! PL_utf8_tofold) {
U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
- /* This particular string is above \xff in both UTF-8
- * and UTFEBCDIC */
- to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+ /* This string is just a short named one above \xff */
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
assert(PL_utf8_tofold); /* Verify that worked */
}
PL_utf8_foldclosures =
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- UV f;
+ SV** listp;
if (j < 256) {
* mappings, though that is not really likely, and may be
* caught by the default: case of the switch below. */
- if (PL_fold_latin1[j] != j) {
+ if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched only
* under Unicode rules */
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
/* Certain Latin1 characters have matches outside
- * Latin1, or are multi-character. To get here, 'j' is
- * one of those characters. None of these matches is
- * valid for ASCII characters under /aa, which is why
- * the 'if' just above excludes those. The matches
- * fall into three categories:
- * 1) They are singly folded-to or -from an above 255
- * character, e.g., LATIN SMALL LETTER Y WITH
- * DIAERESIS and LATIN CAPITAL LETTER Y WITH
- * DIAERESIS;
- * 2) They are part of a multi-char fold with another
- * latin1 character; only LATIN SMALL LETTER
- * SHARP S => "ss" fits this;
- * 3) They are part of a multi-char fold with a
- * character outside of Latin1, such as various
- * ligatures.
- * We aren't dealing fully with multi-char folds, except
- * we do deal with the pattern containing a character
- * that has a multi-char fold (not so much the inverse).
- * For types 1) and 3), the matches only happen when the
- * target string is utf8; that's not true for 2), and we
- * set a flag for it.
- *
- * The code below adds the single fold closures for 'j'
- * to the inversion list. */
+ * Latin1. To get here, <j> is one of those
+ * characters. None of these matches is valid for
+ * ASCII characters under /aa, which is why the 'if'
+ * just above excludes those. These matches only
+ * happen when the target string is utf8. The code
+ * below adds the single fold closures for <j> to the
+ * inversion list. */
switch (j) {
case 'k':
case 'K':
case LATIN_SMALL_LETTER_SHARP_S:
cp_list = add_cp_to_invlist(cp_list,
LATIN_CAPITAL_LETTER_SHARP_S);
-
- /* Under /a, /d, and /u, this can match the two
- * chars "ss" */
- if (! ASCII_FOLD_RESTRICTED) {
- add_alternate(&unicode_alternate,
- (U8 *) "ss", 2);
-
- /* And under /u or /a, it can match even if
- * the target is not utf8 */
- if (AT_LEAST_UNI_SEMANTICS) {
- ANYOF_FLAGS(ret) |=
- ANYOF_NONBITMAP_NON_UTF8;
- }
- }
break;
case 'F': case 'f':
case 'I': case 'i':
* express, so they can't match unless the
* target string is in UTF-8, so no action here
* is necessary, as regexec.c properly handles
- * the general case for UTF-8 matching */
+ * the general case for UTF-8 matching and
+ * multi-char folds */
break;
default:
/* Use deprecated warning to increase the
}
/* Here is an above Latin1 character. We don't have the rules
- * hard-coded for it. First, get its fold */
- f = _to_uni_fold_flags(j, foldbuf, &foldlen,
- ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
- | ((LOC)
- ? FOLD_FLAGS_LOCALE
- : (ASCII_FOLD_RESTRICTED)
- ? FOLD_FLAGS_NOMIX_ASCII
- : 0));
-
- if (foldlen > (STRLEN)UNISKIP(f)) {
-
- /* Any multicharacter foldings (disallowed in lookbehind
- * patterns) require the following transform: [ABCDEF] ->
- * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
- * folds into "rst", all other characters fold to single
- * characters. We save away these multicharacter foldings,
- * to be later saved as part of the additional "s" data. */
- if (! RExC_in_lookbehind) {
- U8* loc = foldbuf;
- U8* e = foldbuf + foldlen;
-
- /* If any of the folded characters of this are in the
- * Latin1 range, tell the regex engine that this can
- * match a non-utf8 target string. */
- while (loc < e) {
- if (UTF8_IS_INVARIANT(*loc)
- || UTF8_IS_DOWNGRADEABLE_START(*loc))
- {
- ANYOF_FLAGS(ret)
- |= ANYOF_NONBITMAP_NON_UTF8;
- break;
- }
- loc += UTF8SKIP(loc);
+ * hard-coded for it. First, get its fold. This is the simple
+ * fold, as the multi-character folds have been handled earlier
+ * and separated out */
+ _to_uni_fold_flags(j, foldbuf, &foldlen,
+ ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
+
+ /* Single character fold of above Latin1. Add everything in
+ * its fold closure to the list that this node should match.
+ * The fold closures data structure is a hash with the keys
+ * being the UTF-8 of every character that is folded to, like
+ * 'k', and the values each an array of all code points that
+ * fold to its key. e.g. [ 'k', 'K', KELVIN_SIGN ].
+ * Multi-character folds are not included */
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) foldbuf, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_len(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ if (c_p == NULL) {
+ Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
}
+ c = SvUV(*c_p);
- add_alternate(&unicode_alternate, foldbuf, foldlen);
- }
- }
- else {
- /* Single character fold of above Latin1. Add everything
- * in its fold closure to the list that this node should
- * match */
- SV** listp;
-
- /* The fold closures data structure is a hash with the keys
- * being every character that is folded to, like 'k', and
- * the values each an array of everything that folds to its
- * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
- if ((listp = hv_fetch(PL_utf8_foldclosures,
- (char *) foldbuf, foldlen, FALSE)))
- {
- AV* list = (AV*) *listp;
- IV k;
- for (k = 0; k <= av_len(list); k++) {
- SV** c_p = av_fetch(list, k, FALSE);
- UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
- c = SvUV(*c_p);
-
- /* /aa doesn't allow folds between ASCII and non-;
- * /l doesn't allow them between above and below
- * 256 */
- if ((ASCII_FOLD_RESTRICTED
- && (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
- {
- continue;
- }
+ /* /aa doesn't allow folds between ASCII and non-; /l
+ * doesn't allow them between above and below 256 */
+ if ((ASCII_FOLD_RESTRICTED
+ && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
+ {
+ continue;
+ }
- /* Folds involving non-ascii Latin1 characters
- * under /d are added to a separate list */
- if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
- {
- cp_list = add_cp_to_invlist(cp_list, c);
- }
- else {
- depends_list = add_cp_to_invlist(depends_list, c);
- }
- }
- }
- }
+ /* Folds involving non-ascii Latin1 characters
+ * under /d are added to a separate list */
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
+ cp_list = add_cp_to_invlist(cp_list, c);
+ }
+ else {
+ depends_list = add_cp_to_invlist(depends_list, c);
+ }
+ }
+ }
}
}
SvREFCNT_dec(fold_intersection);
* folded until runtime */
/* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
- * at compile time. Besides not inverting folded locale now, we can't invert
- * if there are things such as \w, which aren't known until runtime */
+ * at compile time. Besides not inverting folded locale now, we can't
+ * invert if there are things such as \w, which aren't known until runtime
+ * */
if (invert
&& ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
&& ! depends_list
- && ! unicode_alternate
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
_invlist_invert(cp_list);
* until runtime; set the run-time fold flag for these. (We don't have to
* worry about properties folding, as that is taken care of by the swash
* fetching) */
- if (FOLD && (LOC || unicode_alternate))
+ if (FOLD && LOC)
{
- ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
/* Some character classes are equivalent to other nodes. Such nodes take
* node types they could possibly match using _invlistEQ(). */
if (cp_list
- && ! unicode_alternate
&& ! invert
&& ! depends_list
&& ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
* it doesn't match anything. (perluniprops.pod notes such
* properties) */
op = OPFAIL;
+ *flagp |= HASWIDTH|SIMPLE;
}
else if (start == end) { /* The range is a single code point */
if (! invlist_iternext(cp_list, &start, &end)
* EXACTFish node that any such are likely to be. We can
* do this iff the code point doesn't participate in any
* folds. For example, an EXACTF of a colon is the same as
- * an EXACT one, since nothing folds to or from a colon.
- * In the Latin1 range, being an alpha means that the
- * character participates in a fold (except for the
- * feminine and masculine ordinals, which I (khw) don't
- * think are worrying about optimizing for). */
+ * an EXACT one, since nothing folds to or from a colon. */
if (value < 256) {
- if (isALPHA_L1(value)) {
+ if (IS_IN_SOME_FOLD_L1(value)) {
op = EXACT;
}
}
else if (start == 0) {
if (end == UV_MAX) {
op = SANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
else if (end == '\n' - 1
&& invlist_iternext(cp_list, &start, &end)
&& start == '\n' + 1 && end == UV_MAX)
{
op = REG_ANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
}
}
RExC_parse = (char *)cur_parse;
if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
}
SvREFCNT_dec(listsv);
}
/* If have completely emptied it, remove it completely */
- if (invlist_len(cp_list) == 0) {
+ if (_invlist_len(cp_list) == 0) {
SvREFCNT_dec(cp_list);
cp_list = NULL;
}
}
if (! cp_list
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
- && ! unicode_alternate)
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
SvREFCNT_dec(listsv);
- SvREFCNT_dec(unicode_alternate);
}
else {
/* av[0] stores the character class description in its textual form:
* av[1] if NULL, is a placeholder to later contain the swash computed
* from av[0]. But if no further computation need be done, the
* swash is stored there now.
- * av[2] stores the multicharacter foldings, used later in
- * regexec.c:S_reginclass().
- * av[3] stores the cp_list inversion list for use in addition or
+ * av[2] stores the cp_list inversion list for use in addition or
* instead of av[0]; used only if av[1] is NULL
- * av[4] is set if any component of the class is from a user-defined
+ * av[3] is set if any component of the class is from a user-defined
* property; used only if av[1] is NULL */
AV * const av = newAV();
SV *rv;
else {
av_store(av, 1, NULL);
if (cp_list) {
- av_store(av, 3, cp_list);
- av_store(av, 4, newSVuv(has_user_defined_property));
+ av_store(av, 2, cp_list);
+ av_store(av, 3, newSVuv(has_user_defined_property));
}
}
- /* Store any computed multi-char folds only if we are allowing
- * them */
- if (allow_full_fold) {
- av_store(av, 2, MUTABLE_SV(unicode_alternate));
- if (unicode_alternate) { /* This node is variable length */
- OP(ret) = ANYOFV;
- }
- }
- else {
- av_store(av, 2, NULL);
- }
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, 1, "s");
RExC_rxi->data->data[n] = (void*)rv;
ARG_SET(ret, n);
}
+
+ *flagp |= HASWIDTH|SIMPLE;
return ret;
}
#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
dVAR;
- register regnode *src;
- register regnode *dst;
- register regnode *place;
+ regnode *src;
+ regnode *dst;
+ regnode *place;
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGTAIL;
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
I32 min = 0;
{
#ifdef DEBUGGING
dVAR;
- register int k;
+ int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
static const char * const anyofs[] = {
if (flags & ANYOF_LOCALE)
sv_catpvs(sv, "{loc}");
- if (flags & ANYOF_LOC_NONBITMAP_FOLD)
+ if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
if (flags & ANYOF_INVERT)
if (ANYOF_NONBITMAP(o)) {
SV *lv; /* Set if there is something outside the bit map */
- SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
+ SV * const sw = regclass_swash(prog, o, FALSE, &lv, NULL);
bool byte_output = FALSE; /* If something in the bitmap has been
output */
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
dVAR;
- struct regexp *const prog = (struct regexp *)SvANY(r);
+ struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_RE_INTUIT_STRING;
Perl_pregfree2(pTHX_ REGEXP *rx)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_PREGFREE2;
} else {
CALLREGFREE_PVT(rx); /* free the private data */
SvREFCNT_dec(RXp_PAREN_NAMES(r));
+ Safefree(r->xpv_len_u.xpvlenu_pv);
}
if (r->substrs) {
SvREFCNT_dec(r->anchored_substr);
#endif
Safefree(r->offs);
SvREFCNT_dec(r->qr_anoncv);
+ rx->sv_u.svu_rx = 0;
}
/* reg_temp_copy()
Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
{
struct regexp *ret;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
+ const bool islv = ret_x && SvTYPE(ret_x) == SVt_PVLV;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
if (!ret_x)
ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
- ret = (struct regexp *)SvANY(ret_x);
+ else {
+ SvOK_off((SV *)ret_x);
+ if (islv) {
+ /* For PVLVs, SvANY points to the xpvlv body while sv_u points
+ to the regexp. (For SVt_REGEXPs, sv_upgrade has already
+ made both spots point to the same regexp body.) */
+ REGEXP *temp = (REGEXP *)newSV_type(SVt_REGEXP);
+ assert(!SvPVX(ret_x));
+ ret_x->sv_u.svu_rx = temp->sv_any;
+ temp->sv_any = NULL;
+ SvFLAGS(temp) = (SvFLAGS(temp) & ~SVTYPEMASK) | SVt_NULL;
+ SvREFCNT_dec(temp);
+ /* SvCUR still resides in the xpvlv struct, so the regexp copy-
+ ing below will not set it. */
+ SvCUR_set(ret_x, SvCUR(rx));
+ }
+ }
+ /* This ensures that SvTHINKFIRST(sv) is true, and hence that
+ sv_force_normal(sv) is called. */
+ SvFAKE_on(ret_x);
+ ret = ReANY(ret_x);
- (void)ReREFCNT_inc(rx);
- /* We can take advantage of the existing "copied buffer" mechanism in SVs
- by pointing directly at the buffer, but flagging that the allocated
- space in the copy is zero. As we've just done a struct copy, it's now
- a case of zero-ing that, rather than copying the current length. */
- SvPV_set(ret_x, RX_WRAPPED(rx));
- SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
+ SvFLAGS(ret_x) |= SvUTF8(rx);
+ /* We share the same string buffer as the original regexp, on which we
+ hold a reference count, incremented when mother_re is set below.
+ The string pointer is copied here, being part of the regexp struct.
+ */
memcpy(&(ret->xpv_cur), &(r->xpv_cur),
sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
- SvLEN_set(ret_x, 0);
- SvSTASH_set(ret_x, NULL);
- SvMAGIC_set(ret_x, NULL);
if (r->offs) {
const I32 npar = r->nparens+1;
Newx(ret->offs, npar, regexp_paren_pair);
#ifdef PERL_OLD_COPY_ON_WRITE
ret->saved_copy = NULL;
#endif
- ret->mother_re = rx;
+ ret->mother_re = ReREFCNT_inc(r->mother_re ? r->mother_re : rx);
SvREFCNT_inc_void(ret->qr_anoncv);
return ret_x;
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
{
dVAR;
I32 npar;
- const struct regexp *r = (const struct regexp *)SvANY(sstr);
- struct regexp *ret = (struct regexp *)SvANY(dstr);
+ const struct regexp *r = ReANY(sstr);
+ struct regexp *ret = ReANY(dstr);
PERL_ARGS_ASSERT_RE_DUP_GUTS;
ret->saved_copy = NULL;
#endif
- if (ret->mother_re) {
- if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
- /* Our storage points directly to our mother regexp, but that's
+ /* Whether mother_re be set or no, we need to copy the string. We
+ cannot refrain from copying it when the storage points directly to
+ our mother regexp, because that's
1: a buffer in a different thread
2: something we no longer hold a reference on
so we need to copy it locally. */
- /* Note we need to use SvCUR(), rather than
- SvLEN(), on our mother_re, because it, in
- turn, may well be pointing to its own mother_re. */
- SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
- SvCUR(ret->mother_re)+1));
- SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
- }
- ret->mother_re = NULL;
- }
+ RX_WRAPPED(dstr) = SAVEPVN(RX_WRAPPED(sstr), SvCUR(sstr)+1);
+ ret->mother_re = NULL;
ret->gofs = 0;
}
#endif /* PERL_IN_XSUB_RE */
Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
{
dVAR;
- struct regexp *const r = (struct regexp *)SvANY(rx);
+ struct regexp *const r = ReANY(rx);
regexp_internal *reti;
int len;
RXi_GET_DECL(r,ri);
Perl_regnext(pTHX_ register regnode *p)
{
dVAR;
- register I32 offset;
+ I32 offset;
if (!p)
return(NULL);
PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
+ PL_reg_oldsavedoffset = 0;
+ PL_reg_oldsavedcoffset = 0;
PL_reg_maxiter = 0;
PL_reg_leftiter = 0;
PL_reg_poscache = NULL;
SV* sv, I32 indent, U32 depth)
{
dVAR;
- register U8 op = PSEUDO; /* Arbitrary non-END op. */
- register const regnode *next;
+ U8 op = PSEUDO; /* Arbitrary non-END op. */
+ const regnode *next;
const regnode *optstart= NULL;
RXi_GET_DECL(r,ri);
if (PL_regkind[(U8)op] == BRANCHJ) {
assert(next);
{
- register const regnode *nnode = (OP(next) == LONGJMP
- ? regnext((regnode *)next)
- : next);
+ const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
if (last && nnode > last)
nnode = last;
DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);