#endif
#include "dquote_static.c"
+#ifndef PERL_IN_XSUB_RE
+# include "charclass_invlists.h"
+#endif
+
+#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#ifdef op
#undef op
#define STATIC static
#endif
+
typedef struct RExC_state_t {
- U32 flags; /* are we folding, multilining? */
+ U32 flags; /* RXf_* are we folding, multilining? */
+ U32 pm_flags; /* PMf_* stuff from the calling PMOP */
char *precomp; /* uncompiled string. */
REGEXP *rx_sv; /* The SV that is the regexp. */
regexp *rx; /* perl core regexp structure */
I32 nestroot; /* root parens we are in - used by accept */
I32 extralen;
I32 seen_zerolen;
- I32 seen_evals;
regnode **open_parens; /* pointers to open parens */
regnode **close_parens; /* pointers to close parens */
regnode *opend; /* END node in program */
I32 in_lookbehind;
I32 contains_locale;
I32 override_recoding;
+ struct reg_code_block *code_blocks; /* positions of literal (?{})
+ within pattern */
+ int num_code_blocks; /* size of code_blocks[] */
+ int code_index; /* next code_blocks[] slot */
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#endif
+ SV *runtime_code_qr; /* qr with the runtime code blocks */
#ifdef DEBUGGING
const char *lastparse;
I32 lastnum;
} RExC_state_t;
#define RExC_flags (pRExC_state->flags)
+#define RExC_pm_flags (pRExC_state->pm_flags)
#define RExC_precomp (pRExC_state->precomp)
#define RExC_rx_sv (pRExC_state->rx_sv)
#define RExC_rx (pRExC_state->rx)
#define RExC_nestroot (pRExC_state->nestroot)
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
-#define RExC_seen_evals (pRExC_state->seen_evals)
#define RExC_utf8 (pRExC_state->utf8)
#define RExC_uni_semantics (pRExC_state->uni_semantics)
#define RExC_orig_utf8 (pRExC_state->orig_utf8)
#define HASWIDTH 0x01 /* Known to match non-null strings. */
/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
- * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
+ * character, and if utf8, must be invariant. Note that this is not the same
+ * thing as REGNODE_SIMPLE */
#define SIMPLE 0x02
#define SPSTART 0x04 /* Starts with * or +. */
#define TRYAGAIN 0x08 /* Weeded out a declaration. */
#define SCF_SEEN_ACCEPT 0x8000
#define UTF cBOOL(RExC_utf8)
+
+/* The enums for all these are ordered so things work out correctly */
#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
-#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
+#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
*(d++) = uv;
*/
-#define TRIE_STORE_REVCHAR \
+#define TRIE_STORE_REVCHAR(val) \
STMT_START { \
if (UTF) { \
- SV *zlopp = newSV(2); \
+ SV *zlopp = newSV(7); /* XXX: optimize me */ \
unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
- unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
+ unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, val); \
SvCUR_set(zlopp, kapow - flrbbbbb); \
SvPOK_on(zlopp); \
SvUTF8_on(zlopp); \
av_push(revcharmap, zlopp); \
} else { \
- char ooooff = (char)uvc; \
+ char ooooff = (char)val; \
av_push(revcharmap, newSVpvn(&ooooff, 1)); \
} \
} STMT_END
-#define TRIE_READ_CHAR STMT_START { \
- wordlen++; \
- if ( UTF ) { \
- if ( folder ) { \
- if ( foldlen > 0 ) { \
- uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
- foldlen -= len; \
- scan += len; \
- len = 0; \
- } else { \
- len = UTF8SKIP(uc);\
- uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
- foldlen -= UNISKIP( uvc ); \
- scan = foldbuf + UNISKIP( uvc ); \
- } \
- } else { \
- uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
- } \
- } else { \
- uvc = (U32)*uc; \
- len = 1; \
- } \
+#define TRIE_READ_CHAR STMT_START { \
+ wordlen++; \
+ if ( UTF ) { \
+ /* if it is UTF then it is either already folded, or does not need folding */ \
+ uvc = utf8n_to_uvuni( (const U8*) uc, UTF8_MAXLEN, &len, uniflags); \
+ } \
+ else if (folder == PL_fold_latin1) { \
+ /* if we use this folder we have to obey unicode rules on latin-1 data */ \
+ if ( foldlen > 0 ) { \
+ uvc = utf8n_to_uvuni( (const U8*) scan, UTF8_MAXLEN, &len, uniflags ); \
+ foldlen -= len; \
+ scan += len; \
+ len = 0; \
+ } else { \
+ len = 1; \
+ uvc = _to_fold_latin1( (U8) *uc, foldbuf, &foldlen, 1); \
+ skiplen = UNISKIP(uvc); \
+ foldlen -= skiplen; \
+ scan = foldbuf + skiplen; \
+ } \
+ } else { \
+ /* raw data, will be folded later if needed */ \
+ uvc = (U32)*uc; \
+ len = 1; \
+ } \
} STMT_END
switch (flags) {
case EXACT: break;
case EXACTFA:
+ case EXACTFU_SS:
+ case EXACTFU_TRICKYFOLD:
case EXACTFU: folder = PL_fold_latin1; break;
case EXACTF: folder = PL_fold; break;
case EXACTFL: folder = PL_fold_locale; break;
- default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
+ default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
}
trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
trie->wordcount = word_count;
RExC_rxi->data->data[ data_slot ] = (void*)trie;
trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
- if (!(UTF && folder))
+ if (flags == EXACT)
trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
trie->wordcount+1, sizeof(reg_trie_wordinfo));
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
PerlIO_printf( Perl_debug_log,
"%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
(int)depth * 2 + 2, "",
*/
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
STRLEN foldlen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
const U8 *scan = (U8*)NULL;
U32 wordlen = 0; /* required init */
STRLEN chars = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
if (OP(noper) == NOTHING) {
- trie->minlen= 0;
- continue;
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ trie->minlen= STR_LEN(noper);
+ } else {
+ trie->minlen= 0;
+ continue;
+ }
}
- if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
+
+ if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
regardless of encoding */
-
+ if (OP( noper ) == EXACTFU_SS) {
+ /* false positives are ok, so just set this */
+ TRIE_BITMAP_SET(trie,0xDF);
+ }
+ }
for ( ; uc < e ; uc += len ) {
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
chars++;
if ( uvc < 256 ) {
+ if ( folder ) {
+ U8 folded= folder[ (U8) uvc ];
+ if ( !trie->charmap[ folded ] ) {
+ trie->charmap[ folded ]=( ++trie->uniquecharcount );
+ TRIE_STORE_REVCHAR( folded );
+ }
+ }
if ( !trie->charmap[ uvc ] ) {
trie->charmap[ uvc ]=( ++trie->uniquecharcount );
- if ( folder )
- trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR( uvc );
}
if ( set_bit ) {
/* store the codepoint in the bitmap, and its folded
* equivalent. */
- TRIE_BITMAP_SET(trie,uvc);
+ TRIE_BITMAP_SET(trie, uvc);
/* store the folded codepoint */
- if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
+ if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
if ( !UTF ) {
/* store first byte of utf8 representation of
if ( !SvTRUE( *svpp ) ) {
sv_setiv( *svpp, ++trie->uniquecharcount );
- TRIE_STORE_REVCHAR;
+ TRIE_STORE_REVCHAR(uvc);
}
}
}
if( cur == first ) {
- trie->minlen=chars;
- trie->maxlen=chars;
+ trie->minlen = chars;
+ trie->maxlen = chars;
} else if (chars < trie->minlen) {
- trie->minlen=chars;
+ trie->minlen = chars;
} else if (chars > trie->maxlen) {
- trie->maxlen=chars;
+ trie->maxlen = chars;
+ }
+ if (OP( noper ) == EXACTFU_SS) {
+ /* XXX: workaround - 'ss' could match "\x{DF}" so minlen could be 1 and not 2*/
+ if (trie->minlen > 1)
+ trie->minlen= 1;
+ }
+ if (OP( noper ) == EXACTFU_TRICKYFOLD) {
+ /* XXX: workround - things like "\x{1FBE}\x{0308}\x{0301}" can match "\x{0390}"
+ * - We assume that any such sequence might match a 2 byte string */
+ if (trie->minlen > 2 )
+ trie->minlen= 2;
}
} /* end first pass */
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
U16 charid = 0; /* sanity init */
U8 *scan = (U8*)NULL; /* sanity init */
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ STRLEN skiplen = 0;
+
+ if (OP(noper) == NOTHING) {
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ }
+ }
if (OP(noper) != NOTHING) {
for ( ; uc < e ; uc += len ) {
for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
- regnode * const noper = NEXTOPER( cur );
+ regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
- const U8 * const e = uc + STR_LEN( noper );
+ const U8 *e = uc + STR_LEN( noper );
U32 state = 1; /* required init */
STRLEN foldlen = 0; /* required init */
U32 wordlen = 0; /* required init */
+ STRLEN skiplen = 0;
U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
+ if (OP(noper) == NOTHING) {
+ regnode *noper_next= regnext(noper);
+ if (noper_next != tail && OP(noper_next) == flags) {
+ noper = noper_next;
+ uc= (U8*)STRING(noper);
+ e= uc + STR_LEN(noper);
+ }
+ }
+
if ( OP(noper) != NOTHING ) {
for ( ; uc < e ; uc += len ) {
* The adjacent nodes actually may be separated by NOTHING kind nodes, and
* these get optimized out
*
- * If there are problematic code sequences, *min_change is set to the delta
- * that the minimum size of the node can be off from its actual size. And, the
- * node type of the result is changed to reflect that it contains these
+ * If there are problematic code sequences, *min_subtract is set to the delta
+ * that the minimum size of the node can be less than its actual size. And,
+ * the node type of the result is changed to reflect that it contains these
* sequences.
*
+ * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
+ * and contains LATIN SMALL LETTER SHARP S
+ *
* This is as good a place as any to discuss the design of handling these
* problematic sequences. It's been wrong in Perl for a very long time. There
* are three code points in Unicode whose folded lengths differ so much from
* construction. Why only these are problematic, and not others where lengths
* also differ is something I (khw) do not understand. New versions of Unicode
* might add more such code points. Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by veriying that each size-combination
+ * figures out what to test (in part by verifying that each size-combination
* gets tested) will catch any that do come along, so they can be added to the
- * special handling below. The chances of this are actually rather small, as
- * most, if not all, of the world's scripts that have casefolding have already
- * been encoded by Unicode. Also, a number of Unicode's decisions were made to
- * allow compatibility with pre-existing standards, and almost all of those
- * have already been dealt with. These would otherwise be the most likely
- * candidates for generating further tricky sequences. In other words, Unicode
- * by itself is unlikely to add new ones unless it is for compatibility with
- * pre-existing standards.
+ * special handling below. The chances of new ones are actually rather small,
+ * as most, if not all, of the world's scripts that have casefolding have
+ * already been encoded by Unicode. Also, a number of Unicode's decisions were
+ * made to allow compatibility with pre-existing standards, and almost all of
+ * those have already been dealt with. These would otherwise be the most
+ * likely candidates for generating further tricky sequences. In other words,
+ * Unicode by itself is unlikely to add new ones unless it is for compatibility
+ * with pre-existing standards, and there aren't many of those left.
*
* The previous designs for dealing with these involved assigning a special
* node for them. This approach doesn't work, as evidenced by this example:
- * "\xDFs" =~ /s\xDF/ui
+ * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
* Both these fold to "sss", but if the pattern is parsed to create a node of
* that would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
* There are a number of components to the approach (a lot of work for just
* three code points!):
* 1) This routine examines each EXACTFish node that could contain the
- * problematic sequences, and if found, returns in *min_change the total
- * delta between the actual length of the string and one that could match
- * it. This delta is used by the caller to adjust the min length of the
- * match, and the delta between min and max, so that the optimizer doesn't
- * reject these possibilities based on size constraints
+ * problematic sequences. It returns in *min_subtract how much to
+ * subtract from the the actual length of the string to get a real minimum
+ * for one that could match it. This number is usually 0 except for the
+ * problematic sequences. This delta is used by the caller to adjust the
+ * min length of the match, and the delta between min and max, so that the
+ * optimizer doesn't reject these possibilities based on size constraints.
* 2) These sequences are not currently correctly handled by the trie code
* either, so it changes the joined node type to ops that are not handled
- * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
+ * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
* is pre-folded by regcomp.c. This saves effort in regex matching.
* However, probably mostly for historical reasons, the pre-folding isn't
- * done for non-UTF8 patterns. The fold possibilities for these are quite
- * simple, except for the sharp s. All the ones that don't involve a
- * UTF-8 target string are members of a fold-pair, and arrays are set up
- * for all of them that quickly find the other member of the pair. It
- * might actually be faster to pre-fold these, but it isn't currently
- * done, except for the sharp s. Code elsewhere in this file makes sure
- * that it gets folded to 'ss', even if the pattern isn't UTF-8. This
- * avoids the issues describe in the next item.
+ * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
+ * nodes, as what they fold to isn't known until runtime.) The fold
+ * possibilities for the non-UTF8 patterns are quite simple, except for
+ * the sharp s. All the ones that don't involve a UTF-8 target string
+ * are members of a fold-pair, and arrays are set up for all of them
+ * that quickly find the other member of the pair. It might actually
+ * be faster to pre-fold these, but it isn't currently done, except for
+ * the sharp s. Code elsewhere in this file makes sure that it gets
+ * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
+ * issues described in the next item.
* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
* cases are either 1-1 folds when no UTF-8 is involved; or is true by
* virtue of having this file pre-fold UTF-8 patterns. I'm
* reluctant to try to change this assumption, so instead the code punts.
- * Elsewhere in this file, each EXACTF node is examined for the sharp s.
- * If found, a flag is set that later causes the optimizer in this file to
- * not set values for the floating and fixed string lengths, and thus
- * avoid the optimizer code in regexec.c that makes this invalid
- * assumption. Thus, there is no optimization based on string lengths for
- * EXACTF nodes that contain the sharp s. This only happens for /id rules
- * (which means the pattern isn't in UTF-8).
+ * This routine examines EXACTF nodes for the sharp s, and returns a
+ * boolean indicating whether or not the node is an EXACTF node that
+ * contains a sharp s. When it is true, the caller sets a flag that later
+ * causes the optimizer in this file to not set values for the floating
+ * and fixed string lengths, and thus avoids the optimizer code in
+ * regexec.c that makes the invalid assumption. Thus, there is no
+ * optimization based on string lengths for EXACTF nodes that contain the
+ * sharp s. This only happens for /id rules (which means the pattern
+ * isn't in UTF-8).
*/
-#define JOIN_EXACT(scan,min_change,flags) \
+#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
if (PL_regkind[OP(scan)] == EXACT) \
- join_exact(pRExC_state,(scan),(min_change),(flags),NULL,depth+1)
+ join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
STATIC U32
-S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, IV *min_change, U32 flags,regnode *val, U32 depth) {
+S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
/* Merge several consecutive EXACTish nodes into one. */
regnode *n = regnext(scan);
U32 stringok = 1;
}
#endif
}
-#define GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS 0x0390
-#define IOTA_D_T GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA_AND_TONOS
-#define GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS 0x03B0
-#define UPSILON_D_T GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS
- *min_change = 0;
+ *min_subtract = 0;
+ *has_exactf_sharp_s = FALSE;
/* Here, all the adjacent mergeable EXACTish nodes have been merged. We
* can now analyze for sequences of problematic code points. (Prior to
* this final joining, sequences could have been split over boundaries, and
- * hence missed). The sequences only happen in folding */
+ * hence missed). The sequences only happen in folding, hence for any
+ * non-EXACT EXACTish node */
if (OP(scan) != EXACT) {
- char *s, *t;
- char * s0 = STRING(scan);
- char * const s_end = s0 + STR_LEN(scan);
-
- /* First we look at the sequences that can occur only in UTF-8 strings.
- * The sequences are of length 6 */
- if (UTF && STR_LEN(scan) >= 6) {
+ U8 *s;
+ U8 * s0 = (U8*) STRING(scan);
+ U8 * const s_end = s0 + STR_LEN(scan);
+
+ /* The below is perhaps overboard, but this allows us to save a test
+ * each time through the loop at the expense of a mask. This is
+ * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
+ * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
+ * This uses an exclusive 'or' to find that bit and then inverts it to
+ * form a mask, with just a single 0, in the bit position where 'S' and
+ * 's' differ. */
+ const U8 S_or_s_mask = (U8) ~ ('S' ^ 's');
+ const U8 s_masked = 's' & S_or_s_mask;
+
+ /* One pass is made over the node's string looking for all the
+ * possibilities. to avoid some tests in the loop, there are two main
+ * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
+ * non-UTF-8 */
+ if (UTF) {
- /* Two problematic code points in Unicode casefolding of EXACT
- * nodes:
+ /* There are two problematic Greek code points in Unicode
+ * casefolding
*
* U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
* U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
* minimum length computation. (there are other code points that
* also fold to these two sequences, but the delta is smaller)
*
- * What we'll do is to look for the tail four bytes, and then peek
- * at the preceding two bytes to see whether we need to decrease
- * the minimum length by four (six minus two).
+ * If these sequences are found, the minimum length is decreased by
+ * four (six minus two).
*
- * Thanks to the design of UTF-8, there cannot be false matches:
- * A sequence of valid UTF-8 bytes cannot be a subsequence of
- * another valid sequence of UTF-8 bytes. */
+ * Similarly, 'ss' may match the single char and byte LATIN SMALL
+ * LETTER SHARP S. We decrease the min length by 1 for each
+ * occurrence of 'ss' found */
#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
- const char U390_first_byte = '\xb4';
- const char U390_2nd_byte = '\x68';
- const char U3B0_first_byte = '\xb5';
- const char U3B0_2nd_byte = '\x46';
- const char tail[] = "\xaf\x49\xaf\x42";
+# define U390_first_byte 0xb4
+ const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
+# define U3B0_first_byte 0xb5
+ const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
#else
- const char U390_first_byte = '\xce';
- const char U390_2nd_byte = '\xb9';
- const char U3B0_first_byte = '\xcf';
- const char U3B0_2nd_byte = '\x85';
- const char tail[] = "\xcc\x88\xcc\x81";
+# define U390_first_byte 0xce
+ const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
+# define U3B0_first_byte 0xcf
+ const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
#endif
- const STRLEN tail_len = sizeof(tail) - 1;
- for (s = s0 + 2; /* +2 is to skip the non-tail */
- s <= s_end - tail_len
- && (t = ninstr(s, s_end, tail, tail + tail_len));
- s = t + tail_len)
+ const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+ yields a net of 0 */
+ /* Examine the string for one of the problematic sequences */
+ for (s = s0;
+ s < s_end - 1; /* Can stop 1 before the end, as minimum length
+ * sequence we are looking for is 2 */
+ s += UTF8SKIP(s))
{
- if ((t[-1] == U390_2nd_byte && t[-2] == U390_first_byte)
- || (t[-1] == U3B0_2nd_byte && t[-2] == U3B0_first_byte))
- {
- *min_change -= 4;
- /* This can't currently be handled by tries, so change the
- * node type to indicate this. */
- if (OP(scan) == EXACTFU) {
- OP(scan) = EXACTFU_NO_TRIE;
- }
+ /* Look for the first byte in each problematic sequence */
+ switch (*s) {
+ /* We don't have to worry about other things that fold to
+ * 's' (such as the long s, U+017F), as all above-latin1
+ * code points have been pre-folded */
+ case 's':
+ case 'S':
+
+ /* Current character is an 's' or 'S'. If next one is
+ * as well, we have the dreaded sequence */
+ if (((*(s+1) & S_or_s_mask) == s_masked)
+ /* These two node types don't have special handling
+ * for 'ss' */
+ && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
+ {
+ *min_subtract += 1;
+ OP(scan) = EXACTFU_SS;
+ s++; /* No need to look at this character again */
+ }
+ break;
+
+ case U390_first_byte:
+ if (s_end - s >= len
+
+ /* The 1's are because are skipping comparing the
+ * first byte */
+ && memEQ(s + 1, U390_tail, len - 1))
+ {
+ goto greek_sequence;
+ }
+ break;
+
+ case U3B0_first_byte:
+ if (! (s_end - s >= len
+ && memEQ(s + 1, U3B0_tail, len - 1)))
+ {
+ break;
+ }
+ greek_sequence:
+ *min_subtract += 4;
+
+ /* This can't currently be handled by trie's, so change
+ * the node type to indicate this. If EXACTFA and
+ * EXACTFL were ever to be handled by trie's, this
+ * would have to be changed. If this node has already
+ * been changed to EXACTFU_SS in this loop, leave it as
+ * is. (I (khw) think it doesn't matter in regexec.c
+ * for UTF patterns, but no need to change it */
+ if (OP(scan) == EXACTFU) {
+ OP(scan) = EXACTFU_TRICKYFOLD;
+ }
+ s += 6; /* We already know what this sequence is. Skip
+ the rest of it */
+ break;
}
}
}
+ else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
- /* The third problematic sequence is 'ss', which can match just the
- * single byte LATIN SMALL LETTER SHARP S, and it can do it in both
- * non- and UTF-8. Code elsewhere in this file makes sure, however,
- * that the sharp s gets folded to 'ss' under Unicode rules even if not
- * UTF-8. */
- if (STR_LEN(scan) >= 2
- && (OP(scan) == EXACTFU
- || OP(scan) == EXACTFU_NO_TRIE /* The code above could have
- set to this node type */
- || OP(scan) == EXACTF))
- {
- /* The string will be folded to 'ss' if it's in UTF-8, but it could
- * include capital 'S' instead of lower case when not UTF-8. We
- * could have different code to handle the two cases, but this is
- * not necessary since both S and s are invariants under UTF-8; and
- * not worth it, especially because we can use just one test for
- * either 'S' or 's' each * time through the loop (plus a mask).
- * Ths is because on both EBCDIC and ASCII machines, 'S' and 's'
- * differ by a single bit. On ASCII they are 32 apart; on EBCDIC,
- * they are 64. This uses an exclusive 'or' to find that bit and
- * then inverts it to form a mask, with just a single 0, in the bit
- * position where 'S' and 's' differ. */
- const char S_or_s_mask = ~ ('S' ^ 's');
- const char s_masked = 's' & S_or_s_mask;
-
- for (s = s0; s < s_end - 1; s++) {
- if (((*s & S_or_s_mask) == s_masked)
- && ((*(s+1) & S_or_s_mask) == s_masked))
- {
- s++;
- *min_change -= 1;
-
- /* EXACTFU_SS also isn't trie'able, so don't have to
- * preserve EXACTFU_NO_TRIE. EXACTF is also not trie'able,
- * and because we essentially punt the optimizations in its
- * case, we don't need to indicate that it has an ss */
- if (OP(scan) == EXACTFU || OP(scan) == EXACTFU_NO_TRIE) {
- OP(scan) = EXACTFU_SS;
- }
+ /* Here, the pattern is not UTF-8. We need to look only for the
+ * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
+ * in the final position. Otherwise we can stop looking 1 byte
+ * earlier because have to find both the first and second 's' */
+ const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
+
+ for (s = s0; s < upper; s++) {
+ switch (*s) {
+ case 'S':
+ case 's':
+ if (s_end - s > 1
+ && ((*(s+1) & S_or_s_mask) == s_masked))
+ {
+ *min_subtract += 1;
+
+ /* EXACTF nodes need to know that the minimum
+ * length changed so that a sharp s in the string
+ * can match this ss in the pattern, but they
+ * remain EXACTF nodes, as they are not trie'able,
+ * so don't have to invent a new node type to
+ * exclude them from the trie code */
+ if (OP(scan) != EXACTF) {
+ OP(scan) = EXACTFU_SS;
+ }
+ s++;
+ }
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ if (OP(scan) == EXACTF) {
+ *has_exactf_sharp_s = TRUE;
+ }
+ break;
}
}
}
fake_study_recurse:
while ( scan && OP(scan) != END && scan < last ){
- IV min_change = 0;
+ UV min_subtract = 0; /* How much to subtract from the minimum node
+ length to get a real minimum (because the
+ folded version may be shorter) */
+ bool has_exactf_sharp_s = FALSE;
/* Peephole optimizer: */
DEBUG_STUDYDATA("Peep:", data,depth);
DEBUG_PEEP("Peep",scan,depth);
- JOIN_EXACT(scan,&min_change,0);
+
+ /* Its not clear to khw or hv why this is done here, and not in the
+ * clauses that deal with EXACT nodes. khw's guess is that it's
+ * because of a previous design */
+ JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
/* Follow the next-chain of the current node and optimize
away all the NOTHINGs from it. */
regnode *first = (regnode *)NULL;
regnode *last = (regnode *)NULL;
regnode *tail = scan;
- U8 optype = 0;
+ U8 trietype = 0;
U32 count=0;
#ifdef DEBUGGING
}
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, tail );
PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
(int)depth * 2 + 2, "",
/*
- step through the branches, cur represents each
- branch, noper is the first thing to be matched
- as part of that branch and noper_next is the
- regnext() of that node. if noper is an EXACT
- and noper_next is the same as scan (our current
- position in the regex) then the EXACT branch is
- a possible optimization target. Once we have
- two or more consecutive such branches we can
- create a trie of the EXACT's contents and stich
- it in place. If the sequence represents all of
- the branches we eliminate the whole thing and
- replace it with a single TRIE. If it is a
- subsequence then we need to stitch it in. This
- means the first branch has to remain, and needs
- to be repointed at the item on the branch chain
- following the last branch optimized. This could
- be either a BRANCH, in which case the
- subsequence is internal, or it could be the
- item following the branch sequence in which
- case the subsequence is at the end.
+ Step through the branches
+ cur represents each branch,
+ noper is the first thing to be matched as part of that branch
+ noper_next is the regnext() of that node.
+
+ We normally handle a case like this /FOO[xyz]|BAR[pqr]/
+ via a "jump trie" but we also support building with NOJUMPTRIE,
+ which restricts the trie logic to structures like /FOO|BAR/.
+
+ If noper is a trieable nodetype then the branch is a possible optimization
+ target. If we are building under NOJUMPTRIE then we require that noper_next
+ is the same as scan (our current position in the regex program).
+
+ Once we have two or more consecutive such branches we can create a
+ trie of the EXACT's contents and stitch it in place into the program.
+
+ If the sequence represents all of the branches in the alternation we
+ replace the entire thing with a single TRIE node.
+
+ Otherwise when it is a subsequence we need to stitch it in place and
+ replace only the relevant branches. This means the first branch has
+ to remain as it is used by the alternation logic, and its next pointer,
+ and needs to be repointed at the item on the branch chain following
+ the last branch we have optimized away.
+
+ This could be either a BRANCH, in which case the subsequence is internal,
+ or it could be the item following the branch sequence in which case the
+ subsequence is at the end (which does not necessarily mean the first node
+ is the start of the alternation).
+
+ TRIE_TYPE(X) is a define which maps the optype to a trietype.
+
+ optype | trietype
+ ----------------+-----------
+ NOTHING | NOTHING
+ EXACT | EXACT
+ EXACTFU | EXACTFU
+ EXACTFU_SS | EXACTFU
+ EXACTFU_TRICKYFOLD | EXACTFU
+ EXACTFA | 0
+
*/
+#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
+ ( EXACT == (X) ) ? EXACT : \
+ ( EXACTFU == (X) || EXACTFU_SS == (X) || EXACTFU_TRICKYFOLD == (X) ) ? EXACTFU : \
+ 0 )
/* dont use tail as the end marker for this traverse */
for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
regnode * const noper = NEXTOPER( cur );
+ U8 noper_type = OP( noper );
+ U8 noper_trietype = TRIE_TYPE( noper_type );
#if defined(DEBUGGING) || defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
+ U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
+ U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
#endif
- DEBUG_OPTIMISE_r({
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
PerlIO_printf( Perl_debug_log,"\t=> %s\t",
SvPV_nolen_const(mysv));
}
- PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
- REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
+ PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
+ REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
+ PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
+ );
});
- if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
- : PL_regkind[ OP( noper ) ] == EXACT )
- || OP(noper) == NOTHING )
+
+ /* Is noper a trieable nodetype that can be merged with the
+ * current trie (if there is one)? */
+ if ( noper_trietype
+ &&
+ (
+ ( noper_trietype == NOTHING)
+ || ( trietype == NOTHING )
+ || ( trietype == noper_trietype )
+ )
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
&& count < U16_MAX)
{
- count++;
- if ( !first || optype == NOTHING ) {
- if (!first) first = cur;
- optype = OP( noper );
+ /* Handle mergable triable node
+ * Either we are the first node in a new trieable sequence,
+ * in which case we do some bookkeeping, otherwise we update
+ * the end pointer. */
+ if ( !first ) {
+ first = cur;
+ if ( noper_trietype == NOTHING ) {
+#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
+ regnode * const noper_next = regnext( noper );
+ U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
+ U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
+#endif
+
+ if ( noper_next_trietype ) {
+ trietype = noper_next_trietype;
+ } else if (noper_next_type) {
+ /* a NOTHING regop is 1 regop wide. We need at least two
+ * for a trie so we can't merge this in */
+ first = NULL;
+ }
+ } else {
+ trietype = noper_trietype;
+ }
} else {
+ if ( trietype == NOTHING )
+ trietype = noper_trietype;
last = cur;
}
- } else {
-/*
- Currently the trie logic handles case insensitive matching properly only
- when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
- semantics).
-
- If/when this is fixed the following define can be swapped
- in below to fully enable trie logic.
-
-#define TRIE_TYPE_IS_SAFE 1
-
-*/
-#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- make_trie( pRExC_state,
- startbranch, first, cur, tail, count,
- optype, depth+1 );
+ if (first)
+ count++;
+ } /* end handle mergable triable node */
+ else {
+ /* handle unmergable node -
+ * noper may either be a triable node which can not be tried
+ * together with the current trie, or a non triable node */
+ if ( last ) {
+ /* If last is set and trietype is not NOTHING then we have found
+ * at least two triable branch sequences in a row of a similar
+ * trietype so we can turn them into a trie. If/when we
+ * allow NOTHING to start a trie sequence this condition will be
+ * required, and it isn't expensive so we leave it in for now. */
+ if ( trietype != NOTHING )
+ make_trie( pRExC_state,
+ startbranch, first, cur, tail, count,
+ trietype, depth+1 );
+ last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
}
- if ( PL_regkind[ OP( noper ) ] == EXACT
+ if ( noper_trietype
#ifdef NOJUMPTRIE
&& noper_next == tail
#endif
){
+ /* noper is triable, so we can start a new trie sequence */
count = 1;
first = cur;
- optype = OP( noper );
- } else {
+ trietype = noper_trietype;
+ } else if (first) {
+ /* if we already saw a first but the current node is not triable then we have
+ * to reset the first information. */
count = 0;
first = NULL;
- optype = 0;
+ trietype = 0;
}
- last = NULL;
- }
- }
- DEBUG_OPTIMISE_r({
+ } /* end handle unmergable node */
+ } /* loop over branches */
+ DEBUG_TRIE_COMPILE_r({
regprop(RExC_rx, mysv, cur);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
});
-
- if ( last && TRIE_TYPE_IS_SAFE ) {
- made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
+ if ( last ) {
+ if ( trietype != NOTHING ) {
+ /* the last branch of the sequence was part of a trie,
+ * so we have to construct it here outside of the loop
+ */
+ made= make_trie( pRExC_state, startbranch, first, scan, tail, count, trietype, depth+1 );
#ifdef TRIE_STUDY_OPT
- if ( ((made == MADE_EXACT_TRIE &&
- startbranch == first)
- || ( first_non_open == first )) &&
- depth==0 ) {
- flags |= SCF_TRIE_RESTUDY;
- if ( startbranch == first
- && scan == tail )
- {
- RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+ if ( ((made == MADE_EXACT_TRIE &&
+ startbranch == first)
+ || ( first_non_open == first )) &&
+ depth==0 ) {
+ flags |= SCF_TRIE_RESTUDY;
+ if ( startbranch == first
+ && scan == tail )
+ {
+ RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
+ }
}
- }
#endif
- }
- }
+ } else {
+ /* at this point we know whatever we have is a NOTHING sequence/branch
+ * AND if 'startbranch' is 'first' then we can turn the whole thing into a NOTHING
+ */
+ if ( startbranch == first ) {
+ regnode *opt;
+ /* the entire thing is a NOTHING sequence, something like this:
+ * (?:|) So we can turn it into a plain NOTHING op. */
+ DEBUG_TRIE_COMPILE_r({
+ regprop(RExC_rx, mysv, cur);
+ PerlIO_printf( Perl_debug_log,
+ "%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
+ "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
+
+ });
+ OP(startbranch)= NOTHING;
+ NEXT_OFF(startbranch)= tail - startbranch;
+ for ( opt= startbranch + 1; opt < tail ; opt++ )
+ OP(opt)= OPTIMIZED;
+ }
+ }
+ } /* end if ( last) */
+ } /* TRIE_MAXBUF is non zero */
} /* do trie */
UV uc;
if (UTF) {
const U8 * const s = (U8*)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
} else {
uc = *((U8*)STRING(scan));
}
* elsewhere that does compute the fold settles down, it
* can be extracted out and re-used here */
for (i = 0; i < 256; i++){
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
+ if (HAS_NONLATIN1_FOLD_CLOSURE(i)) {
ANYOF_BITMAP_SET(data->start_class, i);
}
}
}
if (UTF) {
const U8 * const s = (U8 *)STRING(scan);
+ uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
- uc = utf8_to_uvchr(s, NULL);
}
- else if (OP(scan) == EXACTF) {
- if (memchr(STRING(scan), LATIN_SMALL_LETTER_SHARP_S, l)) {
- RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
- }
+ else if (has_exactf_sharp_s) {
+ RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
- min += l + min_change;
+ min += l - min_subtract;
if (min < 0) {
min = 0;
}
- delta += abs(min_change);
+ delta += min_subtract;
if (flags & SCF_DO_SUBSTR) {
- data->pos_min += l + min_change;
+ data->pos_min += l - min_subtract;
if (data->pos_min < 0) {
data->pos_min = 0;
}
- data->pos_delta += abs(min_change);
- if (min_change) {
+ data->pos_delta += min_subtract;
+ if (min_subtract) {
data->longest = &(data->longest_float);
}
}
/* Also set the other member of the fold pair. In case
* that unicode semantics is called for at runtime, use
* the full latin1 fold. (Can't do this for locale,
- * because not known until runtime */
+ * because not known until runtime) */
ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
- /* All folds except under /iaa that include s, S, and
- * sharp_s also may include the others */
+ /* All other (EXACTFL handled above) folds except under
+ * /iaa that include s, S, and sharp_s also may include
+ * the others */
if (OP(scan) != EXACTFA) {
if (uc == 's' || uc == 'S') {
ANYOF_BITMAP_SET(data->start_class,
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
&& (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ if ( OP(scan) == UNLESSM &&
+ scan->flags == 0 &&
+ OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
+ OP(regnext(NEXTOPER(NEXTOPER(scan)))) == SUCCEED
+ ) {
+ regnode *opt;
+ regnode *upto= regnext(scan);
+ DEBUG_PARSE_r({
+ SV * const mysv_val=sv_newmortal();
+ DEBUG_STUDYDATA("OPFAIL",data,depth);
+
+ /*DEBUG_PARSE_MSG("opfail");*/
+ regprop(RExC_rx, mysv_val, upto);
+ PerlIO_printf(Perl_debug_log, "~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val),
+ (IV)REG_NODE_NUM(upto),
+ (IV)(upto - scan)
+ );
+ });
+ OP(scan) = OPFAIL;
+ NEXT_OFF(scan) = upto - scan;
+ for (opt= scan + 1; opt < upto ; opt++)
+ OP(opt) = OPTIMIZED;
+ scan= upto;
+ continue;
+ }
if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
|| OP(scan) == UNLESSM )
{
}
}
}
-
-
}
#endif
}
#endif
/*
- - pregcomp - compile a regular expression into internal code
- *
- * We can't allocate space until we know how big the compiled form will be,
- * but we can't compile it (and thus know how big it is) until we've got a
- * place to put the code. So we cheat: we compile it twice, once with code
- * generation turned off and size counting turned on, and once "for real".
- * This also means that we don't allocate space until we are sure that the
- * thing really will compile successfully, and we never have to move the
- * code and thus invalidate pointers into it. (Note that it has to be in
- * one piece because free() must be able to free it all.) [NB: not true in perl]
+ * pregcomp - compile a regular expression into internal code
*
- * Beware that the optimization-preparation code in here knows about some
- * of the structure of the compiled regexp. [I'll say.]
+ * Decides which engine's compiler to call based on the hint currently in
+ * scope
*/
+#ifndef PERL_IN_XSUB_RE
+/* return the currently in-scope regex engine (or the default if none) */
+
+regexp_engine const *
+Perl_current_re_engine(pTHX)
+{
+ dVAR;
+
+ if (IN_PERL_COMPILETIME) {
+ HV * const table = GvHV(PL_hintgv);
+ SV **ptr;
+
+ if (!table)
+ return &PL_core_reg_engine;
+ ptr = hv_fetchs(table, "regcomp", FALSE);
+ if ( !(ptr && SvIOK(*ptr) && SvIV(*ptr)))
+ return &PL_core_reg_engine;
+ return INT2PTR(regexp_engine*,SvIV(*ptr));
+ }
+ else {
+ SV *ptr;
+ if (!PL_curcop->cop_hints_hash)
+ return &PL_core_reg_engine;
+ ptr = cop_hints_fetch_pvs(PL_curcop, "regcomp", 0);
+ if ( !(ptr && SvIOK(ptr) && SvIV(ptr)))
+ return &PL_core_reg_engine;
+ return INT2PTR(regexp_engine*,SvIV(ptr));
+ }
+}
-#ifndef PERL_IN_XSUB_RE
-#define RE_ENGINE_PTR &PL_core_reg_engine
-#else
-extern const struct regexp_engine my_reg_engine;
-#define RE_ENGINE_PTR &my_reg_engine
-#endif
-#ifndef PERL_IN_XSUB_RE
REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
dVAR;
- HV * const table = GvHV(PL_hintgv);
+ regexp_engine const *eng = current_re_engine();
+ GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_PREGCOMP;
- /* Dispatch a request to compile a regexp to correct
- regexp engine. */
- if (table) {
- SV **ptr= hv_fetchs(table, "regcomp", FALSE);
- GET_RE_DEBUG_FLAGS_DECL;
- if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
- const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
- DEBUG_COMPILE_r({
- PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
- SvIV(*ptr));
- });
- return CALLREGCOMP_ENG(eng, pattern, flags);
- }
- }
- return Perl_re_compile(aTHX_ pattern, flags);
+ /* Dispatch a request to compile a regexp to correct regexp engine. */
+ DEBUG_COMPILE_r({
+ PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
+ PTR2UV(eng));
+ });
+ return CALLREGCOMP_ENG(eng, pattern, flags);
}
#endif
+/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs */
+
+REGEXP *
+Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
+{
+ SV *pat = pattern; /* defeat constness! */
+ PERL_ARGS_ASSERT_RE_COMPILE;
+ return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
+ NULL, NULL, rx_flags, 0);
+}
+
+/* see if there are any run-time code blocks in the pattern.
+ * False positives are allowed */
+
+static bool
+S_has_runtime_code(pTHX_ RExC_state_t * const pRExC_state, OP *expr,
+ U32 pm_flags, char *pat, STRLEN plen)
+{
+ int n = 0;
+ STRLEN s;
+
+ /* avoid infinitely recursing when we recompile the pattern parcelled up
+ * as qr'...'. A single constant qr// string can't have have any
+ * run-time component in it, and thus, no runtime code. (A non-qr
+ * string, however, can, e.g. $x =~ '(?{})') */
+ if ((pm_flags & PMf_IS_QR) && expr && expr->op_type == OP_CONST)
+ return 0;
+
+ for (s = 0; s < plen; s++) {
+ if (n < pRExC_state->num_code_blocks
+ && s == pRExC_state->code_blocks[n].start)
+ {
+ s = pRExC_state->code_blocks[n].end;
+ n++;
+ continue;
+ }
+ /* TODO ideally should handle [..], (#..), /#.../x to reduce false
+ * positives here */
+ if (pat[s] == '(' && pat[s+1] == '?' &&
+ (pat[s+2] == '{' || (pat[s+2] == '?' && pat[s+3] == '{'))
+ )
+ return 1;
+ }
+ return 0;
+}
+
+/* Handle run-time code blocks. We will already have compiled any direct
+ * or indirect literal code blocks. Now, take the pattern 'pat' and make a
+ * copy of it, but with any literal code blocks blanked out and
+ * appropriate chars escaped; then feed it into
+ *
+ * eval "qr'modified_pattern'"
+ *
+ * For example,
+ *
+ * a\bc(?{"this was literal"})def'ghi\\jkl(?{"this is runtime"})mno
+ *
+ * becomes
+ *
+ * qr'a\\bc def\'ghi\\\\jkl(?{"this is runtime"})mno'
+ *
+ * After eval_sv()-ing that, grab any new code blocks from the returned qr
+ * and merge them with any code blocks of the original regexp.
+ *
+ * If the pat is non-UTF8, while the evalled qr is UTF8, don't merge;
+ * instead, just save the qr and return FALSE; this tells our caller that
+ * the original pattern needs upgrading to utf8.
+ */
+
+bool
+S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
+ char *pat, STRLEN plen)
+{
+ SV *qr;
+
+ GET_RE_DEBUG_FLAGS_DECL;
+
+ if (pRExC_state->runtime_code_qr) {
+ /* this is the second time we've been called; this should
+ * only happen if the main pattern got upgraded to utf8
+ * during compilation; re-use the qr we compiled first time
+ * round (which should be utf8 too)
+ */
+ qr = pRExC_state->runtime_code_qr;
+ pRExC_state->runtime_code_qr = NULL;
+ assert(RExC_utf8 && SvUTF8(qr));
+ }
+ else {
+ int n = 0;
+ STRLEN s;
+ char *p, *newpat;
+ int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
+ SV *sv, *qr_ref;
+ dSP;
+
+ /* determine how many extra chars we need for ' and \ escaping */
+ for (s = 0; s < plen; s++) {
+ if (pat[s] == '\'' || pat[s] == '\\')
+ newlen++;
+ }
+
+ Newx(newpat, newlen, char);
+ p = newpat;
+ *p++ = 'q'; *p++ = 'r'; *p++ = '\'';
+
+ for (s = 0; s < plen; s++) {
+ if (n < pRExC_state->num_code_blocks
+ && s == pRExC_state->code_blocks[n].start)
+ {
+ /* blank out literal code block */
+ assert(pat[s] == '(');
+ while (s <= pRExC_state->code_blocks[n].end) {
+ *p++ = ' ';
+ s++;
+ }
+ s--;
+ n++;
+ continue;
+ }
+ if (pat[s] == '\'' || pat[s] == '\\')
+ *p++ = '\\';
+ *p++ = pat[s];
+ }
+ *p++ = '\'';
+ if (pRExC_state->pm_flags & RXf_PMf_EXTENDED)
+ *p++ = 'x';
+ *p++ = '\0';
+ DEBUG_COMPILE_r({
+ PerlIO_printf(Perl_debug_log,
+ "%sre-parsing pattern for runtime code:%s %s\n",
+ PL_colors[4],PL_colors[5],newpat);
+ });
+
+ sv = newSVpvn_flags(newpat, p-newpat-1, RExC_utf8 ? SVf_UTF8 : 0);
+ Safefree(newpat);
+
+ ENTER;
+ SAVETMPS;
+ save_re_context();
+ PUSHSTACKi(PERLSI_REQUIRE);
+ /* this causes the toker to collapse \\ into \ when parsing
+ * qr''; normally only q'' does this. It also alters hints
+ * handling */
+ PL_reg_state.re_reparsing = TRUE;
+ eval_sv(sv, G_SCALAR);
+ SvREFCNT_dec(sv);
+ SPAGAIN;
+ qr_ref = POPs;
+ PUTBACK;
+ if (SvTRUE(ERRSV))
+ Perl_croak(aTHX_ "%s", SvPVx_nolen_const(ERRSV));
+ assert(SvROK(qr_ref));
+ qr = SvRV(qr_ref);
+ assert(SvTYPE(qr) == SVt_REGEXP && RX_ENGINE((REGEXP*)qr)->op_comp);
+ /* the leaving below frees the tmp qr_ref.
+ * Give qr a life of its own */
+ SvREFCNT_inc(qr);
+ POPSTACK;
+ FREETMPS;
+ LEAVE;
+
+ }
+
+ if (!RExC_utf8 && SvUTF8(qr)) {
+ /* first time through; the pattern got upgraded; save the
+ * qr for the next time through */
+ assert(!pRExC_state->runtime_code_qr);
+ pRExC_state->runtime_code_qr = qr;
+ return 0;
+ }
+
+
+ /* extract any code blocks within the returned qr// */
+
+
+ /* merge the main (r1) and run-time (r2) code blocks into one */
+ {
+ RXi_GET_DECL(((struct regexp*)SvANY(qr)), r2);
+ struct reg_code_block *new_block, *dst;
+ RExC_state_t * const r1 = pRExC_state; /* convenient alias */
+ int i1 = 0, i2 = 0;
+
+ if (!r2->num_code_blocks) /* we guessed wrong */
+ return 1;
+
+ Newx(new_block,
+ r1->num_code_blocks + r2->num_code_blocks,
+ struct reg_code_block);
+ dst = new_block;
+
+ while ( i1 < r1->num_code_blocks
+ || i2 < r2->num_code_blocks)
+ {
+ struct reg_code_block *src;
+ bool is_qr = 0;
+
+ if (i1 == r1->num_code_blocks) {
+ src = &r2->code_blocks[i2++];
+ is_qr = 1;
+ }
+ else if (i2 == r2->num_code_blocks)
+ src = &r1->code_blocks[i1++];
+ else if ( r1->code_blocks[i1].start
+ < r2->code_blocks[i2].start)
+ {
+ src = &r1->code_blocks[i1++];
+ assert(src->end < r2->code_blocks[i2].start);
+ }
+ else {
+ assert( r1->code_blocks[i1].start
+ > r2->code_blocks[i2].start);
+ src = &r2->code_blocks[i2++];
+ is_qr = 1;
+ assert(src->end < r1->code_blocks[i1].start);
+ }
+
+ assert(pat[src->start] == '(');
+ assert(pat[src->end] == ')');
+ dst->start = src->start;
+ dst->end = src->end;
+ dst->block = src->block;
+ dst->src_regex = is_qr ? (REGEXP*) SvREFCNT_inc( (SV*) qr)
+ : src->src_regex;
+ dst++;
+ }
+ r1->num_code_blocks += r2->num_code_blocks;
+ Safefree(r1->code_blocks);
+ r1->code_blocks = new_block;
+ }
+
+ SvREFCNT_dec(qr);
+ return 1;
+}
+
+
+/*
+ * Perl_re_op_compile - the perl internal RE engine's function to compile a
+ * regular expression into internal code.
+ * The pattern may be passed either as:
+ * a list of SVs (patternp plus pat_count)
+ * a list of OPs (expr)
+ * If both are passed, the SV list is used, but the OP list indicates
+ * which SVs are actually pre-compiled code blocks
+ *
+ * The SVs in the list have magic and qr overloading applied to them (and
+ * the list may be modified in-place with replacement SVs in the latter
+ * case).
+ *
+ * If the pattern hasn't changed from old_re, then old_re will be
+ * returned.
+ *
+ * eng is the current engine. If that engine has an op_comp method, then
+ * handle directly (i.e. we assume that op_comp was us); otherwise, just
+ * do the initial concatenation of arguments and pass on to the external
+ * engine.
+ *
+ * If is_bare_re is not null, set it to a boolean indicating whether the
+ * arg list reduced (after overloading) to a single bare regex which has
+ * been returned (i.e. /$qr/).
+ *
+ * orig_rx_flags contains RXf_* flags. See perlreapi.pod for more details.
+ *
+ * pm_flags contains the PMf_* flags, typically based on those from the
+ * pm_flags field of the related PMOP. Currently we're only interested in
+ * PMf_HAS_CV, PMf_IS_QR, PMf_USE_RE_EVAL.
+ *
+ * We can't allocate space until we know how big the compiled form will be,
+ * but we can't compile it (and thus know how big it is) until we've got a
+ * place to put the code. So we cheat: we compile it twice, once with code
+ * generation turned off and size counting turned on, and once "for real".
+ * This also means that we don't allocate space until we are sure that the
+ * thing really will compile successfully, and we never have to move the
+ * code and thus invalidate pointers into it. (Note that it has to be in
+ * one piece because free() must be able to free it all.) [NB: not true in perl]
+ *
+ * Beware that the optimization-preparation code in here knows about some
+ * of the structure of the compiled regexp. [I'll say.]
+ */
+
REGEXP *
-Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
+Perl_re_op_compile(pTHX_ SV ** const patternp, int pat_count,
+ OP *expr, const regexp_engine* eng, REGEXP *VOL old_re,
+ bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
{
dVAR;
REGEXP *rx;
struct regexp *r;
register regexp_internal *ri;
STRLEN plen;
- char* VOL exp;
+ char * VOL exp;
char* xend;
regnode *scan;
I32 flags;
I32 minlen = 0;
- U32 pm_flags;
+ U32 rx_flags;
+ SV * VOL pat;
/* these are all flags - maybe they should be turned
* into a single int with different bit masks */
I32 sawplus = 0;
I32 sawopen = 0;
bool used_setjump = FALSE;
- regex_charset initial_charset = get_regex_charset(orig_pm_flags);
-
+ regex_charset initial_charset = get_regex_charset(orig_rx_flags);
+ bool code_is_utf8 = 0;
+ bool VOL recompile = 0;
+ bool runtime_code = 0;
U8 jump_ret = 0;
dJMPENV;
scan_data_t data;
#endif
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_RE_COMPILE;
+ PERL_ARGS_ASSERT_RE_OP_COMPILE;
DEBUG_r(if (!PL_colorset) reginitcolors());
- exp = SvPV(pattern, plen);
+#ifndef PERL_IN_XSUB_RE
+ /* Initialize these here instead of as-needed, as is quick and avoids
+ * having to test them each time otherwise */
+ if (! PL_AboveLatin1) {
+ PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
+ PL_ASCII = _new_invlist_C_array(ASCII_invlist);
+ PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
- if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
- RExC_utf8 = RExC_orig_utf8 = 0;
- }
- else {
- RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
- }
- RExC_uni_semantics = 0;
- RExC_contains_locale = 0;
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
- /****************** LONG JUMP TARGET HERE***********************/
- /* Longjmp back to here if have to switch in midstream to utf8 */
- if (! RExC_orig_utf8) {
- JMPENV_PUSH(jump_ret);
- used_setjump = TRUE;
- }
+ PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
+ PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
- if (jump_ret == 0) { /* First time through */
- xend = exp + plen;
+ PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
+ PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
- DEBUG_COMPILE_r({
- SV *dsv= sv_newmortal();
- RE_PV_QUOTED_DECL(s, RExC_utf8,
- dsv, exp, plen, 60);
- PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
- PL_colors[4],PL_colors[5],s);
- });
- }
- else { /* longjumped back */
- STRLEN len = plen;
+ PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
- /* If the cause for the longjmp was other than changing to utf8, pop
- * our own setjmp, and longjmp to the correct handler */
- if (jump_ret != UTF8_LONGJMP) {
- JMPENV_POP;
- JMPENV_JUMP(jump_ret);
- }
+ PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
+ PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
- GET_RE_DEBUG_FLAGS;
+ PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
- /* It's possible to write a regexp in ascii that represents Unicode
- codepoints outside of the byte range, such as via \x{100}. If we
- detect such a sequence we have to convert the entire pattern to utf8
- and then recompile, as our sizing calculation will have been based
- on 1 byte == 1 character, but we will need to use utf8 to encode
- at least some part of the pattern, and therefore must convert the whole
- thing.
- -- dmq */
- DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
+ PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
+ PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
+
+ PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
+ PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
+
+ PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
+ PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
+
+ PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
+ PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
+
+ PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
+ PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
+
+ PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
+ PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
+
+ PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
+ PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
+
+ PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
+ PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
+
+ PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
+
+ PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
+ PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
+
+ PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
+ PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
+ }
+#endif
+
+ pRExC_state->code_blocks = NULL;
+ pRExC_state->num_code_blocks = 0;
+
+ if (is_bare_re)
+ *is_bare_re = FALSE;
+
+ if (expr && (expr->op_type == OP_LIST ||
+ (expr->op_type == OP_NULL && expr->op_targ == OP_LIST))) {
+
+ /* is the source UTF8, and how many code blocks are there? */
+ OP *o;
+ int ncode = 0;
+
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST && SvUTF8(cSVOPo_sv))
+ code_is_utf8 = 1;
+ else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
+ /* count of DO blocks */
+ ncode++;
+ }
+ if (ncode) {
+ pRExC_state->num_code_blocks = ncode;
+ Newx(pRExC_state->code_blocks, ncode, struct reg_code_block);
+ }
+ }
+
+ if (pat_count) {
+ /* handle a list of SVs */
+
+ SV **svp;
+
+ /* apply magic and RE overloading to each arg */
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ SV *rx = *svp;
+ SvGETMAGIC(rx);
+ if (SvROK(rx) && SvAMAGIC(rx)) {
+ SV *sv = AMG_CALLunary(rx, regexp_amg);
+ if (sv) {
+ if (SvROK(sv))
+ sv = SvRV(sv);
+ if (SvTYPE(sv) != SVt_REGEXP)
+ Perl_croak(aTHX_ "Overloaded qr did not return a REGEXP");
+ *svp = sv;
+ }
+ }
+ }
+
+ if (pat_count > 1) {
+ /* concat multiple args and find any code block indexes */
+
+ OP *o = NULL;
+ int n = 0;
+ bool utf8 = 0;
+ STRLEN orig_patlen = 0;
+
+ if (pRExC_state->num_code_blocks) {
+ o = cLISTOPx(expr)->op_first;
+ assert(o->op_type == OP_PUSHMARK);
+ o = o->op_sibling;
+ }
+
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+
+ /* determine if the pattern is going to be utf8 (needed
+ * in advance to align code block indices correctly).
+ * XXX This could fail to be detected for an arg with
+ * overloading but not concat overloading; but the main effect
+ * in this obscure case is to need a 'use re eval' for a
+ * literal code block */
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ if (SvUTF8(*svp))
+ utf8 = 1;
+ }
+ if (utf8)
+ SvUTF8_on(pat);
+
+ for (svp = patternp; svp < patternp + pat_count; svp++) {
+ SV *sv, *msv = *svp;
+ SV *rx;
+ bool code = 0;
+ if (o) {
+ if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
+ assert(n < pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks[n].start = SvCUR(pat);
+ pRExC_state->code_blocks[n].block = o;
+ pRExC_state->code_blocks[n].src_regex = NULL;
+ n++;
+ code = 1;
+ o = o->op_sibling; /* skip CONST */
+ assert(o);
+ }
+ o = o->op_sibling;;
+ }
+
+ if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+ (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+ {
+ sv_setsv(pat, sv);
+ /* overloading involved: all bets are off over literal
+ * code. Pretend we haven't seen it */
+ pRExC_state->num_code_blocks -= n;
+ n = 0;
+ rx = NULL;
+
+ }
+ else {
+ while (SvAMAGIC(msv)
+ && (sv = AMG_CALLunary(msv, string_amg))
+ && sv != msv)
+ {
+ msv = sv;
+ SvGETMAGIC(msv);
+ }
+ if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+ msv = SvRV(msv);
+ orig_patlen = SvCUR(pat);
+ sv_catsv_nomg(pat, msv);
+ rx = msv;
+ if (code)
+ pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ }
+
+ /* extract any code blocks within any embedded qr//'s */
+ if (rx && SvTYPE(rx) == SVt_REGEXP
+ && RX_ENGINE((REGEXP*)rx)->op_comp)
+ {
+
+ RXi_GET_DECL(((struct regexp*)SvANY(rx)), ri);
+ if (ri->num_code_blocks) {
+ int i;
+ /* the presence of an embedded qr// with code means
+ * we should always recompile: the text of the
+ * qr// may not have changed, but it may be a
+ * different closure than last time */
+ recompile = 1;
+ Renew(pRExC_state->code_blocks,
+ pRExC_state->num_code_blocks + ri->num_code_blocks,
+ struct reg_code_block);
+ pRExC_state->num_code_blocks += ri->num_code_blocks;
+ for (i=0; i < ri->num_code_blocks; i++) {
+ struct reg_code_block *src, *dst;
+ STRLEN offset = orig_patlen
+ + ((struct regexp *)SvANY(rx))->pre_prefix;
+ assert(n < pRExC_state->num_code_blocks);
+ src = &ri->code_blocks[i];
+ dst = &pRExC_state->code_blocks[n];
+ dst->start = src->start + offset;
+ dst->end = src->end + offset;
+ dst->block = src->block;
+ dst->src_regex = (REGEXP*) SvREFCNT_inc( (SV*)
+ src->src_regex
+ ? src->src_regex
+ : (REGEXP*)rx);
+ n++;
+ }
+ }
+ }
+ }
+ SvSETMAGIC(pat);
+ }
+ else {
+ SV *sv;
+ pat = *patternp;
+ while (SvAMAGIC(pat)
+ && (sv = AMG_CALLunary(pat, string_amg))
+ && sv != pat)
+ {
+ pat = sv;
+ SvGETMAGIC(pat);
+ }
+ }
+
+ /* handle bare regex: foo =~ $re */
+ {
+ SV *re = pat;
+ if (SvROK(re))
+ re = SvRV(re);
+ if (SvTYPE(re) == SVt_REGEXP) {
+ if (is_bare_re)
+ *is_bare_re = TRUE;
+ SvREFCNT_inc(re);
+ Safefree(pRExC_state->code_blocks);
+ return (REGEXP*)re;
+ }
+ }
+ }
+ else {
+ /* not a list of SVs, so must be a list of OPs */
+ assert(expr);
+ if (expr->op_type == OP_LIST) {
+ int i = -1;
+ bool is_code = 0;
+ OP *o;
+
+ pat = newSVpvn("", 0);
+ SAVEFREESV(pat);
+ if (code_is_utf8)
+ SvUTF8_on(pat);
+
+ /* given a list of CONSTs and DO blocks in expr, append all
+ * the CONSTs to pat, and record the start and end of each
+ * code block in code_blocks[] (each DO{} op is followed by an
+ * OP_CONST containing the corresponding literal '(?{...})
+ * text)
+ */
+ for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ if (o->op_type == OP_CONST) {
+ sv_catsv(pat, cSVOPo_sv);
+ if (is_code) {
+ pRExC_state->code_blocks[i].end = SvCUR(pat)-1;
+ is_code = 0;
+ }
+ }
+ else if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL)) {
+ assert(i+1 < pRExC_state->num_code_blocks);
+ pRExC_state->code_blocks[++i].start = SvCUR(pat);
+ pRExC_state->code_blocks[i].block = o;
+ pRExC_state->code_blocks[i].src_regex = NULL;
+ is_code = 1;
+ }
+ }
+ }
+ else {
+ assert(expr->op_type == OP_CONST);
+ pat = cSVOPx_sv(expr);
+ }
+ }
+
+ exp = SvPV_nomg(pat, plen);
+
+ if (!eng->op_comp) {
+ if ((SvUTF8(pat) && IN_BYTES)
+ || SvGMAGICAL(pat) || SvAMAGIC(pat))
+ {
+ /* make a temporary copy; either to convert to bytes,
+ * or to avoid repeating get-magic / overloaded stringify */
+ pat = newSVpvn_flags(exp, plen, SVs_TEMP |
+ (IN_BYTES ? 0 : SvUTF8(pat)));
+ }
+ Safefree(pRExC_state->code_blocks);
+ return CALLREGCOMP_ENG(eng, pat, orig_rx_flags);
+ }
+
+ /* ignore the utf8ness if the pattern is 0 length */
+ RExC_utf8 = RExC_orig_utf8 = (plen == 0 || IN_BYTES) ? 0 : SvUTF8(pat);
+ RExC_uni_semantics = 0;
+ RExC_contains_locale = 0;
+ pRExC_state->runtime_code_qr = NULL;
+
+ /****************** LONG JUMP TARGET HERE***********************/
+ /* Longjmp back to here if have to switch in midstream to utf8 */
+ if (! RExC_orig_utf8) {
+ JMPENV_PUSH(jump_ret);
+ used_setjump = TRUE;
+ }
+
+ if (jump_ret == 0) { /* First time through */
+ xend = exp + plen;
+
+ DEBUG_COMPILE_r({
+ SV *dsv= sv_newmortal();
+ RE_PV_QUOTED_DECL(s, RExC_utf8,
+ dsv, exp, plen, 60);
+ PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
+ PL_colors[4],PL_colors[5],s);
+ });
+ }
+ else { /* longjumped back */
+ U8 *src, *dst;
+ int n=0;
+ STRLEN s = 0, d = 0;
+ bool do_end = 0;
+
+ /* If the cause for the longjmp was other than changing to utf8, pop
+ * our own setjmp, and longjmp to the correct handler */
+ if (jump_ret != UTF8_LONGJMP) {
+ JMPENV_POP;
+ JMPENV_JUMP(jump_ret);
+ }
+
+ GET_RE_DEBUG_FLAGS;
+
+ /* It's possible to write a regexp in ascii that represents Unicode
+ codepoints outside of the byte range, such as via \x{100}. If we
+ detect such a sequence we have to convert the entire pattern to utf8
+ and then recompile, as our sizing calculation will have been based
+ on 1 byte == 1 character, but we will need to use utf8 to encode
+ at least some part of the pattern, and therefore must convert the whole
+ thing.
+ -- dmq */
+ DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
"UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
- exp = (char*)Perl_bytes_to_utf8(aTHX_
- (U8*)SvPV_nomg(pattern, plen),
- &len);
- xend = exp + len;
- RExC_orig_utf8 = RExC_utf8 = 1;
- SAVEFREEPV(exp);
+
+ /* upgrade pattern to UTF8, and if there are code blocks,
+ * recalculate the indices.
+ * This is essentially an unrolled Perl_bytes_to_utf8() */
+
+ src = (U8*)SvPV_nomg(pat, plen);
+ Newx(dst, plen * 2 + 1, U8);
+
+ while (s < plen) {
+ const UV uv = NATIVE_TO_ASCII(src[s]);
+ if (UNI_IS_INVARIANT(uv))
+ dst[d] = (U8)UTF_TO_NATIVE(uv);
+ else {
+ dst[d++] = (U8)UTF8_EIGHT_BIT_HI(uv);
+ dst[d] = (U8)UTF8_EIGHT_BIT_LO(uv);
+ }
+ if (n < pRExC_state->num_code_blocks) {
+ if (!do_end && pRExC_state->code_blocks[n].start == s) {
+ pRExC_state->code_blocks[n].start = d;
+ assert(dst[d] == '(');
+ do_end = 1;
+ }
+ else if (do_end && pRExC_state->code_blocks[n].end == s) {
+ pRExC_state->code_blocks[n].end = d;
+ assert(dst[d] == ')');
+ do_end = 0;
+ n++;
+ }
+ }
+ s++;
+ d++;
+ }
+ dst[d] = '\0';
+ plen = d;
+ exp = (char*) dst;
+ xend = exp + plen;
+ SAVEFREEPV(exp);
+ RExC_orig_utf8 = RExC_utf8 = 1;
+ }
+
+ /* return old regex if pattern hasn't changed */
+
+ if ( old_re
+ && !recompile
+ && !!RX_UTF8(old_re) == !!RExC_utf8
+ && RX_PRECOMP(old_re)
+ && RX_PRELEN(old_re) == plen
+ && memEQ(RX_PRECOMP(old_re), exp, plen))
+ {
+ /* with runtime code, always recompile */
+ runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
+ exp, plen);
+ if (!runtime_code) {
+ ReREFCNT_inc(old_re);
+ if (used_setjump) {
+ JMPENV_POP;
+ }
+ Safefree(pRExC_state->code_blocks);
+ return old_re;
+ }
}
+ else if ((pm_flags & PMf_USE_RE_EVAL)
+ /* this second condition covers the non-regex literal case,
+ * i.e. $foo =~ '(?{})'. */
+ || ( !PL_reg_state.re_reparsing && IN_PERL_COMPILETIME
+ && (PL_hints & HINT_RE_EVAL))
+ )
+ runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
+ exp, plen);
#ifdef TRIE_STUDY_OPT
restudied = 0;
#endif
- pm_flags = orig_pm_flags;
+ rx_flags = orig_rx_flags;
if (initial_charset == REGEX_LOCALE_CHARSET) {
RExC_contains_locale = 1;
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
- set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+ set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
}
RExC_precomp = exp;
- RExC_flags = pm_flags;
+ RExC_flags = rx_flags;
+ RExC_pm_flags = pm_flags;
+
+ if (runtime_code) {
+ if (PL_tainting && PL_tainted)
+ Perl_croak(aTHX_ "Eval-group in insecure regular expression");
+
+ if (!S_compile_runtime_code(aTHX_ pRExC_state, exp, plen)) {
+ /* whoops, we have a non-utf8 pattern, whilst run-time code
+ * got compiled as utf8. Try again with a utf8 pattern */
+ JMPENV_JUMP(UTF8_LONGJMP);
+ }
+ }
+ assert(!pRExC_state->runtime_code_qr);
+
RExC_sawback = 0;
RExC_seen = 0;
RExC_in_lookbehind = 0;
RExC_seen_zerolen = *exp == '^' ? -1 : 0;
- RExC_seen_evals = 0;
RExC_extralen = 0;
RExC_override_recoding = 0;
#endif
RExC_recurse = NULL;
RExC_recurse_count = 0;
+ pRExC_state->code_index = 0;
#if 0 /* REGC() is (currently) a NOP at the first pass.
* Clever compilers notice this and complain. --jhi */
);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
RExC_precomp = NULL;
+ Safefree(pRExC_state->code_blocks);
return(NULL);
}
/* The first pass could have found things that force Unicode semantics */
if ((RExC_utf8 || RExC_uni_semantics)
- && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
+ && get_regex_charset(rx_flags) == REGEX_DEPENDS_CHARSET)
{
- set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
+ set_regex_charset(&rx_flags, REGEX_UNICODE_CHARSET);
}
/* Small enough for pointer-storage convention?
/* non-zero initialization begins here */
RXi_SET( r, ri );
- r->engine= RE_ENGINE_PTR;
- r->extflags = pm_flags;
+ r->engine= eng;
+ r->extflags = rx_flags;
+ if (pm_flags & PMf_IS_QR) {
+ ri->code_blocks = pRExC_state->code_blocks;
+ ri->num_code_blocks = pRExC_state->num_code_blocks;
+ }
+ else
+ SAVEFREEPV(pRExC_state->code_blocks);
+
{
bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
SvPOK_on(rx);
- SvFLAGS(rx) |= SvUTF8(pattern);
+ if (RExC_utf8)
+ SvFLAGS(rx) |= SVf_UTF8;
*p++='('; *p++='?';
/* If a default, cover it using the caret */
RExC_rxi = ri;
/* Second pass: emit code. */
- RExC_flags = pm_flags; /* don't let top level (?i) bleed */
+ RExC_flags = rx_flags; /* don't let top level (?i) bleed */
+ RExC_pm_flags = pm_flags;
RExC_parse = exp;
RExC_end = xend;
RExC_naughty = 0;
RExC_emit_start = ri->program;
RExC_emit = ri->program;
RExC_emit_bound = ri->program + RExC_size + 1;
+ pRExC_state->code_index = 0;
- /* Store the count of eval-groups for security checks: */
- RExC_rx->seen_evals = RExC_seen_evals;
REGC((U8)REG_MAGIC, (char*) RExC_emit++);
if (reg(pRExC_state, 0, &flags,1) == NULL) {
ReREFCNT_dec(rx);
else if ((!sawopen || !RExC_sawback) &&
(OP(first) == STAR &&
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
- !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
+ !(r->extflags & RXf_ANCH) && !pRExC_state->num_code_blocks)
{
/* turn .* into ^.* with an implied $*=1 */
const int type =
goto again;
}
if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
- && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
+ && !pRExC_state->num_code_blocks) /* May examine pos and $& */
/* x+ must match at the 1st pos of run of x's */
r->intflags |= PREGf_SKIP;
{
I32 t,ml;
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
|| (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
&& data.offset_fixed == data.offset_float_min
Be careful.
*/
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
+
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
&& (longest_fixed_length
|| (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
r->extflags |= RXf_GPOS_SEEN;
if (RExC_seen & REG_SEEN_LOOKBEHIND)
r->extflags |= RXf_LOOKBEHIND_SEEN;
- if (RExC_seen & REG_SEEN_EVAL)
+ if (pRExC_state->num_code_blocks)
r->extflags |= RXf_EVAL_SEEN;
if (RExC_seen & REG_SEEN_CANY)
r->extflags |= RXf_CANY_SEEN;
r->intflags |= PREGf_VERBARG_SEEN;
if (RExC_seen & REG_SEEN_CUTGROUP)
r->intflags |= PREGf_CUTGROUP_SEEN;
+ if (pm_flags & PMf_USE_RE_EVAL)
+ r->intflags |= PREGf_USE_RE_EVAL;
if (RExC_paren_names)
RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
else
return rx;
}
-#undef RE_ENGINE_PTR
-
SV*
Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
do {
RExC_parse++;
} while (isALNUM(*RExC_parse));
+ } else {
+ RExC_parse++; /* so the <- from the vFAIL is after the offending character */
+ vFAIL("Group name must start with a non-digit word character");
}
-
if ( flags ) {
SV* sv_name
= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
(unsigned long) flags);
}
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
}
return NULL;
}
#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-#define INVLIST_ZERO_OFFSET 2 /* 0 or 1; must be last element in header */
+/* This is a combination of a version and data structure type, so that one
+ * being passed in can be validated to be an inversion list of the correct
+ * vintage. When the structure of the header is changed, a new random number
+ * in the range 2**31-1 should be generated and the new() method changed to
+ * insert that at this location. Then, if an auxiliary program doesn't change
+ * correspondingly, it will be discovered immediately */
+#define INVLIST_VERSION_ID_OFFSET 2
+#define INVLIST_VERSION_ID 1064334010
+
+/* For safety, when adding new elements, remember to #undef them at the end of
+ * the inversion list code section */
+
+#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
* contains the code point U+00000, and begins here. If 1, the inversion list
* doesn't contain U+0000, and it begins at the next UV in the array.
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
+#if HEADER_LENGTH != 4
+# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
+#endif
+
return new_list;
}
#endif
+STATIC SV*
+S__new_invlist_C_array(pTHX_ UV* list)
+{
+ /* Return a pointer to a newly constructed inversion list, initialized to
+ * point to <list>, which has to be in the exact correct inversion list
+ * form, including internal fields. Thus this is a dangerous routine that
+ * should not be used in the wrong hands */
+
+ SV* invlist = newSV_type(SVt_PV);
+
+ PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
+
+ SvPV_set(invlist, (char *) list);
+ SvLEN_set(invlist, 0); /* Means we own the contents, and the system
+ shouldn't touch it */
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+
+ if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
+ Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
+ }
+
+ return invlist;
+}
+
STATIC void
S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
{
#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
+#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
+
+STATIC void
+S__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
{
/* Subject to change or removal. Append the range from 'start' to 'end' at
* the end of the inversion list. The range must be above any existing
}
}
+#ifndef PERL_IN_XSUB_RE
+
STATIC IV
S_invlist_search(pTHX_ SV* const invlist, const UV cp)
{
return;
}
+
void
-Perl__invlist_union(pTHX_ SV* const a, SV* const b, SV** output)
+Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
{
/* Take the union of two inversion lists and point <output> to it. *output
* should be defined upon input, and if it points to one of the two lists,
- * the reference count to that list will be decremented.
+ * the reference count to that list will be decremented. The first list,
+ * <a>, may be NULL, in which case a copy of the second list is returned.
+ * If <complement_b> is TRUE, the union is taken of the complement
+ * (inversion) of <b> instead of b itself.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_UNION;
+ PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
/* If either one is empty, the union is the other one */
- len_a = invlist_len(a);
- if (len_a == 0) {
+ if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
if (*output == a) {
- SvREFCNT_dec(a);
+ if (a != NULL) {
+ SvREFCNT_dec(a);
+ }
}
if (*output != b) {
*output = invlist_clone(b);
+ if (complement_b) {
+ _invlist_invert(*output);
+ }
} /* else *output already = b; */
return;
}
if (*output == b) {
SvREFCNT_dec(b);
}
- if (*output != a) {
- *output = invlist_clone(a);
- }
- /* else *output already = a; */
+
+ /* The complement of an empty list is a list that has everything in it,
+ * so the union with <a> includes everything too */
+ if (complement_b) {
+ if (a == *output) {
+ SvREFCNT_dec(a);
+ }
+ *output = _new_invlist(1);
+ _append_range_to_invlist(*output, 0, UV_MAX);
+ }
+ else if (*output != a) {
+ *output = invlist_clone(a);
+ }
+ /* else *output already = a; */
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the union of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the union for the worst case: that the sets are completely
* disjoint */
u = _new_invlist(len_a + len_b);
SvREFCNT_dec(*output);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*output = u;
return;
}
void
-Perl__invlist_intersection(pTHX_ SV* const a, SV* const b, SV** i)
+Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
{
/* Take the intersection of two inversion lists and point <i> to it. *i
* should be defined upon input, and if it points to one of the two lists,
* the reference count to that list will be decremented.
+ * If <complement_b> is TRUE, the result will be the intersection of <a>
+ * and the complement (or inversion) of <b> instead of <b> directly.
+ *
* The basis for this comes from "Unicode Demystified" Chapter 13 by
* Richard Gillam, published by Addison-Wesley, and explained at some
* length there. The preface says to incorporate its examples into your
*/
UV count = 0;
- PERL_ARGS_ASSERT__INVLIST_INTERSECTION;
+ PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
assert(a != b);
- /* If either one is empty, the intersection is null */
+ /* Special case if either one is empty */
len_a = invlist_len(a);
if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
- /* If the result is the same as one of the inputs, the input is being
- * overwritten */
+ if (len_a != 0 && complement_b) {
+
+ /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
+ * be empty. Here, also we are using 'b's complement, which hence
+ * must be every possible code point. Thus the intersection is
+ * simply 'a'. */
+ if (*i != a) {
+ *i = invlist_clone(a);
+
+ if (*i == b) {
+ SvREFCNT_dec(b);
+ }
+ }
+ /* else *i is already 'a' */
+ return;
+ }
+
+ /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
+ * intersection must be empty */
if (*i == a) {
SvREFCNT_dec(a);
}
else if (*i == b) {
SvREFCNT_dec(b);
}
-
*i = _new_invlist(0);
return;
}
array_a = invlist_array(a);
array_b = invlist_array(b);
+ /* If are to take the intersection of 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* To complement, we invert: if the first element is 0, remove it. To
+ * do this, we just pretend the array starts one later, and clear the
+ * flag as we don't have to do anything else later */
+ if (array_b[0] == 0) {
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
/* Size the intersection for the worst case: that the intersection ends up
* fragmenting everything to be completely disjoint */
r= _new_invlist(len_a + len_b);
SvREFCNT_dec(*i);
}
+ /* If we've changed b, restore it */
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+
*i = r;
return;
}
-#endif
-
-STATIC SV*
-S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
+SV*
+Perl__add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
{
/* Add the range from 'start' to 'end' inclusive to the inversion list's
* set. A pointer to the inversion list is returned. This may actually be
return invlist;
}
+#endif
+
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
- return add_range_to_invlist(invlist, cp, cp);
+ return _add_range_to_invlist(invlist, cp, cp);
}
#ifndef PERL_IN_XSUB_RE
return new_invlist;
}
-#ifndef PERL_IN_XSUB_RE
-void
-Perl__invlist_subtract(pTHX_ SV* const a, SV* const b, SV** result)
-{
- /* Point <result> to an inversion list which consists of all elements in
- * <a> that aren't also in <b>. *result should be defined upon input, and
- * if it points to C<b> its reference count will be decremented. */
-
- PERL_ARGS_ASSERT__INVLIST_SUBTRACT;
- assert(a != b);
-
- /* Subtracting nothing retains the original */
- if (invlist_len(b) == 0) {
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- /* If the result is not to be the same variable as the original, create
- * a copy */
- if (*result != a) {
- *result = invlist_clone(a);
- }
- } else {
- SV *b_copy = invlist_clone(b);
- _invlist_invert(b_copy); /* Everything not in 'b' */
-
- if (*result == b) {
- SvREFCNT_dec(b);
- }
-
- _invlist_intersection(a, b_copy, result); /* Everything in 'a' not in
- 'b' */
- SvREFCNT_dec(b_copy);
- }
-
- return;
-}
-#endif
-
PERL_STATIC_INLINE UV*
S_get_invlist_iter_addr(pTHX_ SV* invlist)
{
return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
}
+PERL_STATIC_INLINE UV*
+S_get_invlist_version_id_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that contains the version id. */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
+
+ return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
+}
+
PERL_STATIC_INLINE void
S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
{
#undef INVLIST_LEN_OFFSET
#undef INVLIST_ZERO_OFFSET
#undef INVLIST_ITER_OFFSET
+#undef INVLIST_VERSION_ID
/* End of inversion list object */
num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
goto gen_recurse_regop;
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
case '+':
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
RExC_parse++;
nextchar(pRExC_state);
return ret;
} /* named and numeric backreferences */
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
case '?': /* (??...) */
is_logical = 1;
/* FALL THROUGH */
case '{': /* (?{...}) */
{
- I32 count = 1;
U32 n = 0;
- char c;
- char *s = RExC_parse;
+ struct reg_code_block *cb;
RExC_seen_zerolen++;
- RExC_seen |= REG_SEEN_EVAL;
- while (count && (c = *RExC_parse)) {
- if (c == '\\') {
- if (RExC_parse[1])
- RExC_parse++;
- }
- else if (c == '{')
- count++;
- else if (c == '}')
- count--;
- RExC_parse++;
- }
- if (*RExC_parse != ')') {
- RExC_parse = s;
- vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
+
+ if ( !pRExC_state->num_code_blocks
+ || pRExC_state->code_index >= pRExC_state->num_code_blocks
+ || pRExC_state->code_blocks[pRExC_state->code_index].start
+ != (STRLEN)((RExC_parse -3 - (is_logical ? 1 : 0))
+ - RExC_start)
+ ) {
+ if (RExC_pm_flags & PMf_USE_RE_EVAL)
+ FAIL("panic: Sequence (?{...}): no code block found\n");
+ FAIL("Eval-group not allowed at runtime, use re 'eval'");
}
+ /* this is a pre-compiled code block (?{...}) */
+ cb = &pRExC_state->code_blocks[pRExC_state->code_index];
+ RExC_parse = RExC_start + cb->end;
if (!SIZE_ONLY) {
- PAD *pad;
- OP_4tree *sop, *rop;
- SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
-
- ENTER;
- Perl_save_re_context(aTHX);
- rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
- sop->op_private |= OPpREFCOUNTED;
- /* re_dup will OpREFCNT_inc */
- OpREFCNT_set(sop, 1);
- LEAVE;
-
- n = add_data(pRExC_state, 3, "nop");
- RExC_rxi->data->data[n] = (void*)rop;
- RExC_rxi->data->data[n+1] = (void*)sop;
- RExC_rxi->data->data[n+2] = (void*)pad;
- SvREFCNT_dec(sv);
- }
- else { /* First pass */
- if (PL_reginterp_cnt < ++RExC_seen_evals
- && IN_PERL_RUNTIME)
- /* No compiled RE interpolated, has runtime
- components ===> unsafe. */
- FAIL("Eval-group not allowed at runtime, use re 'eval'");
- if (PL_tainting && PL_tainted)
- FAIL("Eval-group in insecure regular expression");
-#if PERL_VERSION > 8
- if (IN_PERL_COMPILETIME)
- PL_cv_has_eval = 1;
-#endif
+ OP *o = cb->block;
+ if (cb->src_regex) {
+ n = add_data(pRExC_state, 2, "rl");
+ RExC_rxi->data->data[n] =
+ (void*)SvREFCNT_inc((SV*)cb->src_regex);
+ RExC_rxi->data->data[n+1] = (void*)o;
+ }
+ else {
+ n = add_data(pRExC_state, 1,
+ (RExC_pm_flags & PMf_HAS_CV) ? "L" : "l");
+ RExC_rxi->data->data[n] = (void*)o;
+ }
}
-
+ pRExC_state->code_index++;
nextchar(pRExC_state);
+
if (is_logical) {
+ regnode *eval;
ret = reg_node(pRExC_state, LOGICAL);
- if (!SIZE_ONLY)
+ eval = reganode(pRExC_state, EVAL, n);
+ if (!SIZE_ONLY) {
ret->flags = 2;
- REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ /* for later propagation into (??{}) return value */
+ eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
+ }
+ REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
return ret;
}
U32 posflags = 0, negflags = 0;
U32 *flagsp = &posflags;
char has_charset_modifier = '\0';
- regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
- ? REGEX_UNICODE_CHARSET
- : REGEX_DEPENDS_CHARSET;
+ regex_charset cs = get_regex_charset(RExC_flags);
+ if (cs == REGEX_DEPENDS_CHARSET
+ && (RExC_utf8 || RExC_uni_semantics))
+ {
+ cs = REGEX_UNICODE_CHARSET;
+ }
while (*RExC_parse) {
/* && strchr("iogcmsx", *RExC_parse) */
}
break;
}
+ DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ SV * const mysv_val1=sv_newmortal();
+ SV * const mysv_val2=sv_newmortal();
+ DEBUG_PARSE_MSG("lsbr");
+ regprop(RExC_rx, mysv_val1, lastbr);
+ regprop(RExC_rx, mysv_val2, ender);
+ PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val1),
+ (IV)REG_NODE_NUM(lastbr),
+ SvPV_nolen_const(mysv_val2),
+ (IV)REG_NODE_NUM(ender),
+ (IV)(ender - lastbr)
+ );
+ });
REGTAIL(pRExC_state, lastbr, ender);
if (have_branch && !SIZE_ONLY) {
+ char is_nothing= 1;
if (depth==1)
RExC_seen |= REG_TOP_LEVEL_BRANCHES;
const U8 op = PL_regkind[OP(br)];
if (op == BRANCH) {
REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
+ if (OP(NEXTOPER(br)) != NOTHING || regnext(NEXTOPER(br)) != ender)
+ is_nothing= 0;
}
else if (op == BRANCHJ) {
REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
+ /* for now we always disable this optimisation * /
+ if (OP(NEXTOPER(NEXTOPER(br))) != NOTHING || regnext(NEXTOPER(NEXTOPER(br))) != ender)
+ */
+ is_nothing= 0;
}
}
+ if (is_nothing) {
+ br= PL_regkind[OP(ret)] != BRANCH ? regnext(ret) : ret;
+ DEBUG_PARSE_r(if (!SIZE_ONLY) {
+ SV * const mysv_val1=sv_newmortal();
+ SV * const mysv_val2=sv_newmortal();
+ DEBUG_PARSE_MSG("NADA");
+ regprop(RExC_rx, mysv_val1, ret);
+ regprop(RExC_rx, mysv_val2, ender);
+ PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
+ SvPV_nolen_const(mysv_val1),
+ (IV)REG_NODE_NUM(ret),
+ SvPV_nolen_const(mysv_val2),
+ (IV)REG_NODE_NUM(ender),
+ (IV)(ender - ret)
+ );
+ });
+ OP(br)= NOTHING;
+ if (OP(ender) == TAIL) {
+ NEXT_OFF(br)= 0;
+ RExC_emit= br + 1;
+ } else {
+ regnode *opt;
+ for ( opt= br + 1; opt < ender ; opt++ )
+ OP(opt)= OPTIMIZED;
+ NEXT_OFF(br)= ender - br;
+ }
+ }
}
}
}
else
FAIL("Junk on end of regexp"); /* "Can't happen". */
- /* NOTREACHED */
+ assert(0); /* NOTREACHED */
}
if (RExC_in_lookbehind) {
sequence, we return.
Note: we have to be careful with escapes, as they can be both literal
- and special, and in the case of \10 and friends can either, depending
- on context. Specifically there are two separate switches for handling
+ and special, and in the case of \10 and friends, context determines which.
+
+ A summary of the code structure is:
+
+ switch (first_byte) {
+ cases for each special:
+ handle this special;
+ break;
+ case '\\':
+ switch (2nd byte) {
+ cases for each unambiguous special:
+ handle this special;
+ break;
+ cases for each ambigous special/literal:
+ disambiguate;
+ if (special) handle here
+ else goto defchar;
+ default: // unambiguously literal:
+ goto defchar;
+ }
+ default: // is a literal char
+ // FALL THROUGH
+ defchar:
+ create EXACTish node for literal;
+ while (more input and node isn't full) {
+ switch (input_byte) {
+ cases for each special;
+ make sure parse pointer is set so that the next call to
+ regatom will see this special first
+ goto loopdone; // EXACTish node terminated by prev. char
+ default:
+ append char to EXACTISH node;
+ }
+ get next input byte;
+ }
+ loopdone:
+ }
+ return the generated node;
+
+ Specifically there are two separate switches for handling
escape sequences, with the one for handling literal escapes requiring
a dummy entry for all of the special escapes that are actually handled
by the other.
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
- case '{':
- if (!regcurly(RExC_parse)) {
- RExC_parse++;
- goto defchar;
- }
- /* FALL THROUGH */
case '?':
case '+':
case '*':
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
- ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
- }
goto finish_meta_pat;
case 'B':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
- if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
- ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
- }
goto finish_meta_pat;
case 's':
switch (get_regex_charset(RExC_flags)) {
vFAIL("Reference to nonexistent or unclosed group");
}
if (!isg && num > 9 && num >= RExC_npar)
+ /* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
char * const parse_start = RExC_parse - 1; /* MJD */
char *s;
STRLEN foldlen;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
- regnode * orig_emit;
U8 node_type;
/* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
bool is_exactfu_sharp_s;
ender = 0;
- orig_emit = RExC_emit; /* Save the original output node position in
- case we need to output a different node
- type */
node_type = ((! FOLD) ? EXACT
: (LOC)
? EXACTFL
break;
}
case 'x':
- if (*++p == '{') {
- char* const e = strchr(p, '}');
+ {
+ STRLEN brace_len = len;
+ UV result;
+ const char* error_msg;
- if (!e) {
- RExC_parse = p + 1;
- vFAIL("Missing right brace on \\x{}");
+ bool valid = grok_bslash_x(p,
+ &result,
+ &brace_len,
+ &error_msg,
+ 1);
+ p += brace_len;
+ if (! valid) {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(error_msg);
}
else {
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX;
- STRLEN numlen = e - p - 1;
- ender = grok_hex(p + 1, &numlen, &flags, NULL);
- if (ender > 0xff)
- REQUIRE_UTF8;
- p = e + 1;
+ ender = result;
}
+ if (PL_encoding && ender < 0x100) {
+ goto recode_encoding;
+ }
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
}
- else {
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- STRLEN numlen = 2;
- ender = grok_hex(p, &numlen, &flags, NULL);
- p += numlen;
- }
- if (PL_encoding && ender < 0x100)
- goto recode_encoding;
- break;
case 'c':
p++;
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
- case '5': case '6': case '7': case '8':case '9':
+ case '5': case '6': case '7':
if (*p == '0' ||
(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
{
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY&& isALPHA(*p)) {
- /* Include any { following the alpha to emphasize
- * that it could be part of an escape at some point
- * in the future */
- int len = (*(p + 1) == '{') ? 2 : 1;
- ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
+ if (!SIZE_ONLY&& isALNUMC(*p)) {
+ ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
}
goto normal_default;
}
break;
+ case '{':
+ /* Currently we don't warn when the lbrace is at the start
+ * of a construct. This catches it in the middle of a
+ * literal string, or when its the first thing after
+ * something like "\b" */
+ if (! SIZE_ONLY
+ && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+ {
+ ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+ }
+ /*FALLTHROUGH*/
default:
normal_default:
if (UTF8_IS_START(*p) && UTF) {
/* Prime the casefolded buffer. Locale rules, which apply
* only to code points < 256, aren't known until execution,
* so for them, just output the original character using
- * utf8 */
+ * utf8. If we start to fold non-UTF patterns, be sure to
+ * update join_exact() */
if (LOC && ender < 256) {
if (UNI_IS_INVARIANT(ender)) {
*tmpbuf = (U8) ender;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+
+ /* tmpbuf has been constructed by us, so we
+ * know it is valid utf8 */
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
s += unilen;
for (foldbuf = tmpbuf;
foldlen;
foldlen -= numlen) {
- ender = utf8_to_uvchr(foldbuf, &numlen);
+ ender = valid_utf8_to_uvchr(foldbuf, &numlen);
if (numlen > 0) {
const STRLEN unilen = reguni(pRExC_state, ender, s);
len += unilen;
}
}
-/* No locale test, and always Unicode semantics, no ignore-case differences */
-#define _C_C_T_NOLOC_(NAME,TEST,WORD) \
-ANYOF_##NAME: \
- for (value = 0; value < 256; value++) \
- if (TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '+'; \
- what = WORD; \
- break; \
-case ANYOF_N##NAME: \
- for (value = 0; value < 256; value++) \
- if (!TEST) \
- stored += set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- yesno = '!'; \
- what = WORD; \
- break
-
-/* Like the above, but there are differences if we are in uni-8-bit or not, so
- * there are two tests passed in, to use depending on that. There aren't any
- * cases where the label is different from the name, so no need for that
- * parameter.
- * Sets 'what' to WORD which is the property name for non-bitmap code points;
- * But, uses FOLD_WORD instead if /i has been selected, to allow a different
- * property name */
-#define _C_C_T_(NAME, TEST_8, TEST_7, WORD, FOLD_WORD) \
-ANYOF_##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (TEST_7(UNI_TO_NATIVE(value))) stored += \
- set_regclass_bit(pRExC_state, ret, \
- (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- yesno = '+'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break; \
-case ANYOF_N##NAME: \
- if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
- else if (UNI_SEMANTICS) { \
- for (value = 0; value < 256; value++) { \
- if (! TEST_8(value)) stored += \
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate); \
- } \
- } \
- else { \
- for (value = 0; value < 128; value++) { \
- if (! TEST_7(UNI_TO_NATIVE(value))) stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- if (AT_LEAST_ASCII_RESTRICTED) { \
- for (value = 128; value < 256; value++) { \
- stored += set_regclass_bit( \
- pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate); \
- } \
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL; \
- } \
- else { \
- /* For a non-ut8 target string with DEPENDS semantics, all above \
- * ASCII Latin1 code points match the complement of any of the \
- * classes. But in utf8, they have their Unicode semantics, so \
- * can't just set them in the bitmap, or else regexec.c will think \
- * they matched when they shouldn't. */ \
- ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL; \
- } \
- } \
- yesno = '!'; \
- if (FOLD) { \
- what = FOLD_WORD; \
- } \
- else { \
- what = WORD; \
- } \
- break
-
-STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
-
- /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
- * Locale folding is done at run-time, so this function should not be
- * called for nodes that are for locales.
- *
- * This function sets the bit corresponding to the fold of the input
- * 'value', if not already set. The fold of 'f' is 'F', and the fold of
- * 'F' is 'f'.
- *
- * It also knows about the characters that are in the bitmap that have
- * folds that are matchable only outside it, and sets the appropriate lists
- * and flags.
- *
- * It returns the number of bits that actually changed from 0 to 1 */
-
- U8 stored = 0;
- U8 fold;
-
- PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
-
- fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
- : PL_fold[value];
-
- /* It assumes the bit for 'value' has already been set */
- if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
- ANYOF_BITMAP_SET(node, fold);
- stored++;
- }
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
- /* Certain Latin1 characters have matches outside the bitmap. To get
- * here, 'value' is one of those characters. None of these matches is
- * valid for ASCII characters under /aa, which have been excluded by
- * the 'if' above. The matches fall into three categories:
- * 1) They are singly folded-to or -from an above 255 character, as
- * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
- * WITH DIAERESIS;
- * 2) They are part of a multi-char fold with another character in the
- * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
- * 3) They are part of a multi-char fold with a character not in the
- * bitmap, such as various ligatures.
- * We aren't dealing fully with multi-char folds, except we do deal
- * with the pattern containing a character that has a multi-char fold
- * (not so much the inverse).
- * For types 1) and 3), the matches only happen when the target string
- * is utf8; that's not true for 2), and we set a flag for it.
- *
- * The code below adds to the passed in inversion list the single fold
- * closures for 'value'. The values are hard-coded here so that an
- * innocent-looking character class, like /[ks]/i won't have to go out
- * to disk to find the possible matches. XXX It would be better to
- * generate these via regen, in case a new version of the Unicode
- * standard adds new mappings, though that is not really likely. */
- switch (value) {
- case 'k':
- case 'K':
- /* KELVIN SIGN */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
- break;
- case 's':
- case 'S':
- /* LATIN SMALL LETTER LONG S */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
- break;
- case MICRO_SIGN:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- GREEK_SMALL_LETTER_MU);
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- GREEK_CAPITAL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- /* ANGSTROM SIGN */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
- if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- PL_fold_latin1[value]);
- }
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- LATIN_CAPITAL_LETTER_SHARP_S);
-
- /* Under /a, /d, and /u, this can match the two chars "ss" */
- if (! MORE_ASCII_RESTRICTED) {
- add_alternate(alternate_ptr, (U8 *) "ss", 2);
-
- /* And under /u or /a, it can match even if the target is
- * not utf8 */
- if (AT_LEAST_UNI_SEMANTICS) {
- ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
- }
- }
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character folds from code
- * points that require UTF8 to express, so they can't match
- * unless the target string is in UTF-8, so no action here is
- * necessary, as regexec.c properly handles the general case
- * for UTF-8 matching */
- break;
- default:
- /* Use deprecated warning to increase the chances of this
- * being output */
- ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
- break;
- }
- }
- else if (DEPENDS_SEMANTICS
- && ! isASCII(value)
- && PL_fold_latin1[value] != value)
- {
- /* Under DEPENDS rules, non-ASCII Latin1 characters match their
- * folds only when the target string is in UTF-8. We add the fold
- * here to the list of things to match outside the bitmap, which
- * won't be looked at unless it is UTF8 (or else if something else
- * says to look even if not utf8, but those things better not happen
- * under DEPENDS semantics. */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
- }
-
- return stored;
-}
-
-
-PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
- /* This inline function sets a bit in the bitmap if not already set, and if
- * appropriate, its fold, returning the number of bits that actually
- * changed from 0 to 1 */
-
- U8 stored;
-
- PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
-
- if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
- return 0;
- }
-
- ANYOF_BITMAP_SET(node, value);
- stored = 1;
-
- if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
- stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
+/* Generate the code to add a full posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * Xsourcelist is the full Unicode range list to use otherwise. */
+#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ \
+ /* Set this class in the node for runtime matching */ \
+ ANYOF_CLASS_SET(node, class); \
+ \
+ /* For above Latin1 code points, we use the full Unicode range */ \
+ _invlist_intersection(PL_AboveLatin1, \
+ Xsourcelist, \
+ &scratch_list); \
+ /* And set the output to it, adding instead if there already is an \
+ * output. Checking if <destlist> is NULL first saves an extra \
+ * clone. Its reference count will be decremented at the next \
+ * union, etc, or if this is the only instance, at the end of the \
+ * routine */ \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ /* For non-locale, just add it to any existing list */ \
+ _invlist_union(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ }
+
+/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
+ */
+#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
+ if (LOC) { \
+ SV* scratch_list = NULL; \
+ ANYOF_CLASS_SET(node, class); \
+ _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ } \
+ else { \
+ _invlist_union_complement_2nd(destlist, \
+ (AT_LEAST_ASCII_RESTRICTED) \
+ ? sourcelist \
+ : Xsourcelist, \
+ &destlist); \
+ /* Under /d, everything in the upper half of the Latin1 range \
+ * matches this complement */ \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ }
+
+/* Generate the code to add a posix character <class> to the bracketed
+ * character class given by <node>. (<node> is needed only under locale rules)
+ * destlist is the inversion list for non-locale rules that this class is
+ * to be added to
+ * sourcelist is the ASCII-range inversion list to add under /a rules
+ * l1_sourcelist is the Latin1 range list to use otherwise.
+ * Xpropertyname is the name to add to <run_time_list> of the property to
+ * specify the code points above Latin1 that will have to be
+ * determined at run-time
+ * run_time_list is a SV* that contains text names of properties that are to
+ * be computed at run time. This concatenates <Xpropertyname>
+ * to it, apppropriately
+ * This is essentially DO_POSIX, but we know only the Latin1 values at compile
+ * time */
+#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ /* First, resolve whether to use the ASCII-only list or the L1 \
+ * list */ \
+ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, \
+ ((AT_LEAST_ASCII_RESTRICTED) ? sourcelist : l1_sourcelist),\
+ Xpropertyname, run_time_list)
+
+#define DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(node, class, destlist, sourcelist, \
+ Xpropertyname, run_time_list) \
+ /* If not /a matching, there are going to be code points we will have \
+ * to defer to runtime to look-up */ \
+ if (! AT_LEAST_ASCII_RESTRICTED) { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
+ } \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, class); \
+ } \
+ else { \
+ _invlist_union(destlist, sourcelist, &destlist); \
+ }
+
+/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
+ * this and DO_N_POSIX */
+#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
+ l1_sourcelist, Xpropertyname, run_time_list) \
+ if (AT_LEAST_ASCII_RESTRICTED) { \
+ _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
+ } \
+ else { \
+ Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+ if (LOC) { \
+ ANYOF_CLASS_SET(node, namedclass); \
+ } \
+ else { \
+ SV* scratch_list = NULL; \
+ _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
+ if (! destlist) { \
+ destlist = scratch_list; \
+ } \
+ else { \
+ _invlist_union(destlist, scratch_list, &destlist); \
+ SvREFCNT_dec(scratch_list); \
+ } \
+ if (DEPENDS_SEMANTICS) { \
+ ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
+ } \
+ } \
}
- return stored;
-}
-
STATIC void
S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
{
/* code points this node matches that can't be stored in the bitmap */
SV* nonbitmap = NULL;
- /* The items that are to match that aren't stored in the bitmap, but are a
- * result of things that are stored there. This is the fold closure of
- * such a character, either because it has DEPENDS semantics and shouldn't
- * be matched unless the target string is utf8, or is a code point that is
- * too large for the bit map, as for example, the fold of the MICRO SIGN is
- * above 255. This all is solely for performance reasons. By having this
- * code know the outside-the-bitmap folds that the bitmapped characters are
- * involved with, we don't have to go out to disk to find the list of
- * matches, unless the character class includes code points that aren't
- * storable in the bit map. That means that a character class with an 's'
- * in it, for example, doesn't need to go out to disk to find everything
- * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
- * empty unless there is something whose fold we don't know about, and will
- * have to go out to the disk to find. */
- SV* l1_fold_invlist = NULL;
+ /* inversion list of code points this node matches only when the target
+ * string is in UTF-8. (Because is under /d) */
+ SV* depends_list = NULL;
/* List of multi-character folds that are matched by this node */
AV* unicode_alternate = NULL;
#ifdef EBCDIC
+ /* In a range, counts how many 0-2 of the ends of it came from literals,
+ * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
UV literal_endpoint = 0;
#endif
UV stored = 0; /* how many chars stored in the bitmap */
/* Invert if asking for the complement */
if (value == 'P') {
-
- /* Add to any existing list */
- if (! properties) {
- properties = invlist_clone(invlist);
- _invlist_invert(properties);
- }
- else {
- invlist = invlist_clone(invlist);
- _invlist_invert(invlist);
- _invlist_union(properties, invlist, &properties);
- SvREFCNT_dec(invlist);
- }
+ _invlist_union_complement_2nd(properties, invlist, &properties);
/* The swash can't be used as-is, because we've
* inverted things; delay removing it to here after
swash = NULL;
}
else {
- if (! properties) {
- properties = invlist_clone(invlist);
- }
- else {
- _invlist_union(properties, invlist, &properties);
- }
+ _invlist_union(properties, invlist, &properties);
}
}
Safefree(name);
}
break;
case 'x':
- if (*RExC_parse == '{') {
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX;
- char * const e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
-
- numlen = e - RExC_parse;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
- RExC_parse = e + 1;
- }
- else {
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- numlen = 2;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+ RExC_parse--; /* function expects to be pointed at the 'x' */
+ {
+ const char* error_msg;
+ bool valid = grok_bslash_x(RExC_parse,
+ &value,
+ &numlen,
+ &error_msg,
+ 1);
RExC_parse += numlen;
+ if (! valid) {
+ vFAIL(error_msg);
+ }
}
if (PL_encoding && value < 0x100)
goto recode_encoding;
ckWARN4reg(RExC_parse,
"False [] range \"%*.*s\"",
w, w, rangebegin);
-
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
- if (prevvalue < 256) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
- }
- else {
- nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
- }
+ nonbitmap = add_cp_to_invlist(nonbitmap, '-');
+ nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
}
range = 0; /* this was not a true range */
}
if (!SIZE_ONLY) {
- const char *what = NULL;
- char yesno = 0;
/* Possible truncation here but in some 64-bit environments
* the compiler gets heartburn about switch on 64-bit values.
* A similar issue a little earlier when switching on value.
* --jhi */
switch ((I32)namedclass) {
-
- case _C_C_T_(ALNUMC, isALNUMC_L1, isALNUMC, "XPosixAlnum", "XPosixAlnum");
- case _C_C_T_(ALPHA, isALPHA_L1, isALPHA, "XPosixAlpha", "XPosixAlpha");
- case _C_C_T_(BLANK, isBLANK_L1, isBLANK, "XPosixBlank", "XPosixBlank");
- case _C_C_T_(CNTRL, isCNTRL_L1, isCNTRL, "XPosixCntrl", "XPosixCntrl");
- case _C_C_T_(GRAPH, isGRAPH_L1, isGRAPH, "XPosixGraph", "XPosixGraph");
- case _C_C_T_(LOWER, isLOWER_L1, isLOWER, "XPosixLower", "__XPosixLower_i");
- case _C_C_T_(PRINT, isPRINT_L1, isPRINT, "XPosixPrint", "XPosixPrint");
- case _C_C_T_(PSXSPC, isPSXSPC_L1, isPSXSPC, "XPosixSpace", "XPosixSpace");
- case _C_C_T_(PUNCT, isPUNCT_L1, isPUNCT, "XPosixPunct", "XPosixPunct");
- case _C_C_T_(UPPER, isUPPER_L1, isUPPER, "XPosixUpper", "__XPosixUpper_i");
- /* \s, \w match all unicode if utf8. */
- case _C_C_T_(SPACE, isSPACE_L1, isSPACE, "SpacePerl", "SpacePerl");
- case _C_C_T_(ALNUM, isWORDCHAR_L1, isALNUM, "Word", "Word");
- case _C_C_T_(XDIGIT, isXDIGIT_L1, isXDIGIT, "XPosixXDigit", "XPosixXDigit");
- case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace");
- case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace");
+
+ case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_NALNUMC:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ break;
+ case ANYOF_ALPHA:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
+ case ANYOF_NALPHA:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ break;
case ANYOF_ASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ASCII);
- else {
- for (value = 0; value < 128; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- yesno = '+';
- what = NULL; /* Doesn't match outside ascii, so
- don't want to add +utf8:: */
+ else {
+ _invlist_union(properties, PL_ASCII, &properties);
+ }
break;
case ANYOF_NASCII:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NASCII);
- else {
- for (value = 128; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &l1_fold_invlist, &unicode_alternate);
+ if (LOC) {
+ ANYOF_CLASS_SET(ret, namedclass);
}
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
- yesno = '!';
- what = "ASCII";
- break;
+ else {
+ _invlist_union_complement_2nd(properties,
+ PL_ASCII, &properties);
+ if (DEPENDS_SEMANTICS) {
+ ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
+ }
+ }
+ break;
+ case ANYOF_BLANK:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_NBLANK:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixBlank, PL_XPosixBlank);
+ break;
+ case ANYOF_CNTRL:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
+ case ANYOF_NCNTRL:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixCntrl, PL_XPosixCntrl);
+ break;
case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ /* There are no digits in the Latin1 range outside of
+ * ASCII, so call the macro that doesn't have to resolve
+ * them */
+ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+ PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_NDIGIT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ break;
+ case ANYOF_GRAPH:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_NGRAPH:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ break;
+ case ANYOF_HORIZWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding. It turns out that \h
+ * is just a synonym for XPosixBlank */
+ _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_NHORIZWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_XPosixBlank, &nonbitmap);
+ break;
+ case ANYOF_LOWER:
+ case ANYOF_NLOWER:
+ { /* These require special handling, as they differ under
+ folding, matching Cased there (which in the ASCII range
+ is the same as Alpha */
+
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = '0'; value <= '9'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixLower;
+ l1_source = PL_L1PosixLower;
+ Xname = "XPosixLower";
+ }
+ if (namedclass == ANYOF_LOWER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
+ }
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
}
- yesno = '+';
- what = "Digit";
break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ }
+ case ANYOF_PRINT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_NPRINT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ break;
+ case ANYOF_PUNCT:
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_NPUNCT:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ break;
+ case ANYOF_PSXSPC:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_NPSXSPC:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixSpace, PL_XPosixSpace);
+ break;
+ case ANYOF_SPACE:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_NSPACE:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PerlSpace, PL_XPerlSpace);
+ break;
+ case ANYOF_UPPER: /* Same as LOWER, above */
+ case ANYOF_NUPPER:
+ {
+ SV* ascii_source;
+ SV* l1_source;
+ const char *Xname;
+
+ if (FOLD && ! LOC) {
+ ascii_source = PL_PosixAlpha;
+ l1_source = PL_L1Cased;
+ Xname = "Cased";
+ }
else {
- /* consecutive digits assumed */
- for (value = 0; value < '0'; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
- for (value = '9' + 1; value < 256; value++)
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) value, &l1_fold_invlist, &unicode_alternate);
+ ascii_source = PL_PosixUpper;
+ l1_source = PL_L1PosixUpper;
+ Xname = "XPosixUpper";
}
- yesno = '!';
- what = "Digit";
- if (AT_LEAST_ASCII_RESTRICTED ) {
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ if (namedclass == ANYOF_UPPER) {
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ ascii_source, l1_source, Xname, listsv);
}
- break;
+ else {
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
+ properties, ascii_source, l1_source, Xname, listsv);
+ }
+ break;
+ }
+ case ANYOF_ALNUM: /* Really is 'Word' */
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_NALNUM:
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ break;
+ case ANYOF_VERTWS:
+ /* For these, we use the nonbitmap, as /d doesn't make a
+ * difference in what these match. There would be problems
+ * if these characters had folds other than themselves, as
+ * nonbitmap is subject to folding */
+ _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_NVERTWS:
+ _invlist_union_complement_2nd(nonbitmap,
+ PL_VertSpace, &nonbitmap);
+ break;
+ case ANYOF_XDIGIT:
+ DO_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
+ case ANYOF_NXDIGIT:
+ DO_N_POSIX(ret, namedclass, properties,
+ PL_PosixXDigit, PL_XPosixXDigit);
+ break;
case ANYOF_MAX:
/* this is to handle \p and \P */
break;
vFAIL("Invalid [::] class");
break;
}
- if (what && ! (AT_LEAST_ASCII_RESTRICTED)) {
- /* Strings such as "+utf8::isWord\n" */
- Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n", yesno, what);
- }
continue;
}
"False [] range \"%*.*s\"",
w, w, rangebegin);
}
- if (!SIZE_ONLY)
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
+ if (!SIZE_ONLY)
+ nonbitmap = add_cp_to_invlist(nonbitmap, '-');
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
/* now is the next time */
if (!SIZE_ONLY) {
- if (prevvalue < 256) {
- const IV ceilvalue = value < 256 ? value : 255;
- IV i;
-#ifdef EBCDIC
- /* In EBCDIC [\x89-\x91] should include
- * the \x8e but [i-j] should not. */
- if (literal_endpoint == 2 &&
- ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
- (isUPPER(prevvalue) && isUPPER(ceilvalue))))
- {
- if (isLOWER(prevvalue)) {
- for (i = prevvalue; i <= ceilvalue; i++)
- if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- } else {
- for (i = prevvalue; i <= ceilvalue; i++)
- if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- }
- }
- else
-#endif
- for (i = prevvalue; i <= ceilvalue; i++) {
- stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- }
- if (value > 255) {
- const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
- const UV natvalue = NATIVE_TO_UNI(value);
- nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
- }
-#ifdef EBCDIC
- literal_endpoint = 0;
+#ifndef EBCDIC
+ nonbitmap = _add_range_to_invlist(nonbitmap, prevvalue, value);
+#else
+ UV* this_range = _new_invlist(1);
+ _append_range_to_invlist(this_range, prevvalue, value);
+
+ /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
+ * If this range was specified using something like 'i-j', we want
+ * to include only the 'i' and the 'j', and not anything in
+ * between, so exclude non-ASCII, non-alphabetics from it.
+ * However, if the range was specified with something like
+ * [\x89-\x91] or [\x89-j], all code points within it should be
+ * included. literal_endpoint==2 means both ends of the range used
+ * a literal character, not \x{foo} */
+ if (literal_endpoint == 2
+ && (prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z'))
+ {
+ _invlist_intersection(this_range, PL_ASCII, &this_range, );
+ _invlist_intersection(this_range, PL_Alpha, &this_range, );
+ }
+ _invlist_union(nonbitmap, this_range, &nonbitmap);
+ literal_endpoint = 0;
#endif
}
range = 0; /* this range (if it was one) is done now */
}
-
-
if (SIZE_ONLY)
return ret;
/****** !SIZE_ONLY AFTER HERE *********/
- /* If folding and there are code points above 255, we calculate all
- * characters that could fold to or from the ones already on the list */
+ /* If folding, we calculate all characters that could fold to or from the
+ * ones already on the list */
if (FOLD && nonbitmap) {
UV start, end; /* End points of code point ranges */
SV* fold_intersection = NULL;
- /* This is a list of all the characters that participate in folds
- * (except marks, etc in multi-char folds */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
- PL_utf8_foldable = _swash_to_invlist(swash);
- SvREFCNT_dec(swash);
- }
-
- /* This is a hash that for a particular fold gives all characters
- * that are involved in it */
- if (! PL_utf8_foldclosures) {
-
- /* If we were unable to find any folds, then we likely won't be
- * able to find the closures. So just create an empty list.
- * Folding will effectively be restricted to the non-Unicode rules
- * hard-coded into Perl. (This case happens legitimately during
- * compilation of Perl itself before the Unicode tables are
- * generated) */
- if (invlist_len(PL_utf8_foldable) == 0) {
- PL_utf8_foldclosures = newHV();
- } else {
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
-
- /* This particular string is above \xff in both UTF-8 and
- * UTFEBCDIC */
- to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
- }
- }
+ const UV highest_index = invlist_len(nonbitmap) - 1;
+
+ /* In the Latin1 range, the characters that can be folded-to or -from
+ * are precisely the alphabetic characters. If the highest code point
+ * is within Latin1, we can use the compiled-in list, and not have to
+ * go out to disk. If the last element in the array is in the
+ * inversion list set, it starts a range that goes to infinity, so the
+ * maximum of the inversion list is definitely above Latin1.
+ * Otherwise, it starts a range that isn't in the set, so the max is
+ * one less than it */
+ if (! ELEMENT_RANGE_MATCHES_INVLIST(highest_index)
+ && invlist_array(nonbitmap)[highest_index] <= 256)
+ {
+ _invlist_intersection(PL_L1PosixAlpha, nonbitmap, &fold_intersection);
+ }
+ else {
+
+ /* This is a list of all the characters that participate in folds
+ * (except marks, etc in multi-char folds */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _swash_to_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
+
+ /* This is a hash that for a particular fold gives all characters
+ * that are involved in it */
+ if (! PL_utf8_foldclosures) {
+
+ /* If we were unable to find any folds, then we likely won't be
+ * able to find the closures. So just create an empty list.
+ * Folding will effectively be restricted to the non-Unicode
+ * rules hard-coded into Perl. (This case happens legitimately
+ * during compilation of Perl itself before the Unicode tables
+ * are generated) */
+ if (invlist_len(PL_utf8_foldable) == 0) {
+ PL_utf8_foldclosures = newHV();
+ }
+ else {
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+
+ /* This particular string is above \xff in both UTF-8
+ * and UTFEBCDIC */
+ to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
+ }
- /* Only the characters in this class that participate in folds need be
- * checked. Get the intersection of this class and all the possible
- * characters that are foldable. This can quickly narrow down a large
- * class */
- _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
+ /* Only the characters in this class that participate in folds need
+ * be checked. Get the intersection of this class and all the
+ * possible characters that are foldable. This can quickly narrow
+ * down a large class */
+ _invlist_intersection(PL_utf8_foldable, nonbitmap,
+ &fold_intersection);
+ }
/* Now look at the foldable characters in this class individually */
invlist_iterinit(fold_intersection);
while (invlist_iternext(fold_intersection, &start, &end)) {
UV j;
+ /* Locale folding for Latin1 characters is deferred until runtime */
+ if (LOC && start < 256) {
+ start = 256;
+ }
+
/* Look at every character in the range */
for (j = start; j <= end; j++) {
- /* Get its fold */
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- const UV f =
- _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
+ UV f;
+
+ if (j < 256) {
+
+ /* We have the latin1 folding rules hard-coded here so that
+ * an innocent-looking character class, like /[ks]/i won't
+ * have to go out to disk to find the possible matches.
+ * XXX It would be better to generate these via regen, in
+ * case a new version of the Unicode standard adds new
+ * mappings, though that is not really likely, and may be
+ * caught by the default: case of the switch below. */
+
+ if (PL_fold_latin1[j] != j) {
+
+ /* ASCII is always matched; non-ASCII is matched only
+ * under Unicode rules */
+ if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
+ nonbitmap =
+ add_cp_to_invlist(nonbitmap, PL_fold_latin1[j]);
+ }
+ else {
+ depends_list =
+ add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
+ }
+ }
+
+ if (HAS_NONLATIN1_FOLD_CLOSURE(j)
+ && (! isASCII(j) || ! MORE_ASCII_RESTRICTED))
+ {
+ /* Certain Latin1 characters have matches outside
+ * Latin1, or are multi-character. To get here, 'j' is
+ * one of those characters. None of these matches is
+ * valid for ASCII characters under /aa, which is why
+ * the 'if' just above excludes those. The matches
+ * fall into three categories:
+ * 1) They are singly folded-to or -from an above 255
+ * character, e.g., LATIN SMALL LETTER Y WITH
+ * DIAERESIS and LATIN CAPITAL LETTER Y WITH
+ * DIAERESIS;
+ * 2) They are part of a multi-char fold with another
+ * latin1 character; only LATIN SMALL LETTER
+ * SHARP S => "ss" fits this;
+ * 3) They are part of a multi-char fold with a
+ * character outside of Latin1, such as various
+ * ligatures.
+ * We aren't dealing fully with multi-char folds, except
+ * we do deal with the pattern containing a character
+ * that has a multi-char fold (not so much the inverse).
+ * For types 1) and 3), the matches only happen when the
+ * target string is utf8; that's not true for 2), and we
+ * set a flag for it.
+ *
+ * The code below adds the single fold closures for 'j'
+ * to the inversion list. */
+ switch (j) {
+ case 'k':
+ case 'K':
+ /* KELVIN SIGN */
+ nonbitmap =
+ add_cp_to_invlist(nonbitmap, 0x212A);
+ break;
+ case 's':
+ case 'S':
+ /* LATIN SMALL LETTER LONG S */
+ nonbitmap =
+ add_cp_to_invlist(nonbitmap, 0x017F);
+ break;
+ case MICRO_SIGN:
+ nonbitmap = add_cp_to_invlist(nonbitmap,
+ GREEK_SMALL_LETTER_MU);
+ nonbitmap = add_cp_to_invlist(nonbitmap,
+ GREEK_CAPITAL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ /* ANGSTROM SIGN */
+ nonbitmap =
+ add_cp_to_invlist(nonbitmap, 0x212B);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ nonbitmap = add_cp_to_invlist(nonbitmap,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ nonbitmap = add_cp_to_invlist(nonbitmap,
+ LATIN_CAPITAL_LETTER_SHARP_S);
+
+ /* Under /a, /d, and /u, this can match the two
+ * chars "ss" */
+ if (! MORE_ASCII_RESTRICTED) {
+ add_alternate(&unicode_alternate,
+ (U8 *) "ss", 2);
+
+ /* And under /u or /a, it can match even if
+ * the target is not utf8 */
+ if (AT_LEAST_UNI_SEMANTICS) {
+ ANYOF_FLAGS(ret) |=
+ ANYOF_NONBITMAP_NON_UTF8;
+ }
+ }
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character
+ * folds from code points that require UTF8 to
+ * express, so they can't match unless the
+ * target string is in UTF-8, so no action here
+ * is necessary, as regexec.c properly handles
+ * the general case for UTF-8 matching */
+ break;
+ default:
+ /* Use deprecated warning to increase the
+ * chances of this being output */
+ ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+ break;
+ }
+ }
+ continue;
+ }
+
+ /* Here is an above Latin1 character. We don't have the rules
+ * hard-coded for it. First, get its fold */
+ f = _to_uni_fold_flags(j, foldbuf, &foldlen,
+ ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
+ | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (MORE_ASCII_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
if (foldlen > (STRLEN)UNISKIP(f)) {
/* If any of the folded characters of this are in the
* Latin1 range, tell the regex engine that this can
- * match a non-utf8 target string. The only multi-byte
- * fold whose source is in the Latin1 range (U+00DF)
- * applies only when the target string is utf8, or
- * under unicode rules */
- if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
- while (loc < e) {
-
- /* Can't mix ascii with non- under /aa */
- if (MORE_ASCII_RESTRICTED
- && (isASCII(*loc) != isASCII(j)))
- {
- goto end_multi_fold;
- }
- if (UTF8_IS_INVARIANT(*loc)
- || UTF8_IS_DOWNGRADEABLE_START(*loc))
- {
- /* Can't mix above and below 256 under LOC
- */
- if (LOC) {
- goto end_multi_fold;
- }
- ANYOF_FLAGS(ret)
- |= ANYOF_NONBITMAP_NON_UTF8;
- break;
- }
- loc += UTF8SKIP(loc);
- }
- }
+ * match a non-utf8 target string. */
+ while (loc < e) {
+ if (UTF8_IS_INVARIANT(*loc)
+ || UTF8_IS_DOWNGRADEABLE_START(*loc))
+ {
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NONBITMAP_NON_UTF8;
+ break;
+ }
+ loc += UTF8SKIP(loc);
+ }
add_alternate(&unicode_alternate, foldbuf, foldlen);
- end_multi_fold: ;
- }
-
- /* This is special-cased, as it is the only letter which
- * has both a multi-fold and single-fold in Latin1. All
- * the other chars that have single and multi-folds are
- * always in utf8, and the utf8 folding algorithm catches
- * them */
- if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
- stored += set_regclass_bit(pRExC_state,
- ret,
- LATIN_SMALL_LETTER_SHARP_S,
- &l1_fold_invlist, &unicode_alternate);
}
}
- else {
- /* Single character fold. Add everything in its fold
- * closure to the list that this node should match */
+ else {
+ /* Single character fold of above Latin1. Add everything
+ * in its fold closure to the list that this node should
+ * match */
SV** listp;
/* The fold closures data structure is a hash with the keys
/* /aa doesn't allow folds between ASCII and non-;
* /l doesn't allow them between above and below
* 256 */
- if ((MORE_ASCII_RESTRICTED
- && (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
+ if ((MORE_ASCII_RESTRICTED && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
{
continue;
}
- if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
- stored += set_regclass_bit(pRExC_state,
- ret,
- (U8) c,
- &l1_fold_invlist, &unicode_alternate);
- }
- /* It may be that the code point is already in
- * this range or already in the bitmap, in
- * which case we need do nothing */
- else if ((c < start || c > end)
- && (c > 255
- || ! ANYOF_BITMAP_TEST(ret, c)))
- {
+ /* Folds involving non-ascii Latin1 characters
+ * under /d are added to a separate list */
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
nonbitmap = add_cp_to_invlist(nonbitmap, c);
+ }
+ else {
+ depends_list = add_cp_to_invlist(depends_list, c);
}
}
}
}
- }
+ }
}
SvREFCNT_dec(fold_intersection);
}
- /* Combine the two lists into one. */
- if (l1_fold_invlist) {
- if (nonbitmap) {
- _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
- SvREFCNT_dec(l1_fold_invlist);
- }
- else {
- nonbitmap = l1_fold_invlist;
- }
- }
-
/* And combine the result (if any) with any inversion list from properties.
* The lists are kept separate up to now because we don't want to fold the
* properties */
if (properties) {
- if (nonbitmap) {
- _invlist_union(nonbitmap, properties, &nonbitmap);
- SvREFCNT_dec(properties);
- }
- else {
- nonbitmap = properties;
- }
+ if (AT_LEAST_UNI_SEMANTICS) {
+ if (nonbitmap) {
+ _invlist_union(nonbitmap, properties, &nonbitmap);
+ SvREFCNT_dec(properties);
+ }
+ else {
+ nonbitmap = properties;
+ }
+ }
+ else {
+
+ /* Under /d, we put the things that match only when the target
+ * string is utf8, into a separate list */
+ SV* nonascii_but_latin1_properties = NULL;
+ _invlist_intersection(properties, PL_Latin1,
+ &nonascii_but_latin1_properties);
+ _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
+ &nonascii_but_latin1_properties);
+ _invlist_subtract(properties, nonascii_but_latin1_properties,
+ &properties);
+ if (nonbitmap) {
+ _invlist_union(nonbitmap, properties, &nonbitmap);
+ SvREFCNT_dec(properties);
+ }
+ else {
+ nonbitmap = properties;
+ }
+
+ if (depends_list) {
+ _invlist_union(depends_list, nonascii_but_latin1_properties,
+ &depends_list);
+ SvREFCNT_dec(nonascii_but_latin1_properties);
+ }
+ else {
+ depends_list = nonascii_but_latin1_properties;
+ }
+ }
}
/* Here, <nonbitmap> contains all the code points we can determine at
- * compile time that we haven't put into the bitmap. Go through it, and
+ * compile time that match under all conditions. Go through it, and
* for things that belong in the bitmap, put them there, and delete from
* <nonbitmap> */
if (nonbitmap) {
- /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
- * possibly only should match when the target string is UTF-8 */
- UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
-
/* This gets set if we actually need to modify things */
bool change_invlist = FALSE;
int i;
/* Quit if are above what we should change */
- if (start > max_cp_to_set) {
+ if (start > 255) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < max_cp_to_set) ? end : max_cp_to_set;
+ high = (end < 255) ? end : 255;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_SET(ret, i);
}
}
- /* Done with loop; set <nonbitmap> to not include any code points that
- * are in the bitmap */
+ /* Done with loop; remove any code points that are in the bitmap from
+ * <nonbitmap> */
if (change_invlist) {
- SV* keep_list = _new_invlist(2);
- _append_range_to_invlist(keep_list, max_cp_to_set + 1, UV_MAX);
- _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
- SvREFCNT_dec(keep_list);
+ _invlist_subtract(nonbitmap, PL_Latin1, &nonbitmap);
}
/* If have completely emptied it, remove it completely */
}
}
+ /* Combine the two lists into one. */
+ if (depends_list) {
+ if (nonbitmap) {
+ _invlist_union(nonbitmap, depends_list, &nonbitmap);
+ SvREFCNT_dec(depends_list);
+ }
+ else {
+ nonbitmap = depends_list;
+ }
+ }
+
/* Here, we have calculated what code points should be in the character
* class. <nonbitmap> does not overlap the bitmap except possibly in the
* case of DEPENDS rules.
* there should not be overlap unless is /d rules. */
_invlist_invert(nonbitmap);
+ /* Any swash can't be used as-is, because we've inverted things */
+ if (swash) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+
for (i = 0; i < 256; ++i) {
if (ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_CLEAR(ret, i);
else {
/* There is no overlap for non-/d, so just delete anything
* below 256 */
- SV* keep_list = _new_invlist(2);
- _append_range_to_invlist(keep_list, 256, UV_MAX);
- _invlist_intersection(nonbitmap, keep_list, &nonbitmap);
- SvREFCNT_dec(keep_list);
+ _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
}
}
}
return ret;
}
-#undef _C_C_T_
/* reg_skipcomment()
for (;;) {
regnode * const temp = regnext(scan);
#ifdef EXPERIMENTAL_INPLACESCAN
- if (PL_regkind[OP(scan)] == EXACT)
- if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
+ if (PL_regkind[OP(scan)] == EXACT) {
+ bool has_exactf_sharp_s; /* Unexamined in this routine */
+ if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
return EXACT;
+ }
#endif
if ( exact ) {
switch (OP(scan)) {
case EXACTFA:
case EXACTFU:
case EXACTFU_SS:
- case EXACTFU_NO_TRIE:
+ case EXACTFU_TRICKYFOLD:
case EXACTFL:
if( exact == PSEUDO )
exact= OP(scan);
SvREFCNT_dec(r->saved_copy);
#endif
Safefree(r->offs);
+ SvREFCNT_dec(r->qr_anoncv);
}
/* reg_temp_copy()
{
struct regexp *ret;
struct regexp *const r = (struct regexp *)SvANY(rx);
- register const I32 npar = r->nparens+1;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
SvLEN_set(ret_x, 0);
SvSTASH_set(ret_x, NULL);
SvMAGIC_set(ret_x, NULL);
- Newx(ret->offs, npar, regexp_paren_pair);
- Copy(r->offs, ret->offs, npar, regexp_paren_pair);
+ if (r->offs) {
+ const I32 npar = r->nparens+1;
+ Newx(ret->offs, npar, regexp_paren_pair);
+ Copy(r->offs, ret->offs, npar, regexp_paren_pair);
+ }
if (r->substrs) {
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
ret->saved_copy = NULL;
#endif
ret->mother_re = rx;
+ SvREFCNT_inc_void(ret->qr_anoncv);
return ret_x;
}
if (ri->u.offsets)
Safefree(ri->u.offsets); /* 20010421 MJD */
#endif
+ if (ri->code_blocks) {
+ int n;
+ for (n = 0; n < ri->num_code_blocks; n++)
+ SvREFCNT_dec(ri->code_blocks[n].src_regex);
+ Safefree(ri->code_blocks);
+ }
+
if (ri->data) {
int n = ri->data->count;
- PAD* new_comppad = NULL;
- PAD* old_comppad;
- PADOFFSET refcnt;
while (--n >= 0) {
/* If you add a ->what type here, update the comment in regcomp.h */
switch (ri->data->what[n]) {
case 'a':
+ case 'r':
case 's':
case 'S':
case 'u':
case 'f':
Safefree(ri->data->data[n]);
break;
- case 'p':
- new_comppad = MUTABLE_AV(ri->data->data[n]);
- break;
- case 'o':
- if (new_comppad == NULL)
- Perl_croak(aTHX_ "panic: pregfree comppad");
- PAD_SAVE_LOCAL(old_comppad,
- /* Watch out for global destruction's random ordering. */
- (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
- );
- OP_REFCNT_LOCK;
- refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
- OP_REFCNT_UNLOCK;
- if (!refcnt)
- op_free((OP_4tree*)ri->data->data[n]);
-
- PAD_RESTORE_LOCAL(old_comppad);
- SvREFCNT_dec(MUTABLE_SV(new_comppad));
- new_comppad = NULL;
- break;
- case 'n':
+ case 'l':
+ case 'L':
break;
case 'T':
{ /* Aho Corasick add-on structure for a trie node.
}
RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
+ ret->qr_anoncv = MUTABLE_CV(sv_dup_inc((const SV *)ret->qr_anoncv, param));
if (ret->pprivate)
RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
Copy(ri->program, reti->program, len+1, regnode);
-
+
+ reti->num_code_blocks = ri->num_code_blocks;
+ if (ri->code_blocks) {
+ int n;
+ Newxc(reti->code_blocks, ri->num_code_blocks, struct reg_code_block,
+ struct reg_code_block);
+ Copy(ri->code_blocks, reti->code_blocks, ri->num_code_blocks,
+ struct reg_code_block);
+ for (n = 0; n < ri->num_code_blocks; n++)
+ reti->code_blocks[n].src_regex = (REGEXP*)
+ sv_dup_inc((SV*)(ri->code_blocks[n].src_regex), param);
+ }
+ else
+ reti->code_blocks = NULL;
reti->regstclass = NULL;
for (i = 0; i < count; i++) {
d->what[i] = ri->data->what[i];
switch (d->what[i]) {
- /* legal options are one of: sSfpontTua
- see also regcomp.h and pregfree() */
+ /* see also regcomp.h and regfree_internal() */
case 'a': /* actually an AV, but the dup function is identical. */
+ case 'r':
case 's':
case 'S':
- case 'p': /* actually an AV, but the dup function is identical. */
case 'u': /* actually an HV, but the dup function is identical. */
d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
break;
struct regnode_charclass_class);
reti->regstclass = (regnode*)d->data[i];
break;
- case 'o':
- /* Compiled op trees are readonly and in shared memory,
- and can thus be shared without duplication. */
- OP_REFCNT_LOCK;
- d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
- OP_REFCNT_UNLOCK;
- break;
case 'T':
/* Trie stclasses are readonly and can thus be shared
* without duplication. We free the stclass in pregfree
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
/* Fall through */
- case 'n':
+ case 'l':
+ case 'L':
d->data[i] = ri->data->data[i];
break;
default:
Copy(&PL_reg_state, state, 1, struct re_save_state);
- PL_reg_start_tmp = 0;
- PL_reg_start_tmpl = 0;
PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
PL_reg_maxiter = 0;
* Local variables:
* c-indentation-style: bsd
* c-basic-offset: 4
- * indent-tabs-mode: t
+ * indent-tabs-mode: nil
* End:
*
- * ex: set ts=8 sts=4 sw=4 noet:
+ * ex: set ts=8 sts=4 sw=4 et:
*/